#!/usr/bin/env python3
"""
Enhanced eBay Product Scraper with better selectors and error handling

@TODO https://web.archive.org/cdx/search/cdx?url=https://www.ebay.com/itm/126480962681&output=json&fl=timestamp,original&collapse=digest&from=20250712
"""
import asyncio
from itertools import product
import json
import argparse
import sys
import signal
import time
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
from playwright.async_api import async_playwright, Page, Browser, BrowserContext
from urllib.parse import urljoin, urlparse, quote
import re
import aiohttp
product_limit = 5
class EnhancedEbayProductScraper:

    def __init__(self, website: str = "ebay.com", headless: bool = True, demo_mode: bool = False, all_websites: bool = False, viewport: Optional[List[int]] = None, num_products: int = 5):
        self.website = website
        self.headless = headless
        self.demo_mode = demo_mode
        self.all_websites = all_websites
        self.viewport = viewport
        self.num_products = num_products
        self.scraped_products = []
        self.start_time = time.time()
        self.output_filename = None
        self.session_stats = {
            "total_urls_visited": 0,
            "successful_scrapes": 0,
            "failed_scrapes": 0,
            "total_products_found": 0,
            "duration": 0.0
        }

        # List of eBay websites to scrape when all_websites is True
        self.ebay_websites = [
            "ebay.com",      # United States
            "ebay.co.uk",    # United Kingdom
            "ebay.de",       # Germany
            "ebay.ca",       # Canada
            "ebay.com.au",   # Australia
            "ebay.fr",       # France
            "ebay.it",       # Italy
            "ebay.es",       # Spain
            "ebay.at",       # Austria
            "ebay.ch",       # Switzerland
            "ebay.ph",       # Philippines
            "ebay.co.th",    # Thailand
        ]

        # Set up signal handlers for graceful shutdown
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)

    def _format_duration(self, seconds: float) -> str:
        """Format a duration in seconds as HH:MM:SS."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"

    async def scrape_product_data(self, page: Page, url: str) -> List[Dict[str, Any]]:
        """Enhanced scraping with better selectors and error handling"""
        products = []
        self.session_stats["total_urls_visited"] += 1

        try:
            print(f"Scraping: {url}")

            # Navigate to the page
            response = await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            # print(f"Page {url} loaded with status: {response.status}")

            if not response or response.status != 200:
                print(f"HTTP {response.status if response else 'no response'} for {url}")
                self.session_stats["failed_scrapes"] += 1
                return products

            # Wait for content to load
            await page.wait_for_timeout(3000)

            # Try to find products using multiple strategies
            products = await self.extract_products_multiple_strategies(page, url)

            if products:
                self.session_stats["successful_scrapes"] += 1
                self.session_stats["total_products_found"] += len(products)
            else:
                self.session_stats["failed_scrapes"] += 1

            print(f"Found {len(products)} products on {url}")

        except Exception as e:
            print(f"Error scraping {url}: {e}")
            self.session_stats["failed_scrapes"] += 1

        return products

    async def extract_products_multiple_strategies(self, page: Page, url: str) -> List[Dict[str, Any]]:
        """Try multiple extraction strategies"""
        products = []

        # Strategy 1: DNE item tiles (Deal pages)
        products.extend(await self.extract_dne_item_tiles(page))

        # Strategy 2: Standard search results
        if not products:
            products.extend(await self.extract_search_results(page))

        # Strategy 3: Deal tiles
        if not products:
            products.extend(await self.extract_deal_tiles(page))

        # Strategy 4: Category grid
        if not products:
            products.extend(await self.extract_category_grid(page))

        # Strategy 5: Generic product cards
        if not products:
            products.extend(await self.extract_generic_cards(page))

        return products
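
    # (Strategy order note: DNE deal tiles are tried first because scrape_all_categories()
    # targets /deals/ URLs, which use that layout; search results, deal tiles, category
    # grids, and generic cards are progressively more generic fallbacks.)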

    async def extract_dne_item_tiles(self, page: Page) -> List[Dict[str, Any]]:
        """Extract products from DNE item tiles (Deal pages)"""
        products = []

        try:
            # Wait for DNE item tiles with multiple possible selectors
            selectors_to_try = [
                ".dne-itemtile",
                ".dne-itemtile-large",
                ".dne-itemtile-small",
                "[itemscope][itemtype*='Product']"
            ]

            items = []
            for selector in selectors_to_try:
                try:
                    await page.wait_for_selector(selector, timeout=3000)
                    items = await page.query_selector_all(selector)
                    if items:
                        print(f"Found {len(items)} items with selector: {selector}")
                        break
                except Exception as e:
                    continue

            if not items:
                print("No DNE item tiles found with any selector")
                return products

            # Limit items based on num_products
            max_items = self.num_products
            for item in items[:max_items]:
                try:
                    product = await self.extract_dne_item_data(item)
                    if product:
                        products.append(product)

                except Exception as e:
                    print(f"Error extracting DNE item: {e}")
                    continue

        except Exception as e:
            print(f"No DNE item tiles found: {e}")

        return products

    async def extract_dne_item_data(self, item) -> Optional[Dict[str, Any]]:
        """Extract data from a DNE item tile"""
        product = {
            "name": "",
            "price": "",
            "image": "",
            "url": "",
            "seller": "",
            "seller_url": "",
            "original_price": "",
            "discount": "",
            "listing_id": "",
            "sold": "",
            "sold_history": ""
        }

        try:
            # Extract listing ID from data attribute
            listing_id = await item.get_attribute("data-listing-id")
            if listing_id:
                product["listing_id"] = listing_id

            # Extract product name - try multiple selectors
            name_selectors = [
                "[itemprop='name']",
                ".dne-itemtile-title",
                ".ellipse-2",
                "h3",
                ".title"
            ]

            for selector in name_selectors:
                name_element = await item.query_selector(selector)
                if name_element:
                    name_text = await name_element.inner_text()
                    if name_text and name_text.strip():
                        product["name"] = name_text.strip()
                        break

            # Extract price - try multiple selectors
            price_selectors = [
                "[itemprop='price']",
                ".dne-itemtile-price [itemprop='price']",
                ".first",
                ".price"
            ]

            for selector in price_selectors:
                price_element = await item.query_selector(selector)
                if price_element:
                    price_text = await price_element.inner_text()
                    if price_text and price_text.strip():
                        product["price"] = price_text.strip()
                        break

            # Extract image - try multiple selectors
            image_selectors = [
                ".dne-itemtile-imagewrapper img",
                ".slashui-image-cntr img",
                "img[src*='ebayimg']",
                "img"
            ]

            for selector in image_selectors:
                img_element = await item.query_selector(selector)
                if img_element:
                    img_src = await img_element.get_attribute("src")
                    if img_src and ('ebayimg' in img_src or 'ebay' in img_src):
                        product["image"] = img_src
                        break

            # Extract URL - try multiple selectors
            url_selectors = [
                "a[itemprop='url']",
                ".dne-itemtile-detail a",
                "a[href*='itm']",
                "a[href*='item']"
            ]

            for selector in url_selectors:
                link_element = await item.query_selector(selector)
                if link_element:
                    href = await link_element.get_attribute("href")
                    if href and ('itm' in href or 'item' in href):
                        product["url"] = self.clean_url(href)
                        break

            # Extract original price and discount
            original_price_selectors = [
                ".itemtile-price-strikethrough",
                ".dne-itemtile-original-price .itemtile-price-strikethrough",
                ".strikethrough"
            ]

            for selector in original_price_selectors:
                original_price_element = await item.query_selector(selector)
                if original_price_element:
                    original_price_text = await original_price_element.inner_text()
                    if original_price_text and original_price_text.strip():
                        product["original_price"] = original_price_text.strip()
                        break

            discount_selectors = [
                ".itemtile-price-bold",
                ".dne-itemtile-original-price .itemtile-price-bold",
                ".discount"
            ]

            for selector in discount_selectors:
                discount_element = await item.query_selector(selector)
                if discount_element:
                    discount_text = await discount_element.inner_text()
                    if discount_text and discount_text.strip():
                        product["discount"] = discount_text.strip()
                        break

            # Validate required fields
            if product["name"] and product["price"]:
                return product

        except Exception as e:
            print(f"Error extracting DNE item data: {e}")

        return None

    async def extract_search_results(self, page: Page) -> List[Dict[str, Any]]:
        """Extract products from search results page"""
        products = []

        try:
            # Wait for search results
            await page.wait_for_selector(".s-item", timeout=5000)

            # Get all search result items
            items = await page.query_selector_all(".s-item")

            # Limit items based on num_products
            max_items = self.num_products
            for item in items[:max_items]:
                try:
                    # Skip ads and sponsored items
                    subtitle_element = await item.query_selector(".s-item__subtitle")
                    if subtitle_element:
                        subtitle = await subtitle_element.inner_text()
                        if "Sponsored" in subtitle or "Ad" in subtitle:
                            continue

                    product = await self.extract_search_item_data(item)
                    if product:
                        products.append(product)

                except Exception as e:
                    print(f"Error extracting search item: {e}")
                    continue

        except Exception as e:
            print(f"No search results found: {e}")

        return products

    async def extract_search_item_data(self, item) -> Optional[Dict[str, Any]]:
        """Extract data from a search result item"""
        product = {
            "name": "",
            "price": "",
            "image": "",
            "url": "",
            "seller": "",
            "seller_url": "",
            "shipping": "",
            "condition": "",
            "location": "",
            "sold": "",
            "sales_30": ""
        }

        try:
            # Extract title
            title_element = await item.query_selector(".s-item__title")
            if title_element:
                product["name"] = await title_element.inner_text()
                product["name"] = re.sub(r'^New Listing', '', product["name"]).strip()

            # Extract price
            price_element = await item.query_selector(".s-item__price")
            if price_element:
                product["price"] = await price_element.inner_text()
                product["price"] = re.sub(r'to.*', '', product["price"]).strip()

            # Extract image
            img_element = await item.query_selector(".s-item__image img")
            if img_element:
                img_src = await img_element.get_attribute("src")
                if img_src:
                    product["image"] = img_src

            # Extract URL
            link_element = await item.query_selector(".s-item__link")
            if link_element:
                href = await link_element.get_attribute("href")
                if href:
                    product["url"] = self.clean_url(href)

            # Extract shipping info
            shipping_element = await item.query_selector(".s-item__shipping")
            if shipping_element:
                product["shipping"] = await shipping_element.inner_text()

            # Extract condition
            condition_element = await item.query_selector(".s-item__subtitle")
            if condition_element:
                condition_text = await condition_element.inner_text()
                if condition_text and "Sponsored" not in condition_text:
                    product["condition"] = condition_text

            # Extract location
            location_element = await item.query_selector(".s-item__location")
            if location_element:
                product["location"] = await location_element.inner_text()

            # Validate required fields
            if product["name"] and product["price"]:
                return product

        except Exception as e:
            print(f"Error extracting item data: {e}")

        return None

    async def extract_deal_tiles(self, page: Page) -> List[Dict[str, Any]]:
        """Extract products from deal tiles"""
        products = []

        try:
            await page.wait_for_selector(".deal-tile", timeout=5000)
            tiles = await page.query_selector_all(".deal-tile")

            for tile in tiles[:self.num_products]:
                try:
                    product = {
                        "name": "",
                        "price": "",
                        "image": "",
                        "url": "",
                        "seller": "",
                        "seller_url": "",
                        "discount": "",
                        "sold": "",
                        "sales_30": ""
                    }

                    # Extract deal tile data
                    title_element = await tile.query_selector(".deal-tile__title")
                    if title_element:
                        product["name"] = await title_element.inner_text()

                    price_element = await tile.query_selector(".deal-tile__price")
                    if price_element:
                        product["price"] = await price_element.inner_text()

                    img_element = await tile.query_selector(".deal-tile__image img")
                    if img_element:
                        img_src = await img_element.get_attribute("src")
                        if img_src:
                            product["image"] = img_src

                    link_element = await tile.query_selector("a")
                    if link_element:
                        href = await link_element.get_attribute("href")
                        if href:
                            product["url"] = self.clean_url(href)

                    # Extract discount info
                    discount_element = await tile.query_selector(".deal-tile__discount")
                    if discount_element:
                        product["discount"] = await discount_element.inner_text()

                    if product["name"] and product["price"]:
                        products.append(product)

                except Exception as e:
                    print(f"Error extracting deal tile: {e}")
                    continue

        except Exception as e:
            print(f"No deal tiles found: {e}")

        return products

    async def extract_category_grid(self, page: Page) -> List[Dict[str, Any]]:
        """Extract products from category grid layout"""
        products = []

        try:
            # Try multiple selectors for category grids
            selectors = [".item-card", ".grid-item", ".category-item"]

            for selector in selectors:
                try:
                    await page.wait_for_selector(selector, timeout=3000)
                    items = await page.query_selector_all(selector)

                    for item in items:
                        try:
                            product = await self.extract_generic_item_data(item)
                            if product:
                                products.append(product)
                        except Exception as e:
                            continue

                    if products:
                        break

                except Exception as e:
                    continue

        except Exception as e:
            print(f"No category grid found: {e}")

        return products

    async def extract_generic_cards(self, page: Page) -> List[Dict[str, Any]]:
        """Extract products from generic card layouts"""
        products = []

        try:
            # Generic selectors
            selectors = ["[data-testid*='item']", ".item", ".product", ".listing"]

            for selector in selectors:
                try:
                    items = await page.query_selector_all(selector)
                    # Limit items based on num_products
                    max_items = self.num_products
                    for item in items[:max_items]:
                        try:
                            product = await self.extract_generic_item_data(item)
                            if product:
                                products.append(product)
                        except Exception as e:
                            continue
                    if products:
                        break
                except Exception as e:
                    continue
        except Exception as e:
            print(f"No generic cards found: {e}")

        return products

    async def extract_generic_item_data(self, item) -> Optional[Dict[str, Any]]:
        """Extract data from generic item elements"""
        product = {
            "name": "",
            "price": "",
            "image": "",
            "url": "",
            "seller": "",
            "seller_url": "",
            "sold": "",
            "sales_30": ""
        }

        try:
            # Try multiple selectors for each field
            title_selectors = ["h3", ".title", "[data-testid*='title']", "a[href*='itm']"]
            price_selectors = [".price", "[data-testid*='price']", ".currency"]
            img_selectors = ["img"]
            link_selectors = ["a[href*='itm']", "a[href*='item']", "a"]

            # Extract title
            for selector in title_selectors:
                element = await item.query_selector(selector)
                if element:
                    text = await element.inner_text()
                    if text and len(text) > 10:
                        product["name"] = text.strip()
                        break

            # Extract price
            for selector in price_selectors:
                element = await item.query_selector(selector)
                if element:
                    text = await element.inner_text()
                    if text and ('$' in text or '£' in text or '€' in text):
                        product["price"] = text.strip()
                        break

            # Extract image
            for selector in img_selectors:
                element = await item.query_selector(selector)
                if element:
                    src = await element.get_attribute("src")
                    if src and ('ebayimg' in src or 'ebay' in src):
                        product["image"] = src
                        break

            # Extract URL
            for selector in link_selectors:
                element = await item.query_selector(selector)
                if element:
                    href = await element.get_attribute("href")
                    if href and ('itm' in href or 'item' in href):
                        product["url"] = self.clean_url(href)
                        break

            if product["name"] and product["price"]:
                return product

        except Exception as e:
            print(f"Error extracting generic item: {e}")

        return None

    def clean_url(self, url: str) -> str:
        """Clean and normalize URLs"""
        if not url:
            return ""

        if url.startswith("//"):
            url = "https:" + url
        elif url.startswith("/"):
            url = f"https://www.{self.website}" + url

        return url
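
    # Examples of clean_url() behaviour (illustrative, derived from the rules above):
    #   "//www.ebay.com/itm/123" -> "https://www.ebay.com/itm/123"
    #   "/itm/123"               -> "https://www.ebay.com/itm/123"   (when self.website == "ebay.com")
    #   absolute URLs are returned unchanged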

    def get_websites_to_scrape(self) -> List[str]:
        """Get list of websites to scrape based on configuration"""
        if self.all_websites:
            # For demo mode, limit to fewer websites for faster testing
            if self.demo_mode:
                return [
                    "ebay.com",
                    "ebay.co.uk",
                    "ebay.de"
                ]
            else:
                return [
                    "ebay.com",
                    "ebay.co.uk",
                    "ebay.de",
                    "ebay.ca",
                    "ebay.com.au",
                    "ebay.fr",
                    "ebay.it",
                    "ebay.es",
                    "ebay.at",
                    "ebay.ch",
                    # "ebay.ph",
                    # "ebay.co.th"
                ]
        else:
            return [self.website]

    async def extract_categories_from_deals_page(self, page: Page, website: str) -> List[Dict[str, Any]]:
        """Extract category links from the deals page navigation menu. In demo mode, keep only 1 category with 3 subcategories."""
        categories = []
        deals_url = f"https://www.{website}/deals/"

        try:
            print(f"Extracting categories from: {deals_url}")

            # Navigate to the deals page
            response = await page.goto(deals_url, wait_until="domcontentloaded", timeout=30000)

            if not response or response.status != 200:
                print(f"HTTP {response.status if response else 'no response'} for {deals_url}")
                return categories

            # Wait for navigation to load
            await page.wait_for_timeout(3000)

            # Find the navigation with role="tablist"
            nav_element = await page.query_selector('nav[role="tablist"]')
            if not nav_element:
                print(f"No navigation with role='tablist' found on {website}")
                return categories

            # Get all menu items
            menu_items = await nav_element.query_selector_all('li[role="menu"]')

            for item in menu_items:
                try:
                    # Get the main category link
                    main_link = await item.query_selector('.navigation-desktop-menu-link')
                    if not main_link:
                        continue

                    # Extract category name and URL
                    category_name = await main_link.inner_text()
                    category_url = await main_link.get_attribute('href')

                    if not category_name or not category_url:
                        continue

                    category_name = category_name.strip()

                    print(f"Found category: {category_name} -> {category_url}")

                    # Create category object
                    category_obj = {
                        "category": category_name,
                        "url": category_url,
                        "subcategories": []
                    }

                    # Look for subcategories in the flyout menu
                    flyout = await item.query_selector('.navigation-desktop-flyout')
                    if flyout:
                        subcategory_links = await flyout.query_selector_all('.navigation-desktop-flyout-link')

                        for sub_link in subcategory_links:
                            try:
                                sub_name = await sub_link.inner_text()
                                sub_url = await sub_link.get_attribute('href')

                                if sub_name and sub_url:
                                    sub_name = sub_name.strip()
                                    category_obj["subcategories"].append({
                                        "name": sub_name,
                                        "url": sub_url
                                    })
                                    print(f"  Subcategory: {sub_name} -> {sub_url}")
                            except Exception as e:
                                continue

                    categories.append(category_obj)

                    # If demo_mode, keep only 1 category with 3 subcategories
                    if self.demo_mode and len(categories) == 1:
                        # Truncate subcategories to 3
                        if len(categories[0]["subcategories"]) > 3:
                            categories[0]["subcategories"] = categories[0]["subcategories"][:3]
                        break

                except Exception as e:
                    print(f"Error extracting category from menu item: {e}")
                    continue

            print(f"Extracted {len(categories)} categories from {website}")

        except Exception as e:
            print(f"Error extracting categories from {deals_url}: {e}")

        return categories
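
    # Shape of each category object returned above (illustrative values):
    #   {"category": "Tech", "url": "https://www.ebay.com/deals/tech",
    #    "subcategories": [{"name": "Laptops", "url": "https://www.ebay.com/deals/tech/laptops"}]}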

    async def scrape_all_categories(self) -> List[Dict[str, Any]]:
        """Scrape all categories with enhanced browser setup"""
        all_products = []
        websites_to_scrape = self.get_websites_to_scrape()

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=self.headless,
                args=[
                    "--no-sandbox",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-web-security",
                    "--disable-features=VizDisplayCompositor"
                ]
            )

            # Use self.viewport if provided, else default to 1024x500
            if self.viewport and len(self.viewport) == 2:
                try:
                    width = int(self.viewport[0])
                    height = int(self.viewport[1])
                except Exception:
                    width, height = 1024, 500
            else:
                width, height = 1024, 500

            context = await browser.new_context(
                viewport={"width": width, "height": height},
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )

            # Add stealth settings
            await context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined,
                });
            """)

            page = await context.new_page()

            # Set headers
            await page.set_extra_http_headers({
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            })

            try:
                for website in websites_to_scrape:
                    self.website = website  # Update current website

                    print(f"\n{'='*60}")
                    print(f"SCRAPING WEBSITE: {website.upper()}")
                    print(f"{'='*60}")

                    # Extract categories dynamically from the deals page
                    categories = await self.extract_categories_from_deals_page(page, website)

                    if not categories:
                        print(f"No categories found on {website}, skipping...")
                        continue

                    # Track products for this specific website
                    website_products = []

                    for category in categories:
                        print(f"\n{'='*50}")
                        print(f"Scraping Category: {category['category']} on {website}")
                        print(f"{'='*50}")

                        # Scrape main category URL
                        category_products = await self.scrape_product_data(page, category["url"])
                        if len(category_products) > self.num_products:
                            category_products = category_products[:self.num_products]
                        for product in category_products:
                            product["category"] = category["category"]
                            product["subcategory"] = "Main"
                            product["website"] = website
                        website_products.extend(category_products)

                        # Small delay
                        await asyncio.sleep(2)

                        # Scrape subcategories
                        for subcategory in category.get("subcategories", []):
                            print(f"\n--- Scraping Subcategory: {subcategory['name']} on {website} ---")
                            sub_products = await self.scrape_product_data(page, subcategory["url"])

                            # Limit subcategory products to first num_products items
                            if len(sub_products) > self.num_products:
                                print(f"ℹ️  Subcategory '{subcategory['name']}' has {len(sub_products)} items, taking first {self.num_products}...")
                                sub_products = sub_products[:self.num_products]

                            for product in sub_products:
                                product["category"] = category["category"]
                                product["subcategory"] = subcategory["name"]
                                product["website"] = website
                            website_products.extend(sub_products)

                            # Delay between subcategories
                            await asyncio.sleep(0.5)

                    # Add this website's products to the total collection
                    all_products.extend(website_products)

                    print(f"\n📊 Collected {len(website_products)} products from {website}")

                    # In demo mode, continue to next website (don't break the website loop)

                    # Delay between websites
                    if not self.demo_mode:
                        await asyncio.sleep(3)

            except Exception as e:
                print(f"Error during scraping: {e}")
            finally:
                await browser.close()

        return all_products
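
    # Every product returned above is tagged with the context it was found in, e.g.
    # (illustrative values):
    #   {"name": "...", "price": "$19.99", "url": "https://www.ebay.com/itm/...",
    #    "category": "Tech", "subcategory": "Main", "website": "ebay.com", ...}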

    def _signal_handler(self, signum, frame):
        """Handle SIGINT (Ctrl+C) and SIGTERM signals to save data before exit"""
        print(f"\n\n🚨 Received signal {signum}. Saving data before exit...")

        # Calculate duration in seconds and format
        duration_seconds = time.time() - self.start_time
        self.session_stats["duration"] = self._format_duration(duration_seconds)

        # Save current products if any exist
        if self.scraped_products:
            if self.output_filename:
                filename = self.save_to_json(self.scraped_products, self.output_filename)
            else:
                filename = self.save_to_json(self.scraped_products)
            print(f"💾 Data saved to: {filename}")
        else:
            print("ℹ️  No products collected yet - nothing to save")

        print(f"⏱️  Session duration: {self.session_stats['duration']}")
        print("👋 Exiting gracefully...")
        sys.exit(0)

    def save_to_json(self, products: List[Dict[str, Any]], filename: str = None) -> str:
        """Save scraped products to JSON file with enhanced metadata"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"ebay_products_{timestamp}.json"

        # Calculate duration in HH:mm:ss format
        duration_seconds = time.time() - self.start_time
        self.session_stats["duration"] = self._format_duration(duration_seconds)

        # Calculate statistics
        categories = {}
        websites = {}
        for product in products:
            cat = product.get("category", "Unknown")
            if cat not in categories:
                categories[cat] = 0
            categories[cat] += 1

            website = product.get("website", "Unknown")
            if website not in websites:
                websites[website] = 0
            websites[website] += 1

        # Add comprehensive metadata
        output_data = {
            "metadata": {
                "scrape_date": datetime.now().isoformat(),
                "websites": self.get_websites_to_scrape(),
                "scraper_version": "2.0",
                "total_products": len(products),
                "categories": categories,
                "session_stats": self.session_stats
            },
            "products": products
        }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        return filename
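
    # Layout of the JSON file written above (abridged):
    #   {
    #     "metadata": {
    #       "scrape_date": "...", "websites": [...], "scraper_version": "2.0",
    #       "total_products": 0, "categories": {...}, "session_stats": {...}
    #     },
    #     "products": [...]
    #   }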

    def print_summary(self, products: List[Dict[str, Any]]):
        """Print scraping summary"""
        # Calculate duration for summary in HH:mm:ss format
        duration_seconds = time.time() - self.start_time
        duration_formatted = self._format_duration(duration_seconds)

        print(f"\n{'='*60}")
        print(f"SCRAPING SUMMARY")
        print(f"{'='*60}")
        print(f"Total URLs visited: {self.session_stats['total_urls_visited']}")
        print(f"Successful scrapes: {self.session_stats['successful_scrapes']}")
        print(f"Failed scrapes: {self.session_stats['failed_scrapes']}")
        print(f"Total products found: {len(products)}")
        print(f"Session duration: {duration_formatted}")
        print(f"Success rate: {(self.session_stats['successful_scrapes'] / self.session_stats['total_urls_visited'] * 100):.1f}%")

        # Category breakdown
        categories = {}
        for product in products:
            cat = product.get("category", "Unknown")
            if cat not in categories:
                categories[cat] = 0
            categories[cat] += 1

        print(f"\nProducts by category:")
        for cat, count in categories.items():
            print(f"  {cat}: {count}")

        # Website breakdown if multiple websites
        if self.all_websites:
            websites = {}
            for product in products:
                website = product.get("website", "Unknown")
                if website not in websites:
                    websites[website] = 0
                websites[website] += 1

            print(f"\nProducts by website:")
            for website, count in websites.items():
                print(f"  {website}: {count}")

    async def run(self, output_filename: str = None) -> str:
        """Main scraping function"""
        # Store output filename for signal handler
        self.output_filename = output_filename

        websites_list = self.get_websites_to_scrape()

        print(f"Starting Enhanced eBay scraper")
        print(f"Websites to scrape: {', '.join(websites_list)}")
        print(f"Headless mode: {self.headless}")
        print(f"Demo mode: {self.demo_mode} (limit: 5 products)" if self.demo_mode else "Demo mode: False")
        print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        products = await self.scrape_all_categories()

        # Store products in class for signal handler access
        self.scraped_products = products

        if products:
            filename = self.save_to_json(products, output_filename)
            self.print_summary(products)
            print(f"\nData saved to: {filename}")

            print(f"{'='*50}")
            await self.update_products_with_seller_info_async(filename, filename)

            # Ask user if they want to run ebay_update_sold_only.py
            print(f"\n{'='*60}")
            print("📊 Do you want to update sold counts with current data?")
            print("🔄 This will run ebay_update_sold_only.py to get latest sold numbers from eBay")
            print(f"{'='*60}")

            user_input = input("Run ebay_update_sold_only.py? (y/n): ").lower().strip()

            if user_input in ['y', 'yes', '1']:
                import subprocess
                import os

                print(f"\n🚀 Running ebay_update_sold_only.py on {filename}...")
                print("⏳ This may take a while depending on the number of products...")

                # Get the directory of the current script
                script_dir = os.path.dirname(os.path.abspath(__file__))
                python_path = os.path.join(script_dir, "myenv", "bin", "python")
                update_script = os.path.join(script_dir, "ebay_update_sold_only.py")

                try:
                    # Run the update script
                    cmd = [python_path, update_script, filename, "--delay", "2"]
                    result = subprocess.run(cmd, cwd=script_dir, capture_output=False, text=True)

                    if result.returncode == 0:
                        print(f"\n✅ Successfully updated sold counts in {filename}")
                    else:
                        print(f"\n⚠️ Update script finished with return code: {result.returncode}")

                except Exception as e:
                    print(f"\n❌ Error running update script: {e}")
                    print(f"💡 You can manually run: {python_path} {update_script} {filename} --delay 2")
            else:
                print(f"\n📝 Skipped sold count update. You can manually run it later with:")
                print(f"   ./resources/py/myenv/bin/python ebay_update_sold_only.py {filename} --delay 2")

            return filename
        else:
            print("No products found!")
            return ""

    async def update_products_with_seller_info_async(self, input_file: str, output_file: str) -> Optional[str]:
        """Update products with seller information from input JSON file (async version)"""
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            products = data.get('products', [])

            print(f"Found {len(products)} products in {input_file}")
            print(f"{'='*60}")

            # Update products with seller info
            updated_products = await self._update_products_async(products)

            # Update the data with the new products
            data['products'] = updated_products

            # Save the data to output file
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

            print(f"\nData saved to: {output_file}")
            return output_file

        except FileNotFoundError:
            print(f"Error: Input file '{input_file}' not found")
            return None
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON format in '{input_file}'")
            return None
        except Exception as e:
            print(f"Error updating products: {e}")
            return None

    def update_products_with_seller_info(self, input_file: str, output_file: str) -> Optional[str]:
        """Update products with seller information from input JSON file (sync version). If --update, skip products with sold_history.date."""
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            products = data.get('products', [])

            print(f"Found {len(products)} products in {input_file}")
            print(f"{'='*60}")

            # Check if in-place update (input_file == output_file)
            in_place = input_file == output_file

            # If in-place, skip products with sold_history.date
            if in_place:
                products_to_update = []
                for p in products:
                    sold_history = p.get('sold_history', {})
                    if isinstance(sold_history, dict) and sold_history.get('date'):
                        print(f"Skipping product with existing sold_history.date: {p.get('name', '')}")
                        continue
                    products_to_update.append(p)
                if not products_to_update:
                    print("All products already have sold_history.date. Nothing to update.")
                    return output_file
            else:
                products_to_update = products

            # Run async function to update products
            updated_products = asyncio.run(self._update_products_async(products_to_update))

            # If in-place, merge updated products back into original list
            if in_place:
                idx = 0
                for i, p in enumerate(products):
                    sold_history = p.get('sold_history', {})
                    if isinstance(sold_history, dict) and sold_history.get('date'):
                        print(f"Skipping product with existing sold_history.date: {p.get('name', '')}")
                        continue
                    products[i] = updated_products[idx]
                    idx += 1
                data['products'] = products
            else:
                data['products'] = updated_products

            # Save the data to output file
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

            print(f"\nData saved to: {output_file}")
            print(f"{'='*60}")
            return output_file

        except FileNotFoundError:
            print(f"Error: Input file '{input_file}' not found")
            return None
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON format in '{input_file}'")
            return None
        except Exception as e:
            print(f"Error updating products: {e}")
            return None

    async def _update_products_async(self, products: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Async function to update products with seller information and write after each update"""
        import os
        updated_products = products.copy()

        # Find the output file path from the call stack (hacky, but needed for immediate write)
        import inspect
        output_file = None
        for frame in inspect.stack():
            if 'output_file' in frame.frame.f_locals:
                output_file = frame.frame.f_locals['output_file']
                break
        if not output_file:
            output_file = 'ebay_products_live_update.json'

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=self.headless,
                args=[
                    "--no-sandbox",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-web-security",
                    "--disable-features=VizDisplayCompositor"
                ]
            )

            # Use self.viewport if provided, else default to 1024x500
            if self.viewport and len(self.viewport) == 2:
                try:
                    width = int(self.viewport[0])
                    height = int(self.viewport[1])
                except Exception:
                    width, height = 1024, 500
            else:
                width, height = 1024, 500

            context = await browser.new_context(
                viewport={"width": width, "height": height},
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )

            # Add stealth settings
            await context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined,
                });
            """)

            page = await context.new_page()

            # Set headers
            await page.set_extra_http_headers({
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            })

            try:
                total_products = len(products)
                for i, product in enumerate(products, 1):
                    product_name = product.get('name', 'Unknown Product')
                    product_url = product.get('url', 'No URL')
                    product_price = product.get('price', 'No Price')
                    category = product.get('category', 'Unknown')
                    subcategory = product.get('subcategory', 'Unknown')

                    print(f"Product {i}/{total_products}:")
                    print(f"  Name: {product_name}")
                    print(f"  Price: {product_price}")
                    print(f"  Category: {category} / {subcategory}")
                    print(f"  URL: {product_url}")

                    # Skip products without URLs
                    if not product_url or product_url == 'No URL':
                        print(f"  ⚠️  No URL available, skipping...")
                        updated_products[i-1] = product
                        print(f"  {'-'*40}")
                        continue

                    # Extract seller information
                    seller_info = await self._extract_seller_info(page, product_url)

                    if seller_info:
                        product['seller'] = seller_info.get('seller', '')
                        product['seller_url'] = seller_info.get('seller_url', '')
                        product['sold'] = seller_info.get('sold', 0)
                        print(f"  ✅ Seller: {seller_info.get('seller', 'Not found')}")
                        print(f"  ✅ Seller URL: {seller_info.get('seller_url', 'Not found')}")
                        print(f"  ✅ Sold: {seller_info.get('sold', 0)}")
                    else:
                        product['sold'] = 0
                        print(f"  ❌ Could not extract seller information")

                    # snapshot_date = await find_closest_snapshot(product_url)
                    # if snapshot_date:
                    #     sold_history_data = await open_archived_page(product_url, snapshot_date)
                    #     product['sold_history'] = sold_history_data
                    #     product['sold_history']['period'] = product['sold'] - product['sold_history']['sold']
                    # else:
                    #     product['sold_history'] = {'date': snapshot_date, 'sold': product['sold'], 'period': 0}

                    updated_products[i-1] = product

                    # Write the updated products list to file after each update
                    try:
                        # Try to preserve the original JSON structure if possible
                        if os.path.exists(output_file):
                            with open(output_file, 'r', encoding='utf-8') as f:
                                data = json.load(f)
                            data['products'] = updated_products
                        else:
                            data = {'products': updated_products}
                        with open(output_file, 'w', encoding='utf-8') as f:
                            json.dump(data, f, indent=2, ensure_ascii=False)
                        print(f"  💾 Updated product {i} written to {output_file}")
                    except Exception as e:
                        print(f"  ⚠️  Failed to write product update: {e}")

                    print(f"  {'-'*40}")
                    await asyncio.sleep(1)

            except Exception as e:
                print(f"Error during seller info extraction: {e}")
            finally:
                await browser.close()

        return updated_products

    async def _extract_seller_info(self, page: Page, product_url: str) -> Optional[Dict[str, Any]]:
        """Extract seller information and sold count from a product page"""
        try:
            print(f"  🔍 Extracting seller info from: {product_url}")

            # Navigate to the product page
            response = await page.goto(product_url, wait_until="domcontentloaded", timeout=30000)

            if not response or response.status != 200:
                print(f"  ❌ HTTP {response.status if response else 'no response'} for {product_url}")
                return None

            # Wait for content to load
            await page.wait_for_timeout(3000)

            seller_info = {
                'seller': '',
                'seller_url': '',
                'sold': 0
            }

            # Try multiple selectors for seller information
            seller_selectors = [
                '.x-sellercard-atf__info__about-seller a',
                '.seller-info a',
                '.seller-card a',
                '.seller-name a',
                'a[href*="/str/"]',
                'a[href*="seller"]',
                '.ebay-seller-info a',
                '.seller-profile a'
            ]

            for selector in seller_selectors:
                try:
                    seller_element = await page.query_selector(selector)
                    if seller_element:
                        seller_name = await seller_element.inner_text()
                        if seller_name and seller_name.strip():
                            seller_info['seller'] = seller_name.strip()

                            seller_url = await seller_element.get_attribute('href')
                            if seller_url:
                                if seller_url.startswith('//'):
                                    seller_url = 'https:' + seller_url
                                elif seller_url.startswith('/'):
                                    seller_url = f'https://www.{self.website}' + seller_url
                                seller_info['seller_url'] = seller_url

                            # Extract sold count from #qtyAvailability > last .ux-textspans.ux-textspans--SECONDARY
                            seller_info['sold'] = await get_sold(page)

                            print(f"  ✅ Found seller with selector: {selector}")
                            return seller_info
                except Exception as e:
                    continue

            # If no seller found with main selectors, try alternative approaches
            alternative_selectors = [
                '.seller-info-container',
                '.seller-details',
                '.seller-profile-container',
                '.seller-information'
            ]

            for container_selector in alternative_selectors:
                try:
                    container = await page.query_selector(container_selector)
                    if container:
                        links = await container.query_selector_all('a')
                        for link in links:
                            href = await link.get_attribute('href')
                            if href and ('/str/' in href or 'seller' in href.lower()):
                                seller_name = await link.inner_text()
                                if seller_name and seller_name.strip():
                                    seller_info['seller'] = seller_name.strip()
                                    seller_info['seller_url'] = href
                                    # Extract sold count via the shared get_sold() helper
                                    # (same #qtyAvailability parsing as above)
                                    seller_info['sold'] = await get_sold(page)
                                    print(f"  ✅ Found seller in container: {container_selector}")
                                    return seller_info
                except Exception as e:
                    continue

            print(f"  ❌ No seller information found")
            return None
        except Exception as e:
            print(f"  ❌ Error extracting seller info: {e}")
            return None
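

# Minimal usage sketch for EnhancedEbayProductScraper (illustrative only; argparse is
# imported above, so the actual CLI entry point presumably lives further down this file):
#
#     async def demo():
#         scraper = EnhancedEbayProductScraper(website="ebay.com", headless=True,
#                                              demo_mode=True, num_products=3)
#         await scraper.run("demo_products.json")
#
#     # asyncio.run(demo())
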
async def get_sold(page: Page) -> int:
    """Extract the sold count from the #qtyAvailability element on a listing page."""
    qty_availability = await page.query_selector('#qtyAvailability')
    sold_num = 0
    if qty_availability:
        secondary_spans = await qty_availability.query_selector_all('.ux-textspans.ux-textspans--SECONDARY')
        if secondary_spans:
            last_span = secondary_spans[-1]
            sold_text = await last_span.inner_text()
            match = re.search(r'(\d[\d,.]*)', sold_text)
            if match:
                sold_num = int(match.group(1).replace(',', '').replace('.', ''))
    return sold_num
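
# Illustrative inputs for get_sold() (the exact on-page wording varies by site and locale):
#   "1,234 sold" -> 1234
#   "5 sold"     -> 5
# Note: the regex strips both "," and ".", so European-formatted "1.234" also parses as 1234.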

async def open_browser(playwright) -> BrowserContext:
    """Open a browser using Playwright."""
    context = await playwright.chromium.launch_persistent_context(
        user_data_dir="browser-data",
        headless=False,
        viewport={"width": 1024, "height": 768}
    )
    return context

async def open_page(context: BrowserContext, url: str) -> Page:
    """Open a page in the browser context."""
    page = await context.new_page()
    max_retries = 3
    for attempt in range(max_retries):
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=90000)
            return page
        except Exception as e:
            print(f"[Wayback] Attempt {attempt+1} failed to load {url}: {e}")
            if attempt == max_retries - 1:
                raise
            await asyncio.sleep(1)
    return page

async def close_browser(context: BrowserContext) -> None:
    """Close the browser context."""
    await context.close()
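
# Minimal usage sketch for the three helpers above (assumes a standalone script context;
# the persistent profile is stored in ./browser-data, as configured in open_browser):
#
#     async def example():
#         async with async_playwright() as p:
#             context = await open_browser(p)
#             page = await open_page(context, "https://www.ebay.com/deals/")
#             print(await page.title())
#             await close_browser(context)
#
#     # asyncio.run(example())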


async def find_closest_snapshot(page: Page, url: str, days: int = 60) -> Optional[str]:
    """Return the Wayback Machine snapshot timestamp closest to `days` days ago, or None."""
    from_date = (datetime.now() - timedelta(days=days)).strftime("%Y%m%d")
    to_date = datetime.now().strftime("%Y%m%d")
    archive_url = f"https://web.archive.org/cdx/search/cdx?url={url}&output=json&fl=timestamp,statuscode&collapse=digest&from={from_date}&to={to_date}&filter=statuscode:200"
    print(f"Fetching snapshot list: {archive_url}")

    try:
        await page.goto(archive_url, wait_until="domcontentloaded", timeout=300000)
        pre_content = await page.locator('pre').text_content()
        history_data = json.loads(pre_content)
        if history_data and len(history_data) > 1:
            history_data.pop(0)  # Remove header
            target_date = datetime.now() - timedelta(days=days)
            valid_rows = [row for row in history_data if len(row) > 1 and row[1] == '200']
            if not valid_rows:
                return None
            closest = valid_rows[-1][0]
            min_diff = None
            for row in valid_rows:
                ts = row[0]
                snap_date = datetime.strptime(ts[:8], "%Y%m%d")
                diff = abs((snap_date - target_date).total_seconds())
                if min_diff is None or diff < min_diff:
                    min_diff = diff
                    closest = ts
            return closest
    except Exception as e:
        print(f"Error in find_closest_snapshot: {e}")
        return None
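# The CDX API with output=json returns a list of rows whose first element is a header row,
# e.g. [["timestamp", "statuscode"], ["20250601123456", "200"], ...], which is why the code
# above drops index 0 before filtering. A minimal sketch of the same query made directly with
# aiohttp instead of a browser page (illustrative alternative, not wired into the CLI;
# fetch_snapshot_rows is a hypothetical helper name):
#
#   async def fetch_snapshot_rows(url: str, from_date: str, to_date: str) -> list:
#       cdx = (
#           f"https://web.archive.org/cdx/search/cdx?url={url}&output=json"
#           f"&fl=timestamp,statuscode&collapse=digest&from={from_date}&to={to_date}&filter=statuscode:200"
#       )
#       async with aiohttp.ClientSession() as session:
#           async with session.get(cdx) as resp:
#               rows = await resp.json(content_type=None)  # CDX responses may not declare a JSON content type
#       return rows[1:] if rows else []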


async def get_sold_history(context, url, snapshot_date) -> int:
    """Get the sold count for a product from a Wayback Machine snapshot."""
    archived_url = f"https://web.archive.org/web/{snapshot_date}/{quote(url)}"
    print(f"Fetching sold history from: {archived_url}")
    page = await open_page(context, archived_url)
    sold_data = await get_sold(page)
    print(f"Sold data: {sold_data}")
    await page.close()
    return sold_data

async def open_archived_page(context: BrowserContext, url: str, snapshot_date: str) -> dict:
    """Open an archived product page and return a sold_history record; 'period' and 'formated_date' are filled in by the caller."""
    archived_url = f"https://web.archive.org/web/{snapshot_date}/{quote(url)}"
    print(f"Opening archived URL: {archived_url}")

    try:
        page = await context.new_page()
        await page.goto(archived_url, wait_until="domcontentloaded", timeout=30000)
        sold = await get_sold(page)
        await page.close()
        return {
            'date': snapshot_date,
            'sold': sold,
            'period': 0,
            'formated_date': '',
            # 'url': archived_url
        }
    except Exception as e:
        print(f"Error loading archived page: {e}")
        return {
            'date': snapshot_date,
            'sold': 0,
            'period': 0,
            'formated_date': '',
            # 'url': archived_url
        }
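# After enrichment, each product carries a record shaped like the following (values illustrative):
#
#   "sold_history": {"date": "20250601123456", "sold": 1180, "period": 54, "formated_date": "25-06-01"}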

async def update_sold_history_from_file(input_file: str, output_file: str):
    """Read products from input_file, look up each product's closest Wayback Machine snapshot (default window: last 60 days), and write the enriched data to output_file."""
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        products = data.get('products', [])
        print(f"Updating sold_history for {len(products)} products...")

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=False)
            context = await browser.new_context()
            snapshot_page = await context.new_page()  # Used only for CDX API queries

            for i, product in enumerate(products):
                url = product.get("url")
                if not url:
                    print(f"[{i+1}] Skipped: No URL")
                    continue

                snapshot_date = await find_closest_snapshot(snapshot_page, url)
                if snapshot_date:
                    sold_history = await open_archived_page(context, url, snapshot_date)
                    product['sold_history'] = sold_history
                    product['sold_history']['period'] = product.get('sold', 0) - sold_history.get('sold', 0)
                    product['sold_history']['formated_date'] = datetime.strptime(product['sold_history']['date'][:8], "%Y%m%d").strftime("%y-%m-%d")
                    print(f"[{i+1}] ✅ Done: {product.get('name', '')}")
                else:
                    print(f"[{i+1}] ⚠️ No snapshot found for {url}")
                    product['sold_history'] = {'date': None, 'sold': 0, 'period': 0, 'formated_date': ''}

                await asyncio.sleep(1)

            await snapshot_page.close()
            await context.close()
            await browser.close()

        # Save the results
        data['products'] = products
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"\n✅ Saved updated sold_history to {output_file}")

    except Exception as e:
        print(f"❌ Error updating sold history: {e}")


def main():
    parser = argparse.ArgumentParser(description="Enhanced eBay Product Scraper")
    parser.add_argument(
        "--num-products",
        type=int,
        default=5,
        help="Number of products to scrape per category/website (default: 5)"
    )
    parser.add_argument(
        "--website",
        default="ebay.com",
        help="Website to scrape (default: ebay.com)"
    )
    parser.add_argument(
        "--viewport",
        default="1024x500",
        help="Viewport size in WIDTHxHEIGHT format (default: 1024x500)"
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        help="Run the browser in headless mode (default: visible browser)"
    )
    parser.add_argument(
        "--demo",
        action="store_true",
        help="Enable demo mode (scrape only 5 products)"
    )
    parser.add_argument(
        "--all-websites",
        action="store_true",
        help="Scrape all eBay websites (ebay.com, ebay.de, ebay.co.uk, etc.)"
    )
    parser.add_argument(
        "--output",
        help="Output JSON filename"
    )
    parser.add_argument(
        "--update-seller-info",
        nargs=2,
        metavar=('INPUT_FILE', 'OUTPUT_FILE'),
        help="Update products with seller info from existing JSON file (input_file output_file)"
    )
    parser.add_argument(
        "--update-sold-only",
        metavar='FILENAME',
        help="Update products with sold info in-place in the given JSON file (input=output)"
    )
    parser.add_argument(
        "--update",
        metavar='FILENAME',
        help="Update products with seller info in-place in the given JSON file (input=output)"
    )

    parser.add_argument(
        "--update-sold-history",
        nargs=2,
        metavar=('INPUT_FILE', 'OUTPUT_FILE'),
        help="Update sold_history for products using Wayback Machine"
    )

    args = parser.parse_args()

    # Create scraper instance
    scraper = EnhancedEbayProductScraper(
        website=args.website,
        headless=args.headless,
        demo_mode=args.demo,
        all_websites=args.all_websites,
        viewport=[int(v) for v in args.viewport.split('x')] if args.viewport else None,
        num_products=args.num_products
    )

    # Handle seller info update mode
    if args.update_seller_info:
        input_file, output_file = args.update_seller_info
        result = scraper.update_products_with_seller_info(input_file, output_file)
        if result:
            print(f"\n✅ Successfully updated products! Output saved to: {result}")
        else:
            print("\n❌ Failed to update products")
            sys.exit(1)
        return

    # Handle in-place update mode
    if args.update:
        filename = args.update
        result = scraper.update_products_with_seller_info(filename, filename)
        if result:
            print(f"\n✅ Successfully updated products in-place! Output saved to: {result}")
        else:
            print("\n❌ Failed to update products in-place")
            sys.exit(1)
        return

    if args.update_sold_history:
        input_file, output_file = args.update_sold_history
        asyncio.run(update_sold_history_from_file(input_file, output_file))
        return

    # Run the scraper
    try:
        filename = asyncio.run(scraper.run(args.output))
        if filename:
            print(f"\n✅ Scraping successful! Output saved to: {filename}")
        else:
            print("\n❌ Scraping failed - no data collected")
            sys.exit(1)
    except KeyboardInterrupt:
        print("\n🛑 Scraping interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Scraping failed with error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
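# Example invocations (the script filename is a placeholder):
#
#   python enhanced_ebay_scraper.py --num-products 10 --website ebay.de --output products_de.json
#   python enhanced_ebay_scraper.py --update-sold-history products.json products_with_history.json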
