#!/usr/bin/env python3
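"""
Scrape seller and availability details for one or more eBay items by ID.

Example invocations (script name and item IDs are placeholders):

    python scraper.py 123456789012
    python scraper.py 123456789012,234567890123 --save --headless
    python scraper.py 123456789012 --output-mode stream --browser firefox
"""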

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import sys
import random
import json
import re
import os
from datetime import datetime

all_sellers = []

def send_sse_event(event_type, data):
    """
    Write one Server-Sent Events (SSE) formatted message to stdout.
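
    Each call writes:

        event: <event_type>
        data: <JSON-encoded payload>

    followed by the blank line that terminates an SSE message.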
    """
    print(f"event: {event_type}")
    print(f"data: {json.dumps(data, ensure_ascii=False)}")
    print()  # Empty line required for SSE format
    sys.stdout.flush()

def user_agent_rotator():
    user_agents = [
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.216 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.6045.134 Mobile Safari/537.36"
    ]
    return random.choice(user_agents)

def extract_availability_signal_from_html(html):
    """
    Extract availability information from availabilitySignal JSON structure in HTML
    Handles formats like "More than 10 available" and "32,800 sold"
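
    Illustrative shape of the fragment being matched (an assumption inferred
    from the regexes below, not a verbatim eBay payload):

        "availabilitySignal": {"textSpans": [{"text": "More than 10 available"},
                                             {"text": "32,800 sold"}]}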
    """
    availability_info = {}

    # Search for availabilitySignal JSON structure
    availability_pattern = r'"availabilitySignal":\s*{[^}]*"textSpans":\s*\[[^\]]*\]'
    availability_match = re.search(availability_pattern, html)

    if availability_match:
        availability_section = availability_match.group(0)

        # Extract all text values from textSpans
        text_pattern = r'"text"\s*:\s*"([^"]*)"'
        text_matches = re.findall(text_pattern, availability_section)

        for text in text_matches:
            # Look for available quantities.
            # "10 available", "More than 10 available" and "Last one /
            # More than 10 available" are all covered by one pattern, since
            # re.search matches anywhere in the span and the "More than"
            # prefix is optional.
            available_patterns = [
                r'(?:More than\s+)?(\d+(?:,\d{3})*(?:\.\d+)?)\s+available'
            ]

            for pattern in available_patterns:
                available_match = re.search(pattern, text, re.IGNORECASE)
                if available_match:
                    # Remove commas and periods, convert to int
                    number_str = available_match.group(1).replace(',', '').replace('.', '')
                    try:
                        availability_info["available"] = int(number_str)
                        break
                    except ValueError:
                        continue

            # Look for sold quantities
            # Patterns: "32,800 sold" (US separators), "32.800 sold" (European separators)
            sold_patterns = [
                r'(\d+(?:,\d{3})*(?:\.\d+)?)\s+sold',
                r'(\d+(?:\.\d{3})*(?:,\d+)?)\s+sold',
            ]

            for pattern in sold_patterns:
                sold_match = re.search(pattern, text, re.IGNORECASE)
                if sold_match:
                    # Remove commas and periods, convert to int
                    number_str = sold_match.group(1).replace(',', '').replace('.', '')
                    try:
                        availability_info["sold"] = int(number_str)
                        break
                    except ValueError:
                        continue

    # Fallback to old HTML span patterns if availabilitySignal not found
    if not availability_info:
        # Search for available quantity pattern in HTML spans
        available_pattern = r'<span class="ux-textspans[^"]*">\s*(?:More than\s+)?(\d+(?:,\d{3})*)\s+available\s*</span>'
        available_match = re.search(available_pattern, html, re.IGNORECASE)

        if available_match:
            number_str = available_match.group(1).replace(',', '')
            try:
                availability_info["available"] = int(number_str)
            except ValueError:
                pass

        # Search for sold quantity pattern in HTML spans
        sold_pattern = r'<span class="ux-textspans[^"]*">\s*(\d+(?:,\d{3})*)\s+sold\s*</span>'
        sold_match = re.search(sold_pattern, html, re.IGNORECASE)

        if sold_match:
            number_str = sold_match.group(1).replace(',', '')
            try:
                availability_info["sold"] = int(number_str)
            except ValueError:
                pass

    # Return the availability info if we found anything, otherwise None
    return availability_info if availability_info else None

def extract_store_link_from_html(html):
    """
    Extract store link from HTML content using the action URL pattern
    """
    # Pattern for store URL in action
    # ,"action":{"_type":"Action","URL":"https://www.ebay.de/str/maschinenwelt?_trksid=p4429486.m168239.l149267"
    store_link_pattern = r'"action"\s*:\s*{\s*"_type"\s*:\s*"Action"\s*,\s*"URL"\s*:\s*"(https://www\.ebay\.[^/]+/str/[^"?]+)'

    store_match = re.search(store_link_pattern, html)
    if store_match:
        return store_match.group(1)

    # Alternative pattern for simpler store links
    alt_pattern = r'"URL"\s*:\s*"(https://www\.ebay\.[^/]+/str/[^"?]+)'
    alt_match = re.search(alt_pattern, html)
    if alt_match:
        return alt_match.group(1)

    return None

def extract_country_from_location(location):
    """
    Extract country from location string
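
    e.g. "Goleta, CA, United States" -> "United States"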
    """
    if not location:
        return None

    # Take the last comma-separated part as the country; split() always
    # yields at least one element, so no empty-list check is needed
    return location.split(',')[-1].strip()

def extract_images_from_html(soup):
    """
    Extract all image URLs from img[src] elements inside .ux-image-carousel
    """
    images = []

    # Find all images in the carousel
    carousel_images = soup.select('.ux-image-carousel img[src]')

    for img in carousel_images:
        src = img.get('src')
        if src:
            images.append(src)

    return images

def extract_main_image_from_html(soup):
    """
    Extract the src of the first image inside the first .ux-image-carousel container
    """
    # Get the first carousel and its first image
    first_carousel = soup.select_one('.ux-image-carousel')
    if first_carousel:
        first_img = first_carousel.select_one('img[src]')
        if first_img:
            return first_img.get('src')

    return None

def extract_categories_from_html(soup):
    """
    Extract categories from the first .breadcrumbs.breadcrumb--overflow container only
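
    Returns a list of {"name": ..., "link": ...} dicts, with relative hrefs
    resolved against https://www.ebay.com.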
    """
    categories = []

    # Find first breadcrumb container and its links
    first_breadcrumb = soup.select_one('.breadcrumbs.breadcrumb--overflow')
    if first_breadcrumb:
        breadcrumb_links = first_breadcrumb.select('ul li a')

        for link in breadcrumb_links:
            name = link.get_text(strip=True)
            href = link.get('href')

            if name and href:
                # Convert relative URLs to absolute
                if href.startswith('/'):
                    href = 'https://www.ebay.com' + href

                categories.append({
                    "name": name,
                    "link": href
                })

    return categories

def extract_product_name_from_html(soup):
    """
    Extract product name from first h1 element
    """
    h1_element = soup.select_one('h1')
    if h1_element:
        return h1_element.get_text(strip=True)

    return None

def extract_item_location_from_html(html):
    """
    Extract item location information from HTML content using the itemLocation pattern
    """
    # Pattern based on the actual structure found in eBay pages:
    # "itemLocation":{"_type":"LabelsValues","labels":[{"_type":"TextualDisplay","textSpans":[{"_type":"TextSpan","text":"Location"}]}],"values":[{"_type":"TextualDisplay","textSpans":[{"_type":"TextSpan","text":"Goleta, CA, United States"}]}]}

    # Main pattern for itemLocation in JSON - step by step approach
    # First find the itemLocation section, then extract the location text
    item_location_section = re.search(r'"itemLocation":\{.*?\}\}\}', html)
    if item_location_section:
        section_text = item_location_section.group(0)
        # Now find the location text within this section
        location_match = re.search(r'"values":\[.*?"text":"([^"]+)"', section_text)
        if location_match:
            return location_match.group(1)

    # Alternative simpler pattern for itemLocation
    simple_pattern = r'"itemLocation"[^}]*?"text"\s*:\s*"([^"]+)"'
    simple_match = re.search(simple_pattern, html)
    if simple_match:
        location = simple_match.group(1)
        # Make sure it's not shipping information by checking length and content
        if (len(location) < 100 and
            (',' in location or any(country in location for country in ['United States', 'Deutschland', 'Germany', 'France', 'Italia', 'España', 'Canada', 'UK'])) and
            not ('Afghanistan' in location or 'Albania' in location)):  # Avoid shipping country lists
            return location

    # HTML pattern for the location spans
    html_pattern = r'ux-labels-values--itemLocation[^>]*>[^<]*<[^>]*>Location[^<]*</[^>]*>[^<]*<[^>]*>[^<]*<[^>]*>([^<]+)</span>'
    html_match = re.search(html_pattern, html)
    if html_match:
        return html_match.group(1)

    # More flexible HTML pattern
    flexible_html_pattern = r'class="ux-textspans">Location</span>[^<]*</[^>]*>[^<]*<[^>]*>[^<]*<[^>]*><[^>]*class="ux-textspans">([^<]+)</span>'
    flexible_match = re.search(flexible_html_pattern, html)
    if flexible_match:
        return flexible_match.group(1)

    # Pattern specifically for German locations (Standort)
    standort_pattern = r'"itemLocation"[^}]*?"text"\s*:\s*"Standort"[^}]*?"text"\s*:\s*"([^"]+Deutschland[^"]*)"'
    standort_match = re.search(standort_pattern, html)
    if standort_match:
        return standort_match.group(1)

    return None

def get_seller_from_url(seller_link):
    """
    Extract seller name from seller link URL
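
    e.g. "https://www.ebay.de/str/maschinenwelt" -> "maschinenwelt"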
    """
    if not seller_link:
        return None

    match = re.search(r'/str/([^/?]+)', seller_link)
    if match:
        return match.group(1)

    return None

def extract_seller_info(soup, html, item_id, output_mode='all'):
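    """
    Collect per-item fields (seller name, availability, location, images,
    categories) from the parsed page, then either stream the record as an
    SSE 'data' event or append it to the global all_sellers batch,
    depending on output_mode.
    """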
    seller = None

    seller_selectors = [
        ".x-sellercard-atf__info a span.ux-textspans",
        ".x-sellercard-atf_info.about-seller a span.ux-textspans",
        "div.x-sellercard-atf__about-seller a span.ux-textspans",
        "a.ux-action span.ux-textspans--BOLD"
    ]
    for sel in seller_selectors:
        tag = soup.select_one(sel)
        if tag and tag.get_text(strip=True):
            seller = tag.get_text(strip=True)
            break

    # Extract availability signal directly from HTML
    availability_info = extract_availability_signal_from_html(html)

    # Extract item location from HTML
    item_location = extract_item_location_from_html(html)

    # Extract store link from HTML (will be renamed to seller_link)
    seller_link = extract_store_link_from_html(html)

    # Extract country from location
    country = extract_country_from_location(item_location)

    # Extract individual availability values
    available = availability_info.get("available") if availability_info else None
    sold = availability_info.get("sold") if availability_info else None

    # Extract images and categories
    main_image = extract_main_image_from_html(soup)
    images = extract_images_from_html(soup)
    categories = extract_categories_from_html(soup)

    # Extract product name
    product_name = extract_product_name_from_html(soup)

    # Create URL
    url = f"https://www.ebay.com/itm/{item_id}"

    # Fall back to the name embedded in the store URL when no seller
    # element was found on the page
    seller_info = {
        item_id: {
            "product_name": product_name,
            "url": url,
            "seller_name": seller if seller else get_seller_from_url(seller_link),
            "seller_link": seller_link,
            "available": available,
            "sold": sold,
            "item_location": item_location,
            "country": country,
            "image": main_image,
            "images": images,
            "categories": categories
        }
    }

    # Handle output based on mode
    if output_mode == 'stream':
        # Send immediately as SSE data event
        send_sse_event('data', seller_info)
    else:
        # Add to global list for batch output
        all_sellers.append(seller_info)

def main():
    # Check if user provided ID
    if len(sys.argv) < 2:
        print("Please provide at least one item ID")
        sys.exit(1)

    # Check for flags
    save_to_file = False
    headless_mode = False  # Default to showing browser (more stable)
    output_mode = 'all'  # Default output mode
    browser_choice = 'chromium'  # Default browser
    args = sys.argv[1:]

    if '--save' in args:
        save_to_file = True
        args.remove('--save')

    if '--headless' in args:
        headless_mode = True  # Hide browser when --headless flag is passed
        args.remove('--headless')

    # Handle --output-mode parameter
    if '--output-mode' in args:
        output_mode_index = args.index('--output-mode')
        if output_mode_index + 1 < len(args):
            output_mode = args[output_mode_index + 1]
            if output_mode not in ['stream', 'all']:
                print("Error: --output-mode must be either 'stream' or 'all'")
                sys.exit(1)
            # Remove both the flag and its value
            args.pop(output_mode_index)  # Remove the flag
            args.pop(output_mode_index)  # Remove the value (now at same index)
        else:
            print("Error: --output-mode requires a value (stream or all)")
            sys.exit(1)

    # Handle --browser parameter (chromium|firefox)
    if '--browser' in args:
        browser_index = args.index('--browser')
        if browser_index + 1 < len(args):
            browser_choice = args[browser_index + 1].lower()
            if browser_choice not in ['chromium', 'firefox']:
                print("Error: --browser must be either 'chromium' or 'firefox'")
                sys.exit(1)
            # Remove both the flag and its value
            args.pop(browser_index)
            args.pop(browser_index)
        else:
            print("Error: --browser requires a value (chromium or firefox)")
            sys.exit(1)

    if len(args) == 0:
        print("Please provide at least one item ID")
        sys.exit(1)

    item_ids = [item.strip() for item in args[0].split(",") if item.strip()]

    with sync_playwright() as p:
        # Configure browser arguments based on headless mode
        browser_args = []
        if browser_choice == 'chromium':
            browser_args = [
                '--disable-web-security',
                '--disable-features=VizDisplayCompositor',
                '--no-first-run',
                '--disable-default-apps',
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
                '--no-sandbox'
            ]

        # Add additional stability arguments for headless mode
        if headless_mode and browser_choice == 'chromium':
            browser_args.extend([
                '--disable-gpu',
                '--disable-software-rasterizer',
                '--disable-background-timer-throttling',
                '--disable-backgrounding-occluded-windows',
                '--disable-renderer-backgrounding',
                '--disable-features=TranslateUI'
            ])

        browser_launcher = p.chromium if browser_choice == 'chromium' else p.firefox

        browser = browser_launcher.launch(
            headless=headless_mode,
            args=browser_args
        )
        context = browser.new_context(
            user_agent=user_agent_rotator(),
            ignore_https_errors=True,
            bypass_csp=True,
            java_script_enabled=True,
            extra_http_headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Sec-Fetch-Dest': 'document',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'none',
                'Sec-Fetch-User': '?1'
            }
        )

        def allow_request(route):
            """Skip non-essential assets to speed up navigation."""
            req = route.request
            if req.resource_type in {"document", "script", "xhr", "fetch", "image", "stylesheet"}:
                route.continue_()
            else:
                route.abort()

        context.route("**/*", allow_request)

        # Single page reused for all items to avoid tab churn
        page = context.new_page()

        # Mask the navigator.webdriver flag that automated browsers expose
        page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined,
            });
        """)

        try:
            for i, item_id in enumerate(item_ids):
                # Send processing status in stream mode
                if output_mode == 'stream':
                    send_sse_event('debug', {
                        'message': f'Processing item {i+1}/{len(item_ids)}: {item_id}',
                        'progress': {
                            'current': i + 1,
                            'total': len(item_ids),
                            'item_id': item_id
                        }
                    })

                url = f"https://www.ebay.com/itm/{item_id}"

                # Navigate to the item page (redirects are followed automatically)
                try:
                    page.goto(
                        url,
                        wait_until="networkidle",
                        timeout=15000  # Faster failover
                    )

                    # Wait for main content instead of fixed sleep
                    try:
                        page.wait_for_selector("h1", timeout=8000)
                    except Exception:
                        pass  # If it never appears, continue with whatever is loaded

                    html = page.content()

                    soup = BeautifulSoup(html, "lxml")
                    extract_seller_info(soup, html, item_id, output_mode)

                except Exception as e:
                    if output_mode == 'stream':
                        send_sse_event('debug', {'message': f'Error loading page for item {item_id}: {e}'})
                    else:
                        print(f"Error loading page for item {item_id}: {e}")
                    # Still add entry even if error occurred
                    extract_seller_info(BeautifulSoup("", "lxml"), "", item_id, output_mode)

        finally:
            # Close browser after all items are processed
            browser.close()

    # Final output: a completion event in stream mode, JSON batch otherwise
    if output_mode == 'stream':
        # Send completion event for stream mode
        send_sse_event('complete', {
            'event': 'stream_complete',
            'message': 'All items processed',
            'total_items': len(item_ids)
        })
    else:
        if save_to_file:
            # Create files directory if it doesn't exist
            files_dir = os.path.join(os.getcwd(), 'files')
            os.makedirs(files_dir, exist_ok=True)

            # Generate filename with timestamp
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"ebay_data_{timestamp}.json"
            file_path = os.path.join(files_dir, filename)

            # Save JSON to file
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(all_sellers, f, indent=2, ensure_ascii=False)

            # Output the file path info to terminal
            print(json.dumps({"file": file_path}, indent=2))
        else:
            # Print to terminal as before
            print(json.dumps(all_sellers, indent=2))

if __name__ == "__main__":
    main()
