#!/usr/bin/env python3
"""
Post-process AliExpress products to extract store information
"""

import json
from random import random
import sys
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import time
import re
import os


def extract_store_info_from_product_page(page, product_url):
    """Extract store information from a product page"""
    try:
        print(f"🔍 Processing: {product_url}")

        # Navigate to product page
        page.goto(product_url, timeout=60000, wait_until="domcontentloaded")
        page.wait_for_timeout(3000)

        # Check for captcha and wait if present
        captcha_selectors = [
            '.captcha',
            '#captcha',
            '[class*="captcha"]',
            '[id*="captcha"]',
            '.verify',
            '#verify',
            '[class*="verify"]',
            'iframe[src*="captcha"]',
            '.nc_wrapper',  # Common AliExpress captcha wrapper
            '.bx-captcha'   # Another captcha type
        ]

        captcha_detected = False
        for selector in captcha_selectors:
            if page.locator(selector).count() > 0:
                captcha_detected = True
                break

        if captcha_detected:
            print("🛡️  CAPTCHA detected! Please solve it manually...")
            print("⏳ Waiting for CAPTCHA to be resolved...")
            # Play beep sound to alert user (multiple beeps for attention)
            for _ in range(5):  # Play 5 beeps
                print('\a', end='', flush=True)  # ASCII bell character
                time.sleep(0.3)
            # Try multiple methods to play system sound
            os.system('play -nq -t alsa synth 0.5 sine 1000 2>/dev/null || paplay /usr/share/sounds/freedesktop/stereo/bell.oga 2>/dev/null || beep -f 1000 -l 500 2>/dev/null || printf "\\a"')

            # Wait for captcha to be resolved (check every 5 seconds for up to 5 minutes)
            max_wait_time = 300  # 5 minutes
            wait_interval = 5    # 5 seconds
            elapsed_time = 0

            while elapsed_time < max_wait_time:
                page.wait_for_timeout(wait_interval * 1000)
                elapsed_time += wait_interval

                # Check if captcha is still present
                captcha_still_present = False
                for selector in captcha_selectors:
                    if page.locator(selector).count() > 0:
                        captcha_still_present = True
                        break

                if not captcha_still_present:
                    print("✅ CAPTCHA resolved! Continuing...")
                    break

                print(f"⏳ Still waiting for CAPTCHA... ({elapsed_time}s/{max_wait_time}s)")

            if elapsed_time >= max_wait_time:
                print("⚠️  CAPTCHA wait timeout. Continuing anyway...")

        # Scroll to the bottom of the page to load all content
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000)

        # Get page content after scrolling
        content = page.content()
        soup = BeautifulSoup(content, "lxml")

        store_info = {"store": "", "store_link": "", "store_logo": "", "category": ""}

        # Priority Method: Find store links with /store/ in href (after scrolling to bottom)

        # Find elements with class starting with "store-detail--wrap-"
        store_detail_elements = soup.find_all(class_=lambda x: x and x.startswith('store-detail--wrap-'))
        store_links = []
        for element in store_detail_elements:
            # Find all <a> tags inside these elements
            links = element.find_all('a')
            store_links.extend(links)

        if store_links:
            # Use the first store link found
            store_link_elem = store_links[0]
            store_name = store_link_elem.get_text(strip=True)
            store_href = store_link_elem.get('href', '')

            if store_name:
                store_info["store"] = store_name

            if store_href:
                if not store_href.startswith('https:'):
                    store_href = "https:" + store_href
                store_info["store_link"] = store_href

        # Method 1: Find store name and link from element with class starting with "store-info--name-" (fallback)
        if not store_info["store"] or not store_info["store_link"]:
            store_name_elements = soup.find_all(class_=lambda x: x and x.startswith('store-info--name-'))

            for element in store_name_elements:
                # Find first <a> tag inside this element
                store_link_elem = element.find('a')
                if store_link_elem:
                    store_name = store_link_elem.get_text(strip=True)
                    store_href = store_link_elem.get('href', '')

                    if store_name and not store_info["store"]:
                        store_info["store"] = store_name

                    if store_href and not store_info["store_link"]:
                        if not store_href.startswith('https:'):
                            store_href = "https:" + store_href
                        store_info["store_link"] = store_href

                    break  # Use first match

        # Method 2: Find store logo from element with class starting with "store-info--logo"
        store_logo_elements = soup.find_all(class_=lambda x: x and x.startswith('store-info--logo'))

        for element in store_logo_elements:
            # Check for background-image in style attribute
            style_attr = element.get('style', '')
            if 'background-image' in style_attr:
                # Extract URL from background-image: url("...")
                match = re.search(r'background-image:\s*url\(["\']?(.*?)["\']?\)', style_attr)
                if match:
                    logo_url = match.group(1)
                    if not logo_url.startswith('https:'):
                        logo_url = "https:" + logo_url
                    store_info["store_logo"] = logo_url
                    break

            # Alternative: check for img tag inside
            img_elem = element.find('img')
            if img_elem and img_elem.get('src'):
                logo_url = img_elem['src']
                if not logo_url.startswith('https:'):
                    logo_url = "https:" + logo_url
                store_info["store_logo"] = logo_url
                break

        # Fallback methods if primary methods fail
        if not store_info["store"] or not store_info["store_link"]:
            # Try other store name selectors, prioritizing /store/ links
            fallback_selectors = [
                'a[href*="/store/"]',  # Primary fallback - links containing /store/
                '.store-name a',
                '[class*="store-name"] a',
                '[class*="seller"] a'
            ]

            for selector in fallback_selectors:
                elements = soup.select(selector)
                for elem in elements:
                    if not store_info["store"]:
                        store_name = elem.get_text(strip=True)
                        if store_name:
                            store_info["store"] = store_name

                    if not store_info["store_link"]:
                        href = elem.get('href', '')
                        if href:
                            if not href.startswith('https:'):
                                href = "https:" + href
                            store_info["store_link"] = href

                    # If we found both, break out of both loops
                    if store_info["store"] and store_info["store_link"]:
                        break

                # If we found both, break out of selector loop
                if store_info["store"] and store_info["store_link"]:
                    break




        print(f"✅ Store: {store_info['store'][:30]}... | Logo: {'Yes' if store_info['store_logo'] else 'No'}... | cat: {store_info['category']}")
        return store_info

    except Exception as e:
        print(f"❌ Error processing {product_url}: {e}")
        return {"store": "", "store_link": "", "store_logo": ""}


def fix_store_information(json_file_path):
    """Fix store information in the JSON file"""

    print("🚀 Starting Store Information Fix")
    print("=" * 50)

    # Load the JSON file
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            products = json.load(f)
        print(f"📂 Loaded {len(products)} products from {json_file_path}")
    except FileNotFoundError:
        print(f"❌ File {json_file_path} not found!")
        return
    except json.JSONDecodeError as e:
        print(f"❌ Error parsing JSON: {e}")
        return

    # Launch browser
    with sync_playwright() as p:
        try:
            browser = p.chromium.launch_persistent_context(
                'chrome-data',
                headless=False,  # Set to False for visible mode
                args=[
                    "--disable-gpu",
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-web-security"
                ]
            )
            page = browser.new_page()
        except Exception as e:
            print(f"⚠️  Failed to launch persistent context: {e}")
            print("🔄 Trying regular browser launch...")
            browser = p.chromium.launch(
                headless=False,  # Set to False for visible mode
                args=[
                    "--disable-gpu",
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-web-security"
                ]
            )
            page = browser.new_page()

        updated_count = 0
        filtered_products = []

        for i, product in enumerate(products, 1):
            product_link = product.get('link', '')

            if not product_link:
                print(f"⚠️  Product {i}/{len(products)}: No product link found")
                continue

            # Skip if store name is "Google Play"
            if product.get('store') == "Google Play":
                print(f"🚫 Product {i}/{len(products)}: Skipping Google Play store")
                continue

            # Skip if store info already exists
            if product.get('store') and product.get('store_link'):
                print(f"⏭️  Product {i}/{len(products)}: Store info already exists")
                filtered_products.append(product)
                continue

            print(f"\n🔄 Processing Product {i}/{len(products)}")

            # Extract store information
            store_info = extract_store_info_from_product_page(page, product_link)

            # Check if extracted store is "Google Play" and skip if so
            if store_info.get("store") == "Google Play":
                print(f"🚫 Product {i}: Skipping Google Play store (extracted)")
                continue

            # Update product with store information
            if store_info["store"] or store_info["store_link"] or store_info["store_logo"]:
                product.update(store_info)
                updated_count += 1
                print(f"✅ Updated product {i}")
                filtered_products.append(product)
            else:
                print(f"⚠️  No store info found for product {i}")
                filtered_products.append(product)

            # Extract specifications from the loaded page's HTML content
            html_content = page.content()
            product['category'] = extract_specifications(html_content)


            # Add small delay to avoid overwhelming the server
            # Add random delay between 20 and 90 seconds
            sleep_time = .5
            print(f"😴 Sleeping for {sleep_time:.1f} seconds...")
            time.sleep(sleep_time)

        try:
            browser.close()
        except Exception as e:
            print(f"⚠️  Warning during browser close: {e}")

    # Save updated JSON
    output_file = json_file_path
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(filtered_products, f, indent=2, ensure_ascii=False)
        print(f"\n✅ Updated JSON saved to: {output_file}")
        print(f"📊 Successfully updated {updated_count}/{len(products)} products")
        print(f"📋 Filtered products count: {len(filtered_products)}/{len(products)}")
    except Exception as e:
        print(f"❌ Error saving file: {e}")


def extract_specifications(html):
    """Extract the product category from an ``og:category`` meta tag.

    Generalized over the original: matches the tag regardless of whether
    the ``property`` attribute comes before or after ``content``.

    Args:
        html: Raw HTML of a product page as a string.

    Returns:
        The category string, or None when no og:category meta tag is found.
    """
    pattern = (
        r'<meta\s+(?:property=["\']og:category["\']\s+content=["\']([^"\']+)["\']'
        r'|content=["\']([^"\']+)["\']\s+property=["\']og:category["\'])'
    )
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        # Exactly one of the two alternation groups matched.
        return match.group(1) or match.group(2)
    return None

def main():
    """Entry point: resolve the target JSON file and start the fix run."""
    default_file = "aliexpress_trending_products.json"
    # First CLI argument overrides the default target file.
    json_file = sys.argv[1] if len(sys.argv) > 1 else default_file

    print(f"🎯 Target file: {json_file}")
    fix_store_information(json_file)


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
