#!/usr/bin/env python3
import argparse
import os
import urllib.parse
import json
import re
import sys
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import datetime
import random

parser = argparse.ArgumentParser(
    description="Fast eBay scraper: collect search results that include seller_sold_items (up to --limit)")
parser.add_argument("search_term", nargs="+",
                    help="Product keywords (multi-word supported)")

parser.add_argument("--limit", type=int, default=100,
                    help="Target number of items that HAVE seller_sold_items (default 120)")
parser.add_argument("--page_count", type=str, default="240",
                    help="Items per page on eBay (240 recommended)")
parser.add_argument("--max_pages", type=int, default=5,
                    help="Safety cap on pages to scan (default 50)")
parser.add_argument("--timeout", type=int, default=10,
                    help="Per-request timeout seconds (default 10)")
parser.add_argument("--site", type=str, default="ebay.com",
                    help="eBay site (default ebay.com)")
parser.add_argument("--debug", action="store_true", help="Enable debug logs")

# Browser display option
headless_group = parser.add_mutually_exclusive_group()
headless_group.add_argument("--headless", action="store_true",
                           help="Run browser in headless mode")
headless_group.add_argument("--no-headless", action="store_true", default=True,
                           help="Show browser window (default)")

# Optional filters
parser.add_argument("--shipping_location", type=str, default="", nargs='?')
parser.add_argument("--price_min", type=int, default=1, nargs='?')
parser.add_argument("--price_max", type=int, nargs='?')
parser.add_argument("--listing_type", type=int, default=1,
                    nargs='?')  # 1 all, 2 BIN, 3 offer, 4 auction
parser.add_argument("--condition", type=str, default="",
                    nargs='?')     # "new" / "used"
parser.add_argument("--category", type=int, default=0, nargs='?')
parser.add_argument("--exclude", type=str, default="", nargs='?')

args = parser.parse_args()
if isinstance(args.search_term, list):
    args.search_term = " ".join(args.search_term)
args.limit = max(1, int(args.limit))

# Headless mode is off by default (browser window shown); only --headless enables it.

DEBUG_FILE = "debug.txt"
_debug_inited = False

def debug_print(*a):
    if not args.debug:
        return
    global _debug_inited
    if not _debug_inited:
        with open(DEBUG_FILE, "w", encoding="utf-8") as f:
            f.write(f"Debug log {datetime.datetime.now():%Y-%m-%d %H:%M:%S}\n")
        _debug_inited = True
    with open(DEBUG_FILE, "a", encoding="utf-8") as f:
        f.write("[DEBUG] " + " ".join(str(x) for x in a) + "\n")
    print("[DEBUG]", *a)

# Per-page cap (keep 240 when available)
try:
    _requested_per_page = int(args.page_count) if args.page_count else 240
except Exception:
    _requested_per_page = 240
PER_PAGE_CAP = max(1, min(_requested_per_page, 240))
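# Illustrative values for PER_PAGE_CAP given the clamp above:
#   --page_count 240 -> 240, --page_count 500 -> 240, --page_count abc -> 240 (fallback)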

# ---------- small utils ----------
def normalize_count(s: str):
    if not s:
        return None
    t = s.strip().replace(',', '').replace('+', '')
    m = re.match(r'^(\d+(?:\.\d+)?)([kKmM]?)$', t)
    if m:
        num = float(m.group(1))
        suf = m.group(2).lower()
        if suf == 'k':
            num *= 1000
        elif suf == 'm':
            num *= 1000000
        return int(round(num))
    m2 = re.search(r'\d+(?:\.\d+)?', t)
    return int(float(m2.group(0))) if m2 else None
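# normalize_count examples (illustrative, per the regex above):
#   "1.2K" -> 1200, "15,000+" -> 15000, "2M" -> 2000000, "" -> None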

def parse_price_number(text: str):
    if not text:
        return None
    t = text.replace(',', '')
    m = re.search(r'(\d+(?:\.\d+)?)', t)
    return float(m.group(1)) if m else None
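# parse_price_number examples (illustrative): "$19.99" -> 19.99, "EUR 1,234.50" -> 1234.5,
# "$12.99 to $24.99" -> 12.99 (first number wins), "" -> None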

def txt(node):
    return (node.get_text(strip=True) if node else "").strip()

def first(sel_list):
    return sel_list[0] if sel_list else None

# ---------------- URL builder ----------------
def build_url(page=1):
    url = f"https://www.{args.site}/sch/i.html?"
    params = "_nkw=" + urllib.parse.quote(args.search_term)
    if args.shipping_location:
        params += "&LH_PrefLoc=" + str(args.shipping_location)
    if args.price_min:
        params += "&_udlo=" + str(args.price_min)
    if args.price_max:
        params += "&_udhi=" + str(args.price_max)
    if args.listing_type:
        if args.listing_type == 1:
            params += "&LH_All=1"
        elif args.listing_type == 2:
            params += "&LH_BIN=1"
        elif args.listing_type == 3:
            params += "&LH_BO=1"
        elif args.listing_type == 4:
            params += "&LH_Auction=1"
    if args.condition == "new":
        params += "&LH_ItemCondition=3"
    if args.condition == "used":
        params += "&LH_ItemCondition=4"
    if args.category:
        params += "&_sacat=" + str(args.category)
    if args.exclude:
        params += "&_ex_kw=" + urllib.parse.quote(args.exclude)
    if args.page_count:
        params += "&_ipg=" + str(PER_PAGE_CAP)
    if page > 1:
        params += f"&_pgn={page}"
    params += "&_dmd=1"
    return url + params
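# Example output (illustrative, default flags, search term "usb hub", site ebay.com):
#   https://www.ebay.com/sch/i.html?_nkw=usb%20hub&_udlo=1&LH_All=1&_ipg=240&_dmd=1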

# ---------------- Seller & rating helpers ----------------
def extract_seller_and_percent(li):
    seller_name, seller_url, positive_percent = None, "N/A", None

    a_ssn = first(li.select("a[href*='_ssn=']"))
    if a_ssn:
        href = a_ssn.get("href", "")
        q = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
        seller_name = (q.get('_ssn', [None])[0] or txt(a_ssn) or None)
        if seller_name:
            seller_url = f"https://www.{args.site}/sch/i.html?_ssn={seller_name}"

    raw = li.get_text(" ", strip=True)
    mpos = re.search(r'(\d{1,3}(?:\.\d+)?)%\s*positive', raw, flags=re.I)
    if mpos:
        try:
            positive_percent = float(mpos.group(1))
        except:
            positive_percent = None

    if not seller_name:
        mname = re.search(
            r'([A-Za-z0-9._\-]+)\s*(\d{1,3}(?:\.\d+)?)%\s*positive', raw, flags=re.I)
        if mname:
            seller_name = mname.group(1)
            if positive_percent is None:
                try:
                    positive_percent = float(mname.group(2))
                except:
                    pass

    return (seller_name or "N/A", seller_url, positive_percent)

def extract_location_after_located_in(li):
    selectors = [
        "div.s-card__attributes_primary div.s-card__attribute-row span.su-styled-text.secondary.large",
        "div.s-card__attribute-row span.su-styled-text.secondary.large",
        ".su-styled-text.secondary.large",
        ".s-item__itemLocation",
    ]
    for sel in selectors:
        for el in li.select(sel):
            text = el.get_text(" ", strip=True)
            m = re.search(r'located in\s*(.+)$', text, flags=re.I)
            if m:
                loc = m.group(1).strip()
                loc = re.sub(r'[\|•·]+.*$', '', loc).strip()
                if ',' in loc:
                    parts = [p.strip() for p in loc.split(',') if p.strip()]
                    loc = parts[-1] if parts else loc
                loc = re.sub(r'[^A-Za-z ]+', ' ', loc)
                loc = re.sub(r'\s+', ' ', loc).strip()
                return loc or "N/A"

    node = li.find(lambda tag: tag.name in ('span', 'div')
                   and 'located in' in tag.get_text(strip=True).lower())
    if node:
        text = node.get_text(" ", strip=True)
        m = re.search(r'located in\s*(.+)$', text, flags=re.I)
        if m:
            loc = m.group(1).strip()
            loc = re.sub(r'[\|•·]+.*$', '', loc).strip()
            if ',' in loc:
                parts = [p.strip() for p in loc.split(',') if p.strip()]
                loc = parts[-1] if parts else loc
            loc = re.sub(r'[^A-Za-z ]+', ' ', loc)
            loc = re.sub(r'\s+', ' ', loc).strip()
            return loc or "N/A"

    return "N/A"

# ---------------- Parse one listing ----------------
def parse_listing(li):
    item = {}

    # URL & item_id
    a = first(li.select("a.s-item__link")) or first(li.select("a"))
    href = a.get("href", "") if a else ""
    if "/itm/" in href:
        try:
            item_id_str = href.split("/itm/")[1].split("?")[0]
        except:
            item_id_str = ""
        item["url"] = f"https://{args.site}/itm/{item_id_str}" if item_id_str else "N/A"
        if item_id_str.isdigit():
            try:
                item["item_id"] = int(item_id_str)
            except:
                item["item_id"] = item_id_str
        else:
            item["item_id"] = item_id_str
    else:
        item["url"] = "N/A"
        item["item_id"] = ""

    # Title
    t = first(li.select("div.s-card__title .su-styled-text.primary.default")) \
        or first(li.select("h3.s-item__title")) \
        or first(li.select("span[role='heading']"))
    item["title"] = txt(t) if t else "N/A"

    # Price (TEXT with currency)
    p = first(li.select("span.s-item__price")) \
        or first(li.select(".su-card-container__content .su-card-container__attributes div span"))
    price_text = txt(p) if p else ""
    item["price"] = price_text or "N/A"

    # Image
    img = first(li.select("img.s-card__image")) or first(li.select("img.s-item__image-img"))
    item["image"] = img.get("src", "") if img else "N/A"

    # Seller + positive%
    seller_name, seller_url, pos_pct = extract_seller_and_percent(li)
    item["seller"] = seller_name
    item["seller_url"] = seller_url

    if (not item["seller_url"] or item["seller_url"] == "N/A") and item["seller"] and item["seller"] != "N/A":
        item["seller_url"] = f"https://www.{args.site}/sch/i.html?_ssn={urllib.parse.quote_plus(str(item['seller']))}"

    if pos_pct is not None:
        item["seller_positive_percent"] = float(pos_pct)
        item["positive_feedback"] = f"{pos_pct}%"
    else:
        item["positive_feedback"] = "N/A"

    # Shipping
    item["shipping"] = "N/A"
    ship_nodes = li.select(".s-item__shipping, .s-item__logisticsCost")
    for n in ship_nodes:
        s = txt(n)
        if s.startswith("+"):
            item["shipping"] = s.replace(",", ".").split(" ")[0]
            break

    # Location (strictly after 'Located in')
    item["location"] = extract_location_after_located_in(li)

    # Rating + ratings_count (int)
    rating_clip = first(li.select(".x-star-rating .clipped"))
    if rating_clip and "out of" in txt(rating_clip):
        try:
            item["rating"] = float(txt(rating_clip).split(" out of ")[0])
        except:
            item["rating"] = txt(rating_clip).split(" out of ")[0]
    rc = first(li.select(".s-item__reviews-count > span"))
    if rc:
        mrc = re.search(r'\d+', txt(rc))
        if mrc:
            try:
                item["ratings"] = int(mrc.group(0))
            except:
                item["ratings"] = None

    # seller_sold_items -> int
    raw_text = li.get_text(" ", strip=True)
    mss = re.search(r'(\d[\d\.,]*\s*[KMkm]?[\+]?)\s*sold\b', raw_text, flags=re.I)
    if mss:
        num = normalize_count(mss.group(1))
        if num is not None:
            item["seller_sold_items"] = int(num)

    # quantity_sold -> int
    qs = first(li.select("span.s-item__quantitySold"))
    if qs:
        mq = re.search(r'(\d+)', txt(qs))
        if mq:
            try:
                item["quantity_sold"] = int(mq.group(1))
            except:
                item["quantity_sold"] = None

    return item
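# parse_listing returns a dict with keys: url, item_id, title, price, image, seller, seller_url,
# positive_feedback, shipping, location, plus (only when detected) seller_positive_percent,
# rating, ratings, seller_sold_items, and quantity_sold.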

def parse_items_from_html(html, need, require_ssi=True):
    soup = BeautifulSoup(html, "lxml")
    nodes = soup.select("ul.srp-results li.s-item") or soup.select("ul.srp-results li")
    results = []
    for i, li in enumerate(nodes):
        if i == 0 and "Shop on eBay" in li.get_text(" ", strip=True):
            continue
        item = parse_listing(li)
        if require_ssi and item.get("seller_sold_items") is None:
            continue
        if args.debug:
            debug_print(f"Parsed item: {json.dumps(item, ensure_ascii=False)}")
        if item.get("title") != "N/A" and item.get("url") != "N/A" and item.get("seller") != "N/A":
            results.append(item)
        else:
            debug_print("Skipping incomplete item:", json.dumps(item, ensure_ascii=False))
            if args.debug:
                results.append(item)
        if len(results) >= need:
            break
    return results
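# Example call (illustrative): parse_items_from_html(html, need=50) returns at most 50 parsed
# listings that carry seller_sold_items; pass require_ssi=False to keep listings without it.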

# ---------------- Detect protection/challenge pages ----------------
def is_challenge_page(html: str) -> bool:
    if not html:
        return True
    needles = [
        "Checking your browser before you access",
        "Access denied",
        "Attention Required",
        "Ray ID",
        "bot detection",
        "Request unsuccessful. Incapsula",
        "blocked by the site owner",
        "captcha",
        "cloudflare",
        "security check",
        "please wait while we check your browser",
        "ddos protection",
        "unusual traffic"
    ]
    low = html.lower()
    is_challenge = any(s.lower() in low for s in needles)

    # Also check if we have very minimal HTML (likely a block page)
    if len(html.strip()) < 500 and "<html" in low:
        is_challenge = True

    return is_challenge
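# is_challenge_page examples (illustrative): returns True for empty HTML, for pages containing
# markers such as "captcha" or "cloudflare", and for suspiciously small (<500 char) HTML documents.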

# ---------------- Fetch & loop ----------------
def fetch_page_html(req, page_num, nav_page=None, shot_path=None):
    url = build_url(page_num)
    debug_print("GET", url)
    html = ""
    try:
        resp = req.get(url, timeout=args.timeout * 1000, max_redirects=15)
        if resp and resp.status < 400:
            html = resp.text()
    except Exception as e:
        debug_print("req.get error:", e)


    if (not html) or is_challenge_page(html):
        debug_print("Challenge/empty detected -> fallback to real browser")
        if not nav_page:
            return html or ""
        try:
            debug_print("Loading page in browser, please wait...")
            nav_page.goto(url, wait_until="domcontentloaded")

            # Wait longer for initial load
            try:
                nav_page.wait_for_load_state("networkidle", timeout=15000)
            except:
                debug_print("Network idle timeout, continuing...")
                pass

            # Check if still blocked and retry
            try:
                c1 = nav_page.content()
                if is_challenge_page(c1):
                    debug_print("Still detecting challenge, waiting and retrying...")
                    nav_page.wait_for_timeout(5000)  # Wait 5 seconds

                    # Try scrolling to trigger any lazy loading
                    try:
                        nav_page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                        nav_page.wait_for_timeout(2000)
                        nav_page.evaluate("window.scrollTo(0, 0)")
                    except:
                        pass

                    # Final check
                    c2 = nav_page.content()
                    if is_challenge_page(c2):
                        debug_print("Challenge still detected, but proceeding with current content")
            except:
                pass

            html = nav_page.content()
            debug_print(f"Browser content length: {len(html)} chars")
            return html
        except Exception as e:
            debug_print("fallback goto failed:", e)
            return html or ""

    return html
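# fetch_page_html tries the lightweight Playwright request context (plain HTTP) first and falls
# back to the real browser page when the response is empty or looks like a challenge/block page.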

def scrape_until_target(req, nav_page=None, shot_path=None):
    results = []
    page = 1
    while len(results) < args.limit and page <= max(1, int(args.max_pages)):
        remaining = args.limit - len(results)
        per_page_need = min(remaining, PER_PAGE_CAP)

        html = fetch_page_html(req, page, nav_page=nav_page)
        if not html:
            page += 1
            continue

        page_items = parse_items_from_html(html, need=per_page_need, require_ssi=True)
        debug_print(f"Page {page}: Found {len(page_items)} items with seller_sold_items")

        if not page_items and nav_page and not is_challenge_page(html):
            debug_print("No items parsed; retry via browser for this page...")
            try:
                url = build_url(page)
                nav_page.goto(url, wait_until="domcontentloaded")
                try:
                    nav_page.wait_for_load_state("networkidle", timeout=7000)
                except:
                    pass

                try:
                    nav_page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    nav_page.wait_for_timeout(900)
                except:
                    pass
                html2 = nav_page.content()
                page_items = parse_items_from_html(html2, need=per_page_need, require_ssi=True)
            except Exception as e:
                debug_print("Retry via browser failed:", e)
        # Early stop: if first page has no data, return empty immediately
        if not page_items:
            if page == 1:
                debug_print("First page has no data -> stop immediately with empty results")
                return []
            page += 1
            continue

        if page_items:
            results.extend(page_items)

        page += 1

    return results
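# scrape_until_target keeps fetching pages until --limit items with seller_sold_items are
# collected or --max_pages is reached; an empty first page short-circuits to [].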

# ---------------- Main ----------------
user_agents = [
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.216 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.6045.134 Mobile Safari/537.36"
]
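# Example invocation (illustrative; the filename is a placeholder for this script):
#   python3 ebay_scraper.py "usb hub" --limit 50 --max_pages 3 --headless --debug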

def main():
    with sync_playwright() as pw:
        ua = random.choice(user_agents)
        debug_print("Using User-Agent:", ua)


        req = pw.request.new_context(
            extra_http_headers={
                "Accept-Language": "en-US,en;q=0.9",
                "User-Agent": ua
            }
        )


        browser = None
        context = None
        page = None

        try:

            safe_name = f"results_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
            script_dir = os.path.dirname(os.path.abspath(__file__))
            out_base = os.path.join(script_dir, safe_name)
            out_json = out_base + ".json"
            out_png  = out_base + ".png"

            browser = pw.chromium.launch(headless=args.headless, args=['--no-sandbox', '--disable-blink-features=AutomationControlled'])
            context = browser.new_context(
                viewport={"width": 1280, "height": 3000},
                user_agent=ua,
                locale="en-US",
                timezone_id="America/Los_Angeles"
            )
            page = context.new_page()


            items = scrape_until_target(req, nav_page=page)

            data = {"data": []}
            for it in items[:args.limit]:
                out = {
                    "title": it.get("title", ""),
                    "url": it.get("url", ""),
                    "price": it.get("price", ""),
                    "shipping": it.get("shipping", ""),
                    "location": it.get("location", ""),
                    "image": it.get("image", ""),
                    "rating": it.get("rating", ""),
                    "ratings_count": it.get("ratings", None),
                    "quantity_sold": it.get("quantity_sold", None),
                    "seller": it.get("seller", ""),
                    "positive_feedback": it.get("positive_feedback", "N/A"),
                    "seller_url": it.get("seller_url", ""),
                    "country_code": "US",
                    "item_id": it.get("item_id", ""),
                }
                if it.get("seller_positive_percent") is not None:
                    out["seller_positive_percent"] = float(it["seller_positive_percent"])
                if it.get("seller_sold_items") is not None:
                    out["seller_sold_items"] = int(it["seller_sold_items"])
                data["data"].append(out)

            with open(out_json, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False)

            # Prepare output response
            output_response = {"file": out_json}

            # Add full URL and sample content if debug mode is enabled
            if args.debug:
                http_url = "https://ds2025.hopto.org/shopify-api/resources/py/"
                file_name = os.path.basename(out_json)
                output_response["full_url"] = http_url + file_name

                # Add sample of first data item
                try:
                    if data["data"] and len(data["data"]) > 0:
                        first_item = data["data"][0]
                        output_response["sample"] = first_item
                        debug_print(f"Sample first item: {json.dumps(first_item, ensure_ascii=False)[:100]}...")
                    else:
                        output_response["sample"] = []
                        debug_print("No data items found for sample")
                except Exception as e:
                    debug_print(f"Could not extract sample item: {e}")
                    output_response["sample"] = []

                debug_print(f"Results saved to: {out_json}")
                debug_print(f"Full URL: {output_response['full_url']}")

            print(json.dumps(output_response, ensure_ascii=False))

        except Exception as e:
            import traceback
            traceback.print_exc()
        finally:
            try:
                req.dispose()
            except:
                pass
            try:
                if context:
                    context.close()
            except:
                pass
            try:
                if browser:
                    browser.close()
            except:
                pass

if __name__ == "__main__":
    main()
