Spaces:

lakkiroy
/

git-chat

Sleeping

App Files Files Community

Gemini CLI committed on Aug 12, 2025

Commit

a96d5ce

1 Parent(s): c6def7d

Deploying project

Browse files

Files changed (8) hide show

.env +0 -7
README.md +0 -1
config.py +0 -19
main.py +0 -5
modules/__init__.py +0 -0
modules/chrome_utils.py +0 -313
modules/file_utils.py +0 -89
modules/scraper_utils.py +0 -46

.env DELETED Viewed

@@ -1,7 +0,0 @@
-PROXY=socks5://geo.iproyal.com:51228
-CSV_FILE=/Users/arif/shopee_url.csv
-JSON_FILE=api_responses.json
-RESPONSE_DIR=responses
-RESOURCE_CACHE_FILE=resource_cache.json
-CHROME_PATH=/Applications/Google Chrome.app/Contents/MacOS/Google Chrome
-RETRIES=3

README.md DELETED Viewed

	@@ -1 +0,0 @@
1	- # shopee-crawler

config.py DELETED Viewed

@@ -1,19 +0,0 @@
-# config.py
-import os
-from dotenv import load_dotenv
-# Populate os.environ from a local .env file before any setting is read
-load_dotenv()
-# Settings taken from the environment; each is None when its variable is unset
-PROXY = os.environ.get("PROXY")
-CSV_FILE = os.environ.get("CSV_FILE")
-JSON_FILE = os.environ.get("JSON_FILE")
-RESPONSE_DIR = os.environ.get("RESPONSE_DIR")
-RESOURCE_CACHE_FILE = os.environ.get("RESOURCE_CACHE_FILE")
-CHROME_PATH = os.environ.get("CHROME_PATH")
-# Number of navigation attempts per URL; falls back to 3 when RETRIES is unset
-RETRIES = int(os.environ.get("RETRIES", "3"))
-# Constants that are fixed in code rather than read from the environment
-MAX_WORKERS = 8
-BASE_PORT = 9222

main.py DELETED Viewed

@@ -1,5 +0,0 @@
-# main.py
-from modules.scraper_utils import run_scraper
-# Start the scraper only when this file is executed as a script, not when imported
-if __name__ == "__main__":
-    run_scraper()

modules/__init__.py DELETED Viewed

File without changes

modules/chrome_utils.py DELETED Viewed

@@ -1,313 +0,0 @@
-import os
-import time
-import json
-import random
-import base64
-import subprocess
-import requests
-import mimetypes
-import shutil
-import tempfile
-from urllib.parse import urlparse
-import pychrome
-from config import RETRIES
-from modules.file_utils import save_response_to_file
-from config import (RESOURCE_CACHE_FILE, RESPONSE_DIR, CHROME_PATH)
-# Global for data usage
-total_network_data = 0
-def launch_chrome(debug_port, user_data_dir):
-    """
-    Launch Chrome with remote debugging and a dedicated user data directory.
-
-    Args:
-        debug_port: Port passed to Chrome's --remote-debugging-port flag.
-        user_data_dir: Profile directory passed to --user-data-dir.
-
-    Returns:
-        The subprocess.Popen handle of the started Chrome process.
-    """
-    cmd = [
-        CHROME_PATH,
-        f"--remote-debugging-port={debug_port}",
-        f"--user-data-dir={user_data_dir}",
-        "--disable-web-security",
-        "--no-first-run",
-        # Add or remove additional flags as needed
-    ]
-    # Nothing in this module reads Chrome's output. With PIPE the child blocks
-    # once the OS pipe buffer fills, so the output is discarded instead.
-    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-def load_resource_cache():
-    """
-    Read the resource cache dictionary stored at RESOURCE_CACHE_FILE.
-
-    When the file does not exist yet it is created holding an empty JSON
-    object. Any error while reading an existing file is printed and an
-    empty dict is returned instead.
-    """
-    if os.path.exists(RESOURCE_CACHE_FILE):
-        try:
-            with open(RESOURCE_CACHE_FILE, "r", encoding="utf-8") as cache_file:
-                cached = json.load(cache_file)
-                print(f"[CACHE] Loaded {len(cached)} resources from {RESOURCE_CACHE_FILE}")
-                return cached
-        except Exception as e:
-            print(f"[CACHE] Error loading cache: {e}")
-            return {}
-    # First run: seed the file so later loads find valid JSON
-    with open(RESOURCE_CACHE_FILE, "w", encoding="utf-8") as cache_file:
-        json.dump({}, cache_file)
-    return {}
-def save_resource_cache(resource_cache):
-    """
-    Save the resource cache to RESOURCE_CACHE_FILE as indented JSON.
-
-    The data is written to a temporary file in the same directory and then
-    moved over the real file with os.replace, so a failure part-way through
-    never leaves a truncated cache behind. Errors are printed, not raised.
-
-    Args:
-        resource_cache: Mapping of URL to cached raw response string.
-    """
-    tmp_path = None
-    try:
-        # Serialize a copy: the intercept callbacks may add entries to the
-        # live dict while this function is running.
-        snapshot = dict(resource_cache)
-        cache_dir = os.path.dirname(os.path.abspath(RESOURCE_CACHE_FILE))
-        fd, tmp_path = tempfile.mkstemp(dir=cache_dir, suffix=".tmp")
-        with os.fdopen(fd, "w", encoding="utf-8") as f:
-            json.dump(snapshot, f, indent=2)
-        os.replace(tmp_path, RESOURCE_CACHE_FILE)
-        tmp_path = None  # the temp file is now the cache file; nothing to clean up
-        print(f"[CACHE] Saved {len(snapshot)} resources to {RESOURCE_CACHE_FILE}")
-    except Exception as e:
-        print(f"[CACHE] Error saving cache: {e}")
-    finally:
-        if tmp_path is not None:
-            # Best-effort removal of a temp file left by a failed save
-            try:
-                os.remove(tmp_path)
-            except OSError:
-                pass
-def setup_tab(browser, debug_port, resource_cache):
-    """
-    Create a new tab, attach PyChrome callbacks, enable network & set up intercepts.
-
-    Args:
-        browser: Browser object used to open the tab (browser.new_tab()).
-        debug_port: Not used inside this function; kept for callers.
-        resource_cache: Dict of URL -> base64 raw HTTP response. It is read
-            on cache hits and updated in place on cache misses.
-
-    Returns:
-        The started tab, with cookies/cache cleared, image URLs blocked and
-        request interception enabled for every URL.
-    """
-    tab = browser.new_tab()
-    # Per-request bookkeeping shared by the callbacks below, keyed by requestId
-    requests_dict = {}
-    def on_request_will_be_sent(**kwargs):
-        """Record each outgoing request so later events can be matched to its URL."""
-        request_id = kwargs.get("requestId")
-        request_obj = kwargs.get("request", {})
-        url = request_obj.get("url", "")
-        headers = request_obj.get("headers", {})
-        # Remove possible undesired headers
-        # NOTE(review): this edits only the event's dict; nothing here sends
-        # the modified headers back to Chrome.
-        for h in ["X-Forwarded-For", "Via", "Forwarded", "X-Amzn-Trace-Id"]:
-            headers.pop(h, None)
-        requests_dict[request_id] = {
-            "url": url,
-            "fromDiskCache": False,
-            "servedFromLocalCache": False,
-            "status": None,
-            "headers": {}
-        }
-        print(f"Request sent {request_id} => {url}")
-    def on_response_received(**kwargs):
-        """Copy cache flag, status and headers of a response into requests_dict."""
-        request_id = kwargs.get("requestId")
-        response = kwargs.get("response", {})
-        from_disk_cache = response.get("fromDiskCache", False)
-        if request_id in requests_dict:
-            requests_dict[request_id]["fromDiskCache"] = from_disk_cache
-            requests_dict[request_id]["status"] = response.get("status")
-            requests_dict[request_id]["headers"] = response.get("headers", {})
-    def on_loading_finished(**kwargs):
-        """Account transferred bytes and save the item-detail API response."""
-        global total_network_data
-        request_id = kwargs.get("requestId")
-        if request_id not in requests_dict:
-            return
-        encoded_data_length = kwargs.get("encodedDataLength", 0)
-        req_info = requests_dict[request_id]
-        url = req_info.get("url", "")
-        from_disk = req_info.get("fromDiskCache", False)
-        from_local = req_info.get("servedFromLocalCache", False)
-        # Only requests served by neither cache count towards network usage
-        if not from_disk and not from_local:
-            total_network_data += encoded_data_length
-            print(f"[DATA] Request {request_id} => {encoded_data_length} bytes => {url}")
-            print(f"Total net data (not from cache): {total_network_data/1024:.2f} KB")
-        # If this is the Shopee API for item detail:
-        if "api/v4/pdp/get_pc" in url:
-            try:
-                result = tab.Network.getResponseBody(requestId=request_id)
-                body = result.get("body", "")
-                if result.get("base64Encoded", False):
-                    body = base64.b64decode(body).decode("utf-8", errors="replace")
-                print("==========================================")
-                print(f"Response for: {url}")
-                print(body)
-                print("==========================================\n")
-                # Save valid responses
-                if body.startswith('{"bff_meta":null'):
-                    save_response_to_file(body, url, RESPONSE_DIR)
-                else:
-                    print("❌ Response does not match filter criteria. Skipping.")
-            except Exception as e:
-                print(f"Error retrieving body for {url}: {e}")
-    def on_request_intercepted(interceptionId=None, authChallenge=None, request=None, **kwargs):
-        """
-        Intercept requests to serve from local cache or fetch them manually.
-
-        Every path must end in continueInterceptedRequest; otherwise the
-        intercepted request stays paused.
-        """
-        global total_network_data
-        if not request:
-            tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
-            return
-        request_id = kwargs.get("requestId")
-        url = request.get("url", "")
-        headers = request.get("headers", {})
-        content_type = headers.get("Content-Type", "")
-        # Handle Basic Auth or Proxy Auth
-        if authChallenge:
-            print("Auth challenge detected. Providing credentials (example).")
-            tab.Network.continueInterceptedRequest(
-                interceptionId=interceptionId,
-                authChallengeResponse={
-                    "response": "ProvideCredentials",
-                    "username": "example_username",
-                    "password": "example_password",
-                }
-            )
-            return
-        # Classify the request as a static resource by file extension or
-        # by the Content-Type header of the request
-        parsed = urlparse(url)
-        path = parsed.path.lower()
-        exts_js   = [".js", ".json"]
-        exts_img  = [".png", ".jpg", ".jpeg", ".gif", ".webp"]
-        exts_css  = [".css"]
-        is_js   = any(path.endswith(e) for e in exts_js) or "application/javascript" in content_type
-        is_img  = any(path.endswith(e) for e in exts_img) or content_type.startswith("image/")
-        is_css  = any(path.endswith(e) for e in exts_css) or content_type == "text/css"
-        cache_this = is_js or is_img or is_css
-        # If we want to cache static resources:
-        if cache_this:
-            # Cache hit
-            if url in resource_cache:
-                print(f"⚡ [CACHE-HIT] Request {request_id}: {url}")
-                if request_id in requests_dict:
-                    requests_dict[request_id]["servedFromLocalCache"] = True
-                cached_raw_response = resource_cache[url]
-                try:
-                    tab.Network.continueInterceptedRequest(
-                        interceptionId=interceptionId,
-                        rawResponse=cached_raw_response
-                    )
-                except Exception as e:
-                    # Fall back to letting Chrome perform the request itself
-                    print(f"⚠️ [CACHE ERROR] {e}")
-                    tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
-                return
-            # Cache miss - fetch manually
-            else:
-                print(f"🌐 [CACHE-MISS] {url}")
-                try:
-                    r = requests.get(url, timeout=20)
-                    if r.status_code != 200:
-                        print(f"❌ Resource fetch status: {r.status_code}")
-                        tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
-                        return
-                    raw_data = r.content
-                    fetched_size = len(raw_data)
-                    total_network_data += fetched_size
-                    final_ct = r.headers.get("Content-Type", None)
-                    if not final_ct:
-                        guess_type, _ = mimetypes.guess_type(url)
-                        final_ct = guess_type if guess_type else "application/octet-stream"
-                    # Build a complete raw HTTP response (status line, headers,
-                    # blank line, body) and base64-encode it for rawResponse
-                    response_str = (
-                        "HTTP/1.1 200 OK\r\n"
-                        f"Content-Type: {final_ct}\r\n"
-                        "Cache-Control: public, max-age=31536000\r\n"
-                        f"Content-Length: {len(raw_data)}\r\n"
-                        "\r\n"
-                    )
-                    combined = response_str.encode("utf-8") + raw_data
-                    raw_response = base64.b64encode(combined).decode("utf-8")
-                    # Update cache
-                    resource_cache[url] = raw_response
-                    # Persist every 20th entry rather than on every miss
-                    if len(resource_cache) % 20 == 0:
-                        save_resource_cache(resource_cache)
-                    if request_id in requests_dict:
-                        requests_dict[request_id]["servedFromLocalCache"] = True
-                    tab.Network.continueInterceptedRequest(
-                        interceptionId=interceptionId,
-                        rawResponse=raw_response
-                    )
-                    print("✅ Resource fetched & cached.")
-                    return
-                except Exception as e:
-                    print(f"❌ Error fetching resource: {e}")
-                    tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
-                    return
-        else:
-            # Not caching, just continue
-            tab.Network.continueInterceptedRequest(interceptionId=interceptionId, headers=headers)
-    # Attach the callbacks
-    tab.Network.requestWillBeSent = on_request_will_be_sent
-    tab.Network.responseReceived  = on_response_received
-    tab.Network.loadingFinished   = on_loading_finished
-    tab.start()
-    tab.Network.enable()
-    # Clear cookies, caches
-    tab.Network.clearBrowserCookies()
-    tab.Network.clearBrowserCache()
-    tab.Storage.clearDataForOrigin(
-        origin="https://shopee.tw",
-        storageTypes="all"
-    )
-    tab.Network.setBlockedURLs(urls=[
-        "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.svg",
-    ])
-    # Register the intercept handler BEFORE interception is switched on, so a
-    # request paused right after enabling always has a handler to resume it.
-    tab.Network.requestIntercepted = on_request_intercepted
-    # Intercept all requests
-    tab.Network.setRequestInterception(patterns=[{"urlPattern": "*"}])
-    return tab
-def scrape_url(url, debug_port, csv_file, response_dir, resource_cache):
-    """
-    Launch a separate Chrome instance and navigate it to url, retrying on failure.
-
-    Responses are captured by the callbacks installed in setup_tab while this
-    function sleeps after navigation. Up to RETRIES attempts are made. Chrome
-    is always terminated and its temporary profile removed before returning.
-
-    Args:
-        url: Page to open.
-        debug_port: Remote debugging port for this Chrome instance.
-        csv_file: Not used inside this function; kept for callers.
-        response_dir: Not used inside this function; kept for callers.
-        resource_cache: Shared resource cache dict handed to setup_tab.
-    """
-    temp_dir = tempfile.mkdtemp(prefix="chrome_profile_")
-    print(f"[Thread-{debug_port}] Launching Chrome for {url} | Profile: {temp_dir}")
-    chrome_proc = None
-    try:
-        # Launch inside the try so a failed launch still removes temp_dir
-        chrome_proc = launch_chrome(debug_port, temp_dir)
-        time.sleep(5)  # Wait for Chrome to launch
-        browser_url = f"http://127.0.0.1:{debug_port}"
-        for attempt in range(1, RETRIES + 1):
-            browser = None
-            tab = None
-            try:
-                print(f"[Thread-{debug_port}] Attempt {attempt}/{RETRIES} - Navigating to {url}")
-                browser = pychrome.Browser(url=browser_url)
-                tab = setup_tab(browser, debug_port, resource_cache)
-                # Example random sleep before navigation
-                time.sleep(random.uniform(20, 40))
-                tab.Page.navigate(url=url)
-                time.sleep(random.uniform(60, 110))  # Wait for page to load
-                # If navigation succeeds, break the retry loop
-                print(f"[Thread-{debug_port}] Successfully navigated to {url}")
-                break
-            except Exception as e:
-                print(f"[Thread-{debug_port}] Error scraping {url}: {e}")
-                # Close the tab of the failed attempt so retries do not pile up tabs
-                if browser is not None and tab is not None:
-                    try:
-                        browser.close_tab(tab)
-                    except Exception as close_err:
-                        print(f"[Thread-{debug_port}] Could not close tab: {close_err}")
-                if attempt < RETRIES:
-                    wait_time = random.uniform(30, 60)
-                    print(f"[Thread-{debug_port}] Retrying in {wait_time:.2f} seconds...")
-                    time.sleep(wait_time)
-                else:
-                    print(f"[Thread-{debug_port}] ❌ Max retries reached for {url}. Skipping.")
-    # Cleanup after finishing attempts
-    finally:
-        if chrome_proc is not None:
-            try:
-                chrome_proc.kill()
-                # Reap the process so it does not linger as a zombie and has
-                # released the profile directory before it is deleted
-                chrome_proc.wait(timeout=10)
-            except (OSError, subprocess.TimeoutExpired) as e:
-                print(f"[Thread-{debug_port}] Chrome did not shut down cleanly: {e}")
-        shutil.rmtree(temp_dir, ignore_errors=True)
-        print(f"[Thread-{debug_port}] Finished scraping {url}")

modules/file_utils.py DELETED Viewed

@@ -1,89 +0,0 @@
-# modules/file_utils.py
-import os
-import re
-import json
-import pandas as pd
-from urllib.parse import urlparse, parse_qs
-def load_urls_from_csv(csv_file):
-    """Return the non-empty values of the first column of a header-less CSV file.
-
-    An empty list is returned when the file is missing or cannot be parsed.
-    """
-    if os.path.exists(csv_file):
-        try:
-            frame = pd.read_csv(csv_file, header=None)
-            first_column = frame[0].dropna()
-            urls = first_column.tolist()
-            print(f"✅ Loaded {len(urls)} URLs from {csv_file}")
-            return urls
-        except Exception as e:
-            print(f"❌ Error reading CSV: {e}")
-            return []
-    print(f"❌ Error: CSV file '{csv_file}' not found.")
-    return []
-def generate_filename_from_url_request(url):
-    """Build '<domain>---i.<shop_id>.<item_id>.json' from an API URL's query string.
-
-    A missing shop_id or item_id parameter is replaced by 'unknown'.
-    """
-    parts = urlparse(url)
-    params = parse_qs(parts.query)
-    # parse_qs maps each key to a list of values; the first one is used
-    shop_id = params.get("shop_id", ["unknown"])[0]
-    item_id = params.get("item_id", ["unknown"])[0]
-    return f"{parts.netloc}---i.{shop_id}.{item_id}.json"
-def generate_filename_from_url(url):
-    """Build '<domain>---i.<shop_id>.<item_id>.json' from a '---i.<shop>.<item>' URL path.
-
-    Returns None (after printing a warning) when the path has no such pattern.
-    """
-    parts = urlparse(url)
-    found = re.search(r"---i\.(\d+)\.(\d+)", parts.path)
-    if not found:
-        print(f"⚠️ Unable to parse shop_id/item_id from URL: {url}")
-        return None
-    shop_id = found.group(1)
-    item_id = found.group(2)
-    return f"{parts.netloc}---i.{shop_id}.{item_id}.json"
-def load_existing_responses(response_dir):
-    """List the entries of response_dir as a set, creating the directory if absent."""
-    if os.path.exists(response_dir):
-        return set(os.listdir(response_dir))
-    # Nothing saved yet: create the directory and report it as empty
-    os.makedirs(response_dir)
-    return set()
-def save_response_to_file(response_text, url, response_dir):
-    """
-    Save API response as a JSON file in the response_dir.
-    Skips if file already exists.
-
-    The directory is created when missing, and the file is opened in
-    exclusive-create mode so two threads saving the same item cannot both
-    pass an existence check and overwrite each other.
-
-    Args:
-        response_text: Response body to write.
-        url: API URL the response came from; determines the filename.
-        response_dir: Directory the file is written into.
-    """
-    filename = generate_filename_from_url_request(url)
-    if not filename:
-        print(f"⚠️ Cannot generate filename for URL: {url}")
-        return
-    file_path = os.path.join(response_dir, filename)
-    if response_dir:
-        try:
-            os.makedirs(response_dir, exist_ok=True)
-        except OSError as e:
-            print(f"❌ Error saving response for {url}: {e}")
-            return
-    try:
-        # Mode "x" fails with FileExistsError instead of overwriting
-        with open(file_path, "x", encoding="utf-8") as f:
-            f.write(response_text)
-        print(f"✅ Response saved: {file_path}")
-    except FileExistsError:
-        print(f"⚠️ Skipping duplicate: {filename} (Already exists)")
-    except Exception as e:
-        print(f"❌ Error saving response for {url}: {e}")
-def find_missing_urls(csv_file, response_dir):
-    """Return the CSV URLs whose expected response file is not in response_dir.
-
-    URLs for which no filename can be derived are left out of the result.
-    """
-    saved = load_existing_responses(response_dir)
-    missing_urls = []
-    for candidate in load_urls_from_csv(csv_file):
-        expected_name = generate_filename_from_url(candidate)
-        if not expected_name or expected_name in saved:
-            continue
-        missing_urls.append(candidate)
-    print(f"🔍 {len(missing_urls)} URLs are missing from {response_dir}.")
-    return missing_urls
-def pick_random_url(csv_file, response_dir):
-    """Return a randomly chosen missing URL, or None when nothing is missing."""
-    import random
-    candidates = find_missing_urls(csv_file, response_dir)
-    return random.choice(candidates) if candidates else None

modules/scraper_utils.py DELETED Viewed

@@ -1,46 +0,0 @@
-# modules/scraper_utils.py
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from modules.file_utils import load_existing_responses, find_missing_urls
-from modules.chrome_utils import scrape_url, load_resource_cache, save_resource_cache
-from config import (CSV_FILE, RESPONSE_DIR, BASE_PORT, MAX_WORKERS)
-def run_scraper():
-    """
-    Scrape every CSV URL that has no saved response yet, using parallel Chrome workers.
-
-    At most min(MAX_WORKERS, number of URLs) URLs are processed at once. Debug
-    ports come from a fixed pool starting at BASE_PORT, one per worker, so the
-    port number stays bounded however many URLs the CSV holds. The resource
-    cache is saved once more after all tasks finish.
-    """
-    import queue  # needed only for the port pool below
-    # Print how many responses we already have
-    existing_responses = load_existing_responses(RESPONSE_DIR)
-    print(f"Total responses in '{RESPONSE_DIR}': {len(existing_responses)}")
-    urls_to_scrape = find_missing_urls(CSV_FILE, RESPONSE_DIR)
-    if not urls_to_scrape:
-        print("No missing URLs to process.")
-        return
-    resource_cache = load_resource_cache()
-    max_workers = min(MAX_WORKERS, len(urls_to_scrape))
-    # One port per worker. The pool holds as many ports as there are worker
-    # threads, so get() never has to wait for a port to be returned.
-    free_ports = queue.Queue()
-    for offset in range(max_workers):
-        free_ports.put(BASE_PORT + offset)
-    def scrape_with_pooled_port(url):
-        """Borrow a debug port, scrape one URL, and always return the port."""
-        debug_port = free_ports.get()
-        try:
-            scrape_url(url, debug_port, CSV_FILE, RESPONSE_DIR, resource_cache)
-        finally:
-            free_ports.put(debug_port)
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = [
-            executor.submit(scrape_with_pooled_port, url)
-            for url in urls_to_scrape
-        ]
-        for future in as_completed(futures):
-            try:
-                future.result()
-            except Exception as exc:
-                print(f"[Thread] Exception: {exc}")
-    # After all threads complete, save updated resource cache
-    save_resource_cache(resource_cache)
-    print("All scraping tasks completed.")