Spaces:

lakkiroy
/

git-chat

Sleeping

App Files Files Community

Arif committed on Mar 11, 2025

Commit

c6def7d

1 Parent(s): 2c9bf12

Added scraper

Browse files

Files changed (8) hide show

.env +7 -0
README.md +0 -1
config.py +19 -0
main.py +5 -0
modules/__init__.py +0 -0
modules/chrome_utils.py +313 -0
modules/file_utils.py +89 -0
modules/scraper_utils.py +46 -0

.env ADDED Viewed

	@@ -0,0 +1,7 @@

+PROXY=socks5://geo.iproyal.com:51228
+CSV_FILE=/Users/arif/shopee_url.csv
+JSON_FILE=api_responses.json
+RESPONSE_DIR=responses
+RESOURCE_CACHE_FILE=resource_cache.json
+CHROME_PATH=/Applications/Google Chrome.app/Contents/MacOS/Google Chrome
+RETRIES=3

README.md CHANGED Viewed

	@@ -1,2 +1 @@
1	# shopee-crawler
2	- # shopee-crawler


1	# shopee-crawler

config.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# config.py
+# Central configuration for the scraper: values read from the process
+# environment (after loading a local .env file) plus two fixed constants.
+import os
+from dotenv import load_dotenv
+# Load environment variables from .env
+load_dotenv()
+# Retrieve environment variables.
+# os.getenv returns None for every variable below that is not set, except
+# RETRIES, which has an explicit default.
+# NOTE(review): PROXY is read here but is not referenced by the modules shown
+# alongside this file - confirm where (or whether) it is applied.
+PROXY = os.getenv("PROXY")
+CSV_FILE = os.getenv("CSV_FILE")
+JSON_FILE = os.getenv("JSON_FILE")
+RESPONSE_DIR = os.getenv("RESPONSE_DIR")
+RESOURCE_CACHE_FILE = os.getenv("RESOURCE_CACHE_FILE")
+CHROME_PATH = os.getenv("CHROME_PATH")
+# Converted to int; falls back to 3 when RETRIES is unset.
+RETRIES = int(os.getenv("RETRIES", 3))
+# You can define additional constants or logic here
+# MAX_WORKERS caps the thread pool size in modules/scraper_utils.py;
+# BASE_PORT is the first Chrome remote-debugging port handed out there.
+MAX_WORKERS = 8
+BASE_PORT = 9222

main.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# main.py
+# Script entry point: starts the scraper when this file is executed directly.
+from modules.scraper_utils import run_scraper
+if __name__ == "__main__":
+    # The guard keeps the scraper from starting when main.py is only imported.
+    run_scraper()

modules/__init__.py ADDED Viewed

File without changes

modules/chrome_utils.py ADDED Viewed

	@@ -0,0 +1,313 @@

+import os
+import time
+import json
+import random
+import base64
+import subprocess
+import requests
+import mimetypes
+import shutil
+import tempfile
+from urllib.parse import urlparse
+import pychrome
+from config import RETRIES
+from modules.file_utils import save_response_to_file
+from config import (RESOURCE_CACHE_FILE, RESPONSE_DIR, CHROME_PATH)
+# Global for data usage
+total_network_data = 0
+def launch_chrome(debug_port, user_data_dir):
+    """
+    Launch Chrome with remote debugging and a dedicated user data directory.
+
+    Args:
+        debug_port: TCP port passed to ``--remote-debugging-port``.
+        user_data_dir: Profile directory passed to ``--user-data-dir``.
+
+    Returns:
+        The ``subprocess.Popen`` handle of the started Chrome process.
+    """
+    cmd = [
+        CHROME_PATH,
+        f"--remote-debugging-port={debug_port}",
+        f"--user-data-dir={user_data_dir}",
+        "--disable-web-security",
+        "--no-first-run",
+        # Add or remove additional flags as needed
+    ]
+    # Nothing ever reads Chrome's stdout/stderr. With subprocess.PIPE the OS
+    # pipe buffer can fill up and block the browser, so discard the output.
+    return subprocess.Popen(
+        cmd,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+def load_resource_cache():
+    """
+    Read the resource cache (URL -> stored raw response) from RESOURCE_CACHE_FILE.
+
+    When the file does not exist yet, an empty JSON object is written to it and
+    an empty dict is returned. Any error while reading or parsing an existing
+    file is printed and also yields an empty dict.
+    """
+    if os.path.exists(RESOURCE_CACHE_FILE):
+        try:
+            with open(RESOURCE_CACHE_FILE, "r", encoding="utf-8") as cache_fp:
+                data = json.load(cache_fp)
+                print(f"[CACHE] Loaded {len(data)} resources from {RESOURCE_CACHE_FILE}")
+                return data
+        except Exception as e:
+            print(f"[CACHE] Error loading cache: {e}")
+            return {}
+    # First run: create an empty cache file so later loads find valid JSON.
+    with open(RESOURCE_CACHE_FILE, "w", encoding="utf-8") as cache_fp:
+        json.dump({}, cache_fp)
+    return {}
+def save_resource_cache(resource_cache):
+    """
+    Persist the resource cache to RESOURCE_CACHE_FILE as indented JSON.
+
+    The cache dict is shared with the scraping threads, which add entries while
+    this function may be running, and several threads may call it at once.
+    To avoid serialising a dict that is changing, and to avoid leaving a
+    half-written cache behind, a snapshot is written to a temporary file in the
+    same directory and then moved over the target with ``os.replace``.
+
+    Args:
+        resource_cache: Mapping of URL -> base64 raw response to save.
+
+    Errors are printed, never raised (best-effort save).
+    """
+    try:
+        # Copy first so json.dump iterates a dict nobody else mutates.
+        snapshot = dict(resource_cache)
+        target_dir = os.path.dirname(os.path.abspath(RESOURCE_CACHE_FILE))
+        # NOTE(review): mkstemp creates the file with owner-only permissions,
+        # so the saved cache may end up more restrictive than before.
+        fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
+        try:
+            with os.fdopen(fd, "w", encoding="utf-8") as f:
+                json.dump(snapshot, f, indent=2)
+            # Same-directory rename: readers see the old or the new file,
+            # never a truncated one.
+            os.replace(tmp_path, RESOURCE_CACHE_FILE)
+        except Exception:
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+            raise
+        print(f"[CACHE] Saved {len(snapshot)} resources to {RESOURCE_CACHE_FILE}")
+    except Exception as e:
+        print(f"[CACHE] Error saving cache: {e}")
+def setup_tab(browser, debug_port, resource_cache):
+    """
+    Create a new tab, attach PyChrome callbacks, enable network & set up intercepts.
+    Returns the tab object.
+
+    Args:
+        browser: Object whose ``new_tab()`` creates the tab (a pychrome Browser
+            in the caller shown in this module).
+        debug_port: Not used inside this function.
+        resource_cache: Dict mapping a resource URL to a base64-encoded raw
+            HTTP response; read on cache hits and updated on cache misses.
+
+    Side effects: updates the module-level ``total_network_data`` counter,
+    clears the browser's cookies and cache, and may write responses and the
+    resource cache to disk from within the callbacks.
+    """
+    tab = browser.new_tab()
+    # Per-tab bookkeeping keyed by requestId, shared by all callbacks below.
+    requests_dict = {}
+    def on_request_will_be_sent(**kwargs):
+        # Registers every outgoing request so later events can look up its URL.
+        request_id = kwargs.get("requestId")
+        request_obj = kwargs.get("request", {})
+        url = request_obj.get("url", "")
+        headers = request_obj.get("headers", {})
+        # Remove possible undesired headers
+        # NOTE(review): this only mutates the header dict taken from the event;
+        # nothing in this handler sends the modified headers back to the browser.
+        for h in ["X-Forwarded-For", "Via", "Forwarded", "X-Amzn-Trace-Id"]:
+            headers.pop(h, None)
+        requests_dict[request_id] = {
+            "url": url,
+            "fromDiskCache": False,
+            "servedFromLocalCache": False,
+            "status": None,
+            "headers": {}
+        }
+        print(f"Request sent {request_id} => {url}")
+    def on_response_received(**kwargs):
+        # Copies cache flag, status and response headers onto the tracked request.
+        request_id = kwargs.get("requestId")
+        response = kwargs.get("response", {})
+        from_disk_cache = response.get("fromDiskCache", False)
+        if request_id in requests_dict:
+            requests_dict[request_id]["fromDiskCache"] = from_disk_cache
+            requests_dict[request_id]["status"] = response.get("status")
+            requests_dict[request_id]["headers"] = response.get("headers", {})
+    def on_loading_finished(**kwargs):
+        # Accounts transferred bytes and captures the item-detail API response.
+        nonlocal tab
+        global total_network_data
+        request_id = kwargs.get("requestId")
+        if request_id not in requests_dict:
+            return
+        encoded_data_length = kwargs.get("encodedDataLength", 0)
+        req_info = requests_dict[request_id]
+        url = req_info.get("url", "")
+        from_disk = req_info.get("fromDiskCache", False)
+        from_local = req_info.get("servedFromLocalCache", False)
+        # Only count bytes for requests not flagged as served from a cache.
+        if not from_disk and not from_local:
+            total_network_data += encoded_data_length
+            print(f"[DATA] Request {request_id} => {encoded_data_length} bytes => {url}")
+            print(f"Total net data (not from cache): {total_network_data/1024:.2f} KB")
+        # If this is the Shopee API for item detail:
+        if "api/v4/pdp/get_pc" in url:
+            try:
+                result = tab.Network.getResponseBody(requestId=request_id)
+                body = result.get("body", "")
+                # The body may arrive base64-encoded; decode it to text.
+                if result.get("base64Encoded", False):
+                    body = base64.b64decode(body).decode("utf-8", errors="replace")
+                print("==========================================")
+                print(f"Response for: {url}")
+                print(body)
+                print("==========================================\n")
+                # Save valid responses
+                # Only bodies starting with this exact prefix are written to disk.
+                if body.startswith('{"bff_meta":null'):
+                    save_response_to_file(body, url, RESPONSE_DIR)
+                else:
+                    print("❌ Response does not match filter criteria. Skipping.")
+            except Exception as e:
+                print(f"Error retrieving body for {url}: {e}")
+    def on_request_intercepted(interceptionId=None, authChallenge=None, request=None, **kwargs):
+        """
+        Intercept requests to serve from local cache or fetch them manually.
+
+        Every path through this handler ends by calling
+        ``continueInterceptedRequest`` for ``interceptionId``, either unmodified,
+        with credentials, or with a prepared raw response.
+        """
+        nonlocal resource_cache
+        global total_network_data
+        if not request:
+            tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
+            return
+        request_id = kwargs.get("requestId")
+        url = request.get("url", "")
+        headers = request.get("headers", {})
+        # Note: this is the Content-Type of the *request* headers.
+        content_type = headers.get("Content-Type", "")
+        # Handle Basic Auth or Proxy Auth
+        # NOTE(review): the credentials below are literal placeholders.
+        if authChallenge:
+            print("Auth challenge detected. Providing credentials (example).")
+            tab.Network.continueInterceptedRequest(
+                interceptionId=interceptionId,
+                authChallengeResponse={
+                    "response": "ProvideCredentials",
+                    "username": "example_username",
+                    "password": "example_password",
+                }
+            )
+            return
+        # Classify the request by lower-cased path extension or content type.
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+        exts_js   = [".js", ".json"]
+        exts_img  = [".png", ".jpg", ".jpeg", ".gif", ".webp"]
+        exts_css  = [".css"]
+        is_js   = any(path.endswith(e) for e in exts_js) or "application/javascript" in content_type
+        is_img  = any(path.endswith(e) for e in exts_img) or content_type.startswith("image/")
+        is_css  = any(path.endswith(e) for e in exts_css) or content_type == "text/css"
+        cache_this = is_js or is_img or is_css
+        # If we want to cache static resources:
+        if cache_this:
+            # Cache hit
+            if url in resource_cache:
+                print(f"⚡ [CACHE-HIT] Request {request_id}: {url}")
+                if request_id in requests_dict:
+                    requests_dict[request_id]["servedFromLocalCache"] = True
+                cached_raw_response = resource_cache[url]
+                try:
+                    tab.Network.continueInterceptedRequest(
+                        interceptionId=interceptionId,
+                        rawResponse=cached_raw_response
+                    )
+                except Exception as e:
+                    # Serving the cached response failed: continue unmodified.
+                    print(f"⚠️ [CACHE ERROR] {e}")
+                    tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
+                return
+            # Cache miss - fetch manually
+            else:
+                print(f"🌐 [CACHE-MISS] {url}")
+                try:
+                    # The fetch is made from this Python process via requests,
+                    # with no headers taken from the intercepted request.
+                    r = requests.get(url, timeout=20)
+                    if r.status_code != 200:
+                        print(f"❌ Resource fetch status: {r.status_code}")
+                        tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
+                        return
+                    raw_data = r.content
+                    fetched_size = len(raw_data)
+                    total_network_data += fetched_size
+                    final_ct = r.headers.get("Content-Type", None)
+                    if not final_ct:
+                        guess_type, _ = mimetypes.guess_type(url)
+                        final_ct = guess_type if guess_type else "application/octet-stream"
+                    # Build a complete raw HTTP response (status line, headers,
+                    # blank line, body) and base64-encode it for rawResponse.
+                    response_str = (
+                        "HTTP/1.1 200 OK\r\n"
+                        f"Content-Type: {final_ct}\r\n"
+                        "Cache-Control: public, max-age=31536000\r\n"
+                        f"Content-Length: {len(raw_data)}\r\n"
+                        "\r\n"
+                    )
+                    combined = response_str.encode("utf-8") + raw_data
+                    raw_response = base64.b64encode(combined).decode("utf-8")
+                    # Update cache
+                    resource_cache[url] = raw_response
+                    # Persist whenever the cache size is a multiple of 20.
+                    if len(resource_cache) % 20 == 0:
+                        save_resource_cache(resource_cache)
+                    # The bytes were already added to total_network_data above;
+                    # this flag makes on_loading_finished skip counting them again.
+                    if request_id in requests_dict:
+                        requests_dict[request_id]["servedFromLocalCache"] = True
+                    tab.Network.continueInterceptedRequest(
+                        interceptionId=interceptionId,
+                        rawResponse=raw_response
+                    )
+                    print("✅ Resource fetched & cached.")
+                    return
+                except Exception as e:
+                    print(f"❌ Error fetching resource: {e}")
+                    tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
+                    return
+        else:
+            # Not caching, just continue
+            tab.Network.continueInterceptedRequest(interceptionId=interceptionId, headers=headers)
+    # Attach the callbacks
+    tab.Network.requestWillBeSent = on_request_will_be_sent
+    tab.Network.responseReceived  = on_response_received
+    tab.Network.loadingFinished   = on_loading_finished
+    tab.start()
+    tab.Network.enable()
+    # Clear cookies, caches
+    tab.Network.clearBrowserCookies()
+    tab.Network.clearBrowserCache()
+    # Storage is cleared for this one hard-coded origin only.
+    tab.Storage.clearDataForOrigin(
+        origin="https://shopee.tw",
+        storageTypes="all"
+    )
+    # NOTE(review): these image patterns overlap with the image extensions the
+    # interception handler tries to cache - confirm which one takes effect.
+    tab.Network.setBlockedURLs(urls=[
+        "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.svg",
+    ])
+    # Intercept all requests
+    # NOTE(review): the handler is attached on the line after interception is
+    # enabled; confirm no request can be intercepted in between.
+    tab.Network.setRequestInterception(patterns=[{"urlPattern": "*"}])
+    tab.Network.requestIntercepted = on_request_intercepted
+    return tab
+def scrape_url(url, debug_port, csv_file, response_dir, resource_cache):
+    """
+    Scrape one URL in its own Chrome instance.
+
+    A fresh Chrome process with a temporary profile is started on
+    ``debug_port``; an instrumented tab (see ``setup_tab``) navigates to ``url``
+    and is left open for a randomised period so its network callbacks can
+    capture responses. Navigation is retried up to RETRIES times.
+
+    Args:
+        url: Page to open.
+        debug_port: Remote-debugging port for this Chrome instance; also used
+            as the label in log lines.
+        csv_file: Unused; kept so existing callers keep working.
+        response_dir: Unused here; kept so existing callers keep working.
+        resource_cache: Shared URL -> raw response cache passed to ``setup_tab``.
+
+    Returns:
+        None. Failures of individual attempts are printed, not raised; a
+        failure to launch Chrome propagates after cleanup.
+    """
+    temp_dir = tempfile.mkdtemp(prefix="chrome_profile_")
+    print(f"[Thread-{debug_port}] Launching Chrome for {url} | Profile: {temp_dir}")
+    chrome_proc = None
+    try:
+        # Launch inside the try so the temp profile is removed even if the
+        # browser cannot be started.
+        chrome_proc = launch_chrome(debug_port, temp_dir)
+        time.sleep(5)  # Wait for Chrome to launch
+        browser_url = f"http://127.0.0.1:{debug_port}"
+        attempt = 0
+        while attempt < RETRIES:
+            browser = None
+            tab = None
+            succeeded = False
+            try:
+                print(f"[Thread-{debug_port}] Attempt {attempt + 1}/{RETRIES} - Navigating to {url}")
+                browser = pychrome.Browser(url=browser_url)
+                tab = setup_tab(browser, debug_port, resource_cache)
+                # Example random sleep before navigation
+                time.sleep(random.uniform(20, 40))
+                tab.Page.navigate(url=url)
+                time.sleep(random.uniform(60, 110))  # Wait for page to load
+                print(f"[Thread-{debug_port}] Successfully navigated to {url}")
+                succeeded = True
+            except Exception as e:
+                print(f"[Thread-{debug_port}] Error scraping {url}: {e}")
+            finally:
+                # Stop the tab's event handling and close it, so a retry does
+                # not leave the previous tab and its callbacks running.
+                if tab is not None:
+                    try:
+                        tab.stop()
+                        browser.close_tab(tab)
+                    except Exception as cleanup_err:
+                        print(f"[Thread-{debug_port}] Tab cleanup failed: {cleanup_err}")
+            if succeeded:
+                break
+            attempt += 1
+            if attempt < RETRIES:
+                wait_time = random.uniform(30, 60)
+                print(f"[Thread-{debug_port}] Retrying in {wait_time:.2f} seconds...")
+                time.sleep(wait_time)
+            else:
+                print(f"[Thread-{debug_port}] ❌ Max retries reached for {url}. Skipping.")
+    # Cleanup after finishing attempts
+    finally:
+        if chrome_proc is not None:
+            try:
+                chrome_proc.kill()
+                # Reap the process so it does not linger as a zombie and has
+                # released the profile directory before it is deleted.
+                chrome_proc.wait(timeout=10)
+            except Exception as stop_err:
+                print(f"[Thread-{debug_port}] Error stopping Chrome: {stop_err}")
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        print(f"[Thread-{debug_port}] Finished scraping {url}")

modules/file_utils.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# modules/file_utils.py
+import os
+import re
+import json
+import pandas as pd
+from urllib.parse import urlparse, parse_qs
+def load_urls_from_csv(csv_file):
+    """
+    Load URLs from the first column of a header-less CSV file.
+
+    Returns the non-null values of column 0 as a list. A missing file or any
+    read/parse error is reported on stdout and yields an empty list.
+    """
+    if os.path.exists(csv_file):
+        try:
+            first_column = pd.read_csv(csv_file, header=None)[0]
+            urls = first_column.dropna().tolist()
+            print(f"✅ Loaded {len(urls)} URLs from {csv_file}")
+            return urls
+        except Exception as e:
+            print(f"❌ Error reading CSV: {e}")
+            return []
+    print(f"❌ Error: CSV file '{csv_file}' not found.")
+    return []
+def generate_filename_from_url_request(url):
+    """
+    Build the response filename for an API request URL.
+
+    The shop and item ids come from the ``shop_id`` and ``item_id`` query
+    parameters (first value of each); a missing parameter becomes "unknown".
+    Result format: ``<host>---i.<shop_id>.<item_id>.json``.
+    """
+    parts = urlparse(url)
+    params = parse_qs(parts.query)
+    shop_id, item_id = (
+        params.get(key, ["unknown"])[0] for key in ("shop_id", "item_id")
+    )
+    return f"{parts.netloc}---i.{shop_id}.{item_id}.json"
+def generate_filename_from_url(url):
+    """
+    Build the response filename for a product page URL.
+
+    Looks for ``---i.<shop_id>.<item_id>`` (both numeric) in the URL path and
+    returns ``<host>---i.<shop_id>.<item_id>.json``. When the pattern is absent
+    a warning is printed and None is returned.
+    """
+    parts = urlparse(url)
+    found = re.search(r"---i\.(\d+)\.(\d+)", parts.path)
+    if found is None:
+        print(f"⚠️ Unable to parse shop_id/item_id from URL: {url}")
+        return None
+    shop_id = found.group(1)
+    item_id = found.group(2)
+    return f"{parts.netloc}---i.{shop_id}.{item_id}.json"
+def load_existing_responses(response_dir):
+    """
+    List the entries currently present in the response directory.
+
+    Returns the directory's entry names as a set. If the directory does not
+    exist it is created and an empty set is returned.
+    """
+    if os.path.exists(response_dir):
+        return set(os.listdir(response_dir))
+    os.makedirs(response_dir)
+    return set()
+def save_response_to_file(response_text, url, response_dir):
+    """
+    Save an API response as a JSON file in ``response_dir``.
+
+    The filename is derived from ``url`` via
+    ``generate_filename_from_url_request``. An existing file is never
+    overwritten. The file is opened in exclusive-create mode, so when several
+    threads capture the same item at once exactly one of them writes it and the
+    others report a duplicate. If writing fails midway the partial file is
+    removed, so it is not later mistaken for a completed response.
+
+    Args:
+        response_text: Response body to write (text).
+        url: Request URL the response belongs to.
+        response_dir: Target directory; created if it does not exist.
+
+    Returns:
+        None. Outcomes and errors are printed, never raised.
+    """
+    filename = generate_filename_from_url_request(url)
+    if not filename:
+        print(f"⚠️ Cannot generate filename for URL: {url}")
+        return
+    file_path = os.path.join(response_dir, filename)
+    created = False
+    try:
+        if response_dir:
+            os.makedirs(response_dir, exist_ok=True)
+        try:
+            # "x" fails if the file already exists, making check and create
+            # a single atomic step.
+            f = open(file_path, "x", encoding="utf-8")
+        except FileExistsError:
+            print(f"⚠️ Skipping duplicate: {filename} (Already exists)")
+            return
+        with f:
+            created = True
+            f.write(response_text)
+        print(f"✅ Response saved: {file_path}")
+    except Exception as e:
+        print(f"❌ Error saving response for {url}: {e}")
+        if created:
+            # Only remove a file this call created itself.
+            try:
+                os.remove(file_path)
+            except OSError:
+                pass
+def find_missing_urls(csv_file, response_dir):
+    """
+    Return the CSV URLs that have no saved response yet.
+
+    A URL counts as missing when a filename can be derived from it and that
+    filename is not present in ``response_dir``. URLs for which no filename can
+    be derived are left out. Input order is preserved.
+    """
+    existing_files = load_existing_responses_for(csv_file, response_dir) if False else None
+    urls = load_urls_from_csv(csv_file)
+    existing_files = load_existing_responses(response_dir)
+
+    def _is_missing(candidate):
+        # One filename derivation per URL, as in a plain loop.
+        filename = generate_filename_from_url(candidate)
+        return bool(filename) and filename not in existing_files
+
+    missing_urls = [candidate for candidate in urls if _is_missing(candidate)]
+    print(f"🔍 {len(missing_urls)} URLs are missing from {response_dir}.")
+    return missing_urls
+def pick_random_url(csv_file, response_dir):
+    """
+    Return a randomly chosen URL that has no saved response yet.
+
+    Returns None when there are no missing URLs.
+    """
+    import random
+    candidates = find_missing_urls(csv_file, response_dir)
+    return random.choice(candidates) if candidates else None

modules/scraper_utils.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# modules/scraper_utils.py
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from modules.file_utils import load_existing_responses, find_missing_urls
+from modules.chrome_utils import scrape_url, load_resource_cache, save_resource_cache
+from config import (CSV_FILE, RESPONSE_DIR, BASE_PORT, MAX_WORKERS)
+def run_scraper():
+    """
+    Scrape every CSV URL that has no saved response yet.
+
+    Reports how many responses already exist, determines the missing URLs and
+    submits one ``scrape_url`` task per URL to a thread pool of at most
+    MAX_WORKERS threads. Task ``i`` is given debug port ``BASE_PORT + i``.
+    Exceptions raised by tasks are printed. The shared resource cache is saved
+    once all tasks have finished.
+    """
+    # Print how many responses we already have
+    existing_responses = load_existing_responses(RESPONSE_DIR)
+    print(f"Total responses in '{RESPONSE_DIR}': {len(existing_responses)}")
+    pending_urls = find_missing_urls(CSV_FILE, RESPONSE_DIR)
+    if not pending_urls:
+        print("No missing URLs to process.")
+        return
+    resource_cache = load_resource_cache()
+    pool_size = min(MAX_WORKERS, len(pending_urls))
+    with ThreadPoolExecutor(max_workers=pool_size) as executor:
+        # One task per URL, each with its own debug port offset from BASE_PORT.
+        futures = [
+            executor.submit(
+                scrape_url,
+                url,
+                BASE_PORT + offset,
+                CSV_FILE,
+                RESPONSE_DIR,
+                resource_cache,
+            )
+            for offset, url in enumerate(pending_urls)
+        ]
+        for finished in as_completed(futures):
+            try:
+                finished.result()
+            except Exception as exc:
+                print(f"[Thread] Exception: {exc}")
+    # After all threads complete, save updated resource cache
+    save_resource_cache(resource_cache)
+    print("All scraping tasks completed.")