Gemini CLI committed on
Commit ·
a96d5ce
1
Parent(s): c6def7d
Deploying project
Browse files- .env +0 -7
- README.md +0 -1
- config.py +0 -19
- main.py +0 -5
- modules/__init__.py +0 -0
- modules/chrome_utils.py +0 -313
- modules/file_utils.py +0 -89
- modules/scraper_utils.py +0 -46
.env
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
PROXY=socks5://geo.iproyal.com:51228
|
| 2 |
-
CSV_FILE=/Users/arif/shopee_url.csv
|
| 3 |
-
JSON_FILE=api_responses.json
|
| 4 |
-
RESPONSE_DIR=responses
|
| 5 |
-
RESOURCE_CACHE_FILE=resource_cache.json
|
| 6 |
-
CHROME_PATH=/Applications/Google Chrome.app/Contents/MacOS/Google Chrome
|
| 7 |
-
RETRIES=3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# shopee-crawler
|
|
|
|
|
|
config.py
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
# config.py
|
| 2 |
-
import os
|
| 3 |
-
from dotenv import load_dotenv
|
| 4 |
-
|
| 5 |
-
# Load environment variables from .env
load_dotenv()

# Retrieve environment variables.
# Each of these is None when the variable is not set.
PROXY = os.getenv("PROXY")
CSV_FILE = os.getenv("CSV_FILE")
JSON_FILE = os.getenv("JSON_FILE")
RESPONSE_DIR = os.getenv("RESPONSE_DIR")
RESOURCE_CACHE_FILE = os.getenv("RESOURCE_CACHE_FILE")
CHROME_PATH = os.getenv("CHROME_PATH")

# RETRIES defaults to 3. A value that is present but not an integer
# (for example an empty "RETRIES=" line) would otherwise raise ValueError
# while this module is being imported, so fall back to the default instead.
try:
    RETRIES = int(os.getenv("RETRIES", 3))
except ValueError:
    print("Invalid RETRIES value in environment; falling back to 3.")
    RETRIES = 3

# You can define additional constants or logic here
MAX_WORKERS = 8
BASE_PORT = 9222
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
# main.py
|
| 2 |
-
from modules.scraper_utils import run_scraper
|
| 3 |
-
|
| 4 |
-
# Start the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_scraper()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modules/__init__.py
DELETED
|
File without changes
|
modules/chrome_utils.py
DELETED
|
@@ -1,313 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import time
|
| 3 |
-
import json
|
| 4 |
-
import random
|
| 5 |
-
import base64
|
| 6 |
-
import subprocess
|
| 7 |
-
import requests
|
| 8 |
-
import mimetypes
|
| 9 |
-
import shutil
|
| 10 |
-
import tempfile
|
| 11 |
-
from urllib.parse import urlparse
|
| 12 |
-
import pychrome
|
| 13 |
-
from config import RETRIES
|
| 14 |
-
|
| 15 |
-
from modules.file_utils import save_response_to_file
|
| 16 |
-
from config import (RESOURCE_CACHE_FILE, RESPONSE_DIR, CHROME_PATH)
|
| 17 |
-
|
| 18 |
-
# Global for data usage
# Running byte counter; the network callbacks defined in setup_tab() add to it
# via `global total_network_data`.
total_network_data = 0
|
| 20 |
-
|
| 21 |
-
def launch_chrome(debug_port, user_data_dir):
    """
    Launch Chrome with remote debugging and a dedicated user data directory.

    Args:
        debug_port: Port passed to ``--remote-debugging-port``.
        user_data_dir: Profile directory passed to ``--user-data-dir``.

    Returns:
        The ``subprocess.Popen`` handle of the started Chrome process.
    """
    cmd = [
        CHROME_PATH,
        f"--remote-debugging-port={debug_port}",
        f"--user-data-dir={user_data_dir}",
        "--disable-web-security",
        "--no-first-run",
        # Add or remove additional flags as needed
    ]
    # Chrome's output is never read by this module. Piping it without a
    # reader lets the OS pipe buffer fill up and block the child process,
    # so the output is discarded instead.
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
|
| 34 |
-
|
| 35 |
-
def load_resource_cache():
    """
    Load resource cache (for intercepting requests) from JSON.

    Reads ``RESOURCE_CACHE_FILE``. When the file does not exist, an empty
    cache file is created. Any read/parse problem, or a payload that is not
    a JSON object, is reported and results in an empty cache.

    Returns:
        dict: The cached entries, or ``{}`` when nothing usable was loaded.
    """
    try:
        with open(RESOURCE_CACHE_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run: seed an empty cache file so later saves have a target.
        try:
            with open(RESOURCE_CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump({}, f)
        except OSError as e:
            print(f"[CACHE] Error loading cache: {e}")
        return {}
    except Exception as e:
        print(f"[CACHE] Error loading cache: {e}")
        return {}

    # The cache is used as a mapping elsewhere in this module; reject other
    # JSON shapes here rather than failing on the first lookup.
    if not isinstance(data, dict):
        print(
            "[CACHE] Error loading cache: expected a JSON object, "
            f"got {type(data).__name__}"
        )
        return {}

    print(f"[CACHE] Loaded {len(data)} resources from {RESOURCE_CACHE_FILE}")
    return data
|
| 51 |
-
|
| 52 |
-
def save_resource_cache(resource_cache):
    """
    Save updated resource cache to JSON.

    The cache is copied first and written to a temporary file next to
    ``RESOURCE_CACHE_FILE``, which then replaces the target. A failure while
    serializing therefore leaves the previous cache file intact instead of
    a truncated one. Errors are reported, not raised.

    Args:
        resource_cache: Mapping of URL to cached raw response string.
    """
    try:
        # Serialize a copy: the live dict may gain entries from request
        # callbacks while json.dump is iterating it.
        snapshot = dict(resource_cache)
        target_dir = os.path.dirname(os.path.abspath(RESOURCE_CACHE_FILE))
        fd, tmp_path = tempfile.mkstemp(
            dir=target_dir, prefix=".resource_cache_", suffix=".tmp"
        )
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(snapshot, f, indent=2)
            os.replace(tmp_path, RESOURCE_CACHE_FILE)
        except BaseException:
            # Do not leave the temporary file behind on failure.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise
        print(f"[CACHE] Saved {len(snapshot)} resources to {RESOURCE_CACHE_FILE}")
    except Exception as e:
        print(f"[CACHE] Error saving cache: {e}")
|
| 62 |
-
|
| 63 |
-
def setup_tab(browser, debug_port, resource_cache):
    """
    Create a new tab, attach PyChrome callbacks, enable network & set up intercepts.
    Returns the tab object.

    Args:
        browser: Object providing ``new_tab()`` (a ``pychrome.Browser`` at the
            call site in this module).
        debug_port: Not used inside this function.
        resource_cache: Dict mapping URL -> base64-encoded raw HTTP response.
            It is read on cache hits and updated in place on cache misses.

    Returns:
        The started tab with Network events enabled and request
        interception configured.
    """
    tab = browser.new_tab()
    # Per-tab bookkeeping: requestId -> metadata collected from network events.
    requests_dict = {}

    def on_request_will_be_sent(**kwargs):
        """Record a new request in requests_dict and log it."""
        request_id = kwargs.get("requestId")
        request_obj = kwargs.get("request", {})
        url = request_obj.get("url", "")
        headers = request_obj.get("headers", {})
        # Remove possible undesired headers
        # NOTE(review): this only edits the headers dict of the event payload;
        # nothing in this callback sends the modified headers back to Chrome.
        for h in ["X-Forwarded-For", "Via", "Forwarded", "X-Amzn-Trace-Id"]:
            headers.pop(h, None)
        requests_dict[request_id] = {
            "url": url,
            "fromDiskCache": False,
            "servedFromLocalCache": False,
            "status": None,
            "headers": {}
        }
        print(f"Request sent {request_id} => {url}")

    def on_response_received(**kwargs):
        """Copy cache flag, status and headers of a response onto its request entry."""
        request_id = kwargs.get("requestId")
        response = kwargs.get("response", {})
        from_disk_cache = response.get("fromDiskCache", False)
        # Responses for requests that were never recorded are ignored.
        if request_id in requests_dict:
            requests_dict[request_id]["fromDiskCache"] = from_disk_cache
            requests_dict[request_id]["status"] = response.get("status")
            requests_dict[request_id]["headers"] = response.get("headers", {})

    def on_loading_finished(**kwargs):
        """Account transferred bytes and capture the item-detail API response body."""
        # `tab` is only read here, never rebound.
        nonlocal tab
        global total_network_data

        request_id = kwargs.get("requestId")
        if request_id not in requests_dict:
            return

        encoded_data_length = kwargs.get("encodedDataLength", 0)
        req_info = requests_dict[request_id]
        url = req_info.get("url", "")
        from_disk = req_info.get("fromDiskCache", False)
        from_local = req_info.get("servedFromLocalCache", False)

        # Only count bytes for requests not flagged as served from a cache.
        if not from_disk and not from_local:
            total_network_data += encoded_data_length
            print(f"[DATA] Request {request_id} => {encoded_data_length} bytes => {url}")
            print(f"Total net data (not from cache): {total_network_data/1024:.2f} KB")

        # If this is the Shopee API for item detail:
        if "api/v4/pdp/get_pc" in url:
            try:
                result = tab.Network.getResponseBody(requestId=request_id)
                body = result.get("body", "")
                # The body may arrive base64-encoded; decode it to text,
                # replacing undecodable bytes.
                if result.get("base64Encoded", False):
                    body = base64.b64decode(body).decode("utf-8", errors="replace")

                print("==========================================")
                print(f"Response for: {url}")
                print(body)
                print("==========================================\n")

                # Save valid responses
                # A body is treated as valid when it starts with this exact prefix.
                if body.startswith('{"bff_meta":null'):
                    save_response_to_file(body, url, RESPONSE_DIR)
                else:
                    print("❌ Response does not match filter criteria. Skipping.")
            except Exception as e:
                print(f"Error retrieving body for {url}: {e}")

    def on_request_intercepted(interceptionId=None, authChallenge=None, request=None, **kwargs):
        """
        Intercept requests to serve from local cache or fetch them manually.

        JS/JSON, image and CSS requests are answered from resource_cache when
        present; otherwise they are downloaded with ``requests.get``, stored in
        the cache and returned to the browser as a raw response. All other
        requests are continued with their original headers.
        """
        # `resource_cache` is mutated in place below, never rebound.
        nonlocal resource_cache
        global total_network_data

        # No request payload: let the browser proceed untouched.
        if not request:
            tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
            return

        request_id = kwargs.get("requestId")
        url = request.get("url", "")
        headers = request.get("headers", {})
        # NOTE(review): this is the Content-Type header of the *request*,
        # which is what the classification below relies on.
        content_type = headers.get("Content-Type", "")

        # Handle Basic Auth or Proxy Auth
        # NOTE(review): the credentials below are hard-coded placeholder values.
        if authChallenge:
            print("Auth challenge detected. Providing credentials (example).")
            tab.Network.continueInterceptedRequest(
                interceptionId=interceptionId,
                authChallengeResponse={
                    "response": "ProvideCredentials",
                    "username": "example_username",
                    "password": "example_password",
                }
            )
            return

        # Classify the request by URL path extension or request Content-Type.
        parsed = urlparse(url)
        path = parsed.path.lower()
        exts_js = [".js", ".json"]
        exts_img = [".png", ".jpg", ".jpeg", ".gif", ".webp"]
        exts_css = [".css"]
        is_js = any(path.endswith(e) for e in exts_js) or "application/javascript" in content_type
        is_img = any(path.endswith(e) for e in exts_img) or content_type.startswith("image/")
        is_css = any(path.endswith(e) for e in exts_css) or content_type == "text/css"

        cache_this = is_js or is_img or is_css

        # If we want to cache static resources:
        if cache_this:
            # Cache hit
            if url in resource_cache:
                print(f"⚡ [CACHE-HIT] Request {request_id}: {url}")
                # Flag the request so on_loading_finished does not count its bytes.
                if request_id in requests_dict:
                    requests_dict[request_id]["servedFromLocalCache"] = True
                cached_raw_response = resource_cache[url]
                try:
                    tab.Network.continueInterceptedRequest(
                        interceptionId=interceptionId,
                        rawResponse=cached_raw_response
                    )
                except Exception as e:
                    # Serving the cached response failed: fall back to the network.
                    print(f"⚠️ [CACHE ERROR] {e}")
                    tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
                return
            # Cache miss - fetch manually
            else:
                print(f"🌐 [CACHE-MISS] {url}")
                try:
                    # NOTE(review): a plain GET without the intercepted request's
                    # headers or method.
                    r = requests.get(url, timeout=20)
                    if r.status_code != 200:
                        print(f"❌ Resource fetch status: {r.status_code}")
                        tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
                        return
                    raw_data = r.content
                    fetched_size = len(raw_data)
                    # Manually fetched bytes are added to the global counter here.
                    total_network_data += fetched_size
                    final_ct = r.headers.get("Content-Type", None)
                    # No Content-Type from the server: guess from the URL,
                    # else use a generic binary type.
                    if not final_ct:
                        guess_type, _ = mimetypes.guess_type(url)
                        final_ct = guess_type if guess_type else "application/octet-stream"
                    # Build a complete raw HTTP response (status line, headers, body).
                    response_str = (
                        "HTTP/1.1 200 OK\r\n"
                        f"Content-Type: {final_ct}\r\n"
                        "Cache-Control: public, max-age=31536000\r\n"
                        f"Content-Length: {len(raw_data)}\r\n"
                        "\r\n"
                    )
                    combined = response_str.encode("utf-8") + raw_data
                    # rawResponse is passed base64-encoded.
                    raw_response = base64.b64encode(combined).decode("utf-8")

                    # Update cache
                    resource_cache[url] = raw_response
                    # Persist whenever the cache size is a multiple of 20.
                    if len(resource_cache) % 20 == 0:
                        save_resource_cache(resource_cache)

                    if request_id in requests_dict:
                        requests_dict[request_id]["servedFromLocalCache"] = True
                    tab.Network.continueInterceptedRequest(
                        interceptionId=interceptionId,
                        rawResponse=raw_response
                    )
                    print("✅ Resource fetched & cached.")
                    return
                except Exception as e:
                    # Any failure in the manual fetch path: let Chrome load it normally.
                    print(f"❌ Error fetching resource: {e}")
                    tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
                    return
        else:
            # Not caching, just continue
            tab.Network.continueInterceptedRequest(interceptionId=interceptionId, headers=headers)

    # Attach the callbacks
    tab.Network.requestWillBeSent = on_request_will_be_sent
    tab.Network.responseReceived = on_response_received
    tab.Network.loadingFinished = on_loading_finished

    tab.start()
    tab.Network.enable()
    # Clear cookies, caches
    tab.Network.clearBrowserCookies()
    tab.Network.clearBrowserCache()
    tab.Storage.clearDataForOrigin(
        origin="https://shopee.tw",
        storageTypes="all"
    )
    # Block image URLs by file-extension pattern.
    tab.Network.setBlockedURLs(urls=[
        "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.svg",
    ])
    # Intercept all requests
    # NOTE(review): the requestIntercepted handler is assigned after
    # interception is switched on.
    tab.Network.setRequestInterception(patterns=[{"urlPattern": "*"}])
    tab.Network.requestIntercepted = on_request_intercepted

    return tab
|
| 263 |
-
|
| 264 |
-
def _close_tab(browser, tab, debug_port):
    """Stop *tab* and close it in *browser*; failures are logged, not raised."""
    if tab is None:
        return
    try:
        tab.stop()
        browser.close_tab(tab)
    except Exception as e:
        print(f"[Thread-{debug_port}] Error closing tab: {e}")


def scrape_url(url, debug_port, csv_file, response_dir, resource_cache):
    """
    Launch a separate Chrome instance, navigate to *url* and let the tab
    callbacks capture the responses.
    Retries if it fails, up to RETRIES times.

    Args:
        url: Page URL to open.
        debug_port: Remote-debugging port for this Chrome instance; also used
            as the log prefix.
        csv_file: Unused here; kept for call-site compatibility.
        response_dir: Unused here; kept for call-site compatibility.
        resource_cache: Shared URL -> raw response cache handed to setup_tab().

    The temporary Chrome profile, the tab and the Chrome process are always
    cleaned up, including when launching Chrome itself fails.
    """
    temp_dir = tempfile.mkdtemp(prefix="chrome_profile_")
    print(f"[Thread-{debug_port}] Launching Chrome for {url} | Profile: {temp_dir}")

    chrome_proc = None
    browser = None
    tab = None
    browser_url = f"http://127.0.0.1:{debug_port}"
    attempt = 0

    try:
        # Launch inside the try block so the profile directory is removed
        # even if Chrome cannot be started.
        chrome_proc = launch_chrome(debug_port, temp_dir)
        time.sleep(5)  # Wait for Chrome to launch

        while attempt < RETRIES:
            try:
                print(f"[Thread-{debug_port}] Attempt {attempt + 1}/{RETRIES} - Navigating to {url}")

                browser = pychrome.Browser(url=browser_url)
                tab = setup_tab(browser, debug_port, resource_cache)

                # Example random sleep before navigation
                time.sleep(random.uniform(20, 40))
                tab.Page.navigate(url=url)
                time.sleep(random.uniform(60, 110))  # Wait for page to load

                # If navigation succeeds, break the retry loop
                print(f"[Thread-{debug_port}] Successfully navigated to {url}")
                break

            except Exception as e:
                print(f"[Thread-{debug_port}] Error scraping {url}: {e}")
                # Release the tab of the failed attempt before retrying so
                # tabs do not pile up in the same Chrome instance.
                _close_tab(browser, tab, debug_port)
                tab = None
                attempt += 1
                if attempt < RETRIES:
                    wait_time = random.uniform(30, 60)
                    print(f"[Thread-{debug_port}] Retrying in {wait_time:.2f} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"[Thread-{debug_port}] ❌ Max retries reached for {url}. Skipping.")

    # Cleanup after finishing attempts
    finally:
        _close_tab(browser, tab, debug_port)
        if chrome_proc is not None:
            try:
                chrome_proc.kill()
                # Reap the process so it does not linger as a zombie and has
                # released the profile directory before it is deleted.
                chrome_proc.wait(timeout=10)
            except Exception as e:
                print(f"[Thread-{debug_port}] Error stopping Chrome: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"[Thread-{debug_port}] Finished scraping {url}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modules/file_utils.py
DELETED
|
@@ -1,89 +0,0 @@
|
|
| 1 |
-
# modules/file_utils.py
|
| 2 |
-
import os
|
| 3 |
-
import re
|
| 4 |
-
import json
|
| 5 |
-
import pandas as pd
|
| 6 |
-
from urllib.parse import urlparse, parse_qs
|
| 7 |
-
|
| 8 |
-
def load_urls_from_csv(csv_file):
    """Read URLs from the first column of a CSV file (no header).

    Cells are read as text and stripped of surrounding whitespace; missing
    and blank cells are skipped.

    Args:
        csv_file: Path of the CSV file.

    Returns:
        list[str]: The URLs, or ``[]`` when the file is missing or unreadable.
    """
    if not os.path.exists(csv_file):
        print(f"❌ Error: CSV file '{csv_file}' not found.")
        return []
    try:
        # dtype=str keeps numeric-looking cells as text so callers can always
        # treat the values as URL strings.
        df = pd.read_csv(csv_file, header=None, dtype=str)
        stripped = (str(cell).strip() for cell in df[0].dropna())
        urls = [cell for cell in stripped if cell]
        print(f"✅ Loaded {len(urls)} URLs from {csv_file}")
        return urls
    except Exception as e:
        print(f"❌ Error reading CSV: {e}")
        return []
|
| 21 |
-
|
| 22 |
-
def generate_filename_from_url_request(url):
    """Convert Shopee API URL to the expected filename format (shop_id, item_id).

    The ids are taken from the ``shop_id`` and ``item_id`` query parameters.
    A parameter that is missing or not purely ASCII digits is replaced by
    ``"unknown"``, so no path separators or other unexpected characters from
    the URL can end up in the filename.

    Args:
        url: The API request URL.

    Returns:
        str: ``"<domain>---i.<shop_id>.<item_id>.json"``.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    query_params = parse_qs(parsed_url.query)

    def _numeric_id(param):
        # Same numeric-id assumption as the "---i.<shop>.<item>" page-URL pattern.
        value = query_params.get(param, ["unknown"])[0]
        return value if re.fullmatch(r"[0-9]+", value) else "unknown"

    item_id = _numeric_id("item_id")
    shop_id = _numeric_id("shop_id")
    filename = f"{domain}---i.{shop_id}.{item_id}.json"
    return filename
|
| 31 |
-
|
| 32 |
-
def generate_filename_from_url(url):
    """Extract filename from the pattern '---i.shop_id.item_id' in the path.

    The input is converted to ``str`` and stripped of surrounding whitespace
    before parsing, so values read from a CSV cell do not break URL parsing.

    Args:
        url: Product page URL.

    Returns:
        str | None: ``"<domain>---i.<shop_id>.<item_id>.json"``, or ``None``
        (after printing a warning) when the path does not contain the pattern.
    """
    parsed_url = urlparse(str(url).strip())
    domain = parsed_url.netloc
    match = re.search(r"---i\.(\d+)\.(\d+)", parsed_url.path)
    if not match:
        print(f"⚠️ Unable to parse shop_id/item_id from URL: {url}")
        return None
    shop_id, item_id = match.groups()
    return f"{domain}---i.{shop_id}.{item_id}.json"
|
| 43 |
-
|
| 44 |
-
def load_existing_responses(response_dir):
    """Return a set of filenames already in the response directory.

    The directory (including missing parents) is created when it does not
    exist yet, in which case the result is an empty set.

    Args:
        response_dir: Directory holding the saved response files.

    Returns:
        set[str]: Names of the entries in ``response_dir``.
    """
    # exist_ok avoids the check-then-create race when several workers
    # reach this point at the same time.
    os.makedirs(response_dir, exist_ok=True)
    return set(os.listdir(response_dir))
|
| 50 |
-
|
| 51 |
-
def save_response_to_file(response_text, url, response_dir):
    """
    Save API response as a JSON file in the response_dir.
    Skips if file already exists.

    Args:
        response_text: Response body to write, as text.
        url: API request URL; the filename is derived from it.
        response_dir: Target directory; created if missing.

    Errors are printed, not raised.
    """
    filename = generate_filename_from_url_request(url)
    if not filename:
        print(f"⚠️ Cannot generate filename for URL: {url}")
        return
    file_path = os.path.join(response_dir, filename)
    try:
        os.makedirs(response_dir, exist_ok=True)
        try:
            # Exclusive creation makes "skip if it exists" a single atomic
            # step, so two workers cannot both write the same file.
            with open(file_path, "x", encoding="utf-8") as f:
                f.write(response_text)
        except FileExistsError:
            print(f"⚠️ Skipping duplicate: {filename} (Already exists)")
            return
        print(f"✅ Response saved: {file_path}")
    except Exception as e:
        print(f"❌ Error saving response for {url}: {e}")
|
| 70 |
-
|
| 71 |
-
def find_missing_urls(csv_file, response_dir):
    """Return the CSV URLs whose response file is not yet in response_dir.

    URLs for which no filename can be derived are left out.
    """
    candidates = load_urls_from_csv(csv_file)
    already_saved = load_existing_responses(response_dir)
    pending = []
    for candidate in candidates:
        expected_name = generate_filename_from_url(candidate)
        if not expected_name:
            continue
        if expected_name in already_saved:
            continue
        pending.append(candidate)
    print(f"🔍 {len(pending)} URLs are missing from {response_dir}.")
    return pending
|
| 82 |
-
|
| 83 |
-
def pick_random_url(csv_file, response_dir):
    """Return a randomly chosen missing URL, or None when nothing is missing."""
    import random
    candidates = find_missing_urls(csv_file, response_dir)
    return random.choice(candidates) if candidates else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modules/scraper_utils.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
# modules/scraper_utils.py
|
| 2 |
-
import time
|
| 3 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 4 |
-
from modules.file_utils import load_existing_responses, find_missing_urls
|
| 5 |
-
from modules.chrome_utils import scrape_url, load_resource_cache, save_resource_cache
|
| 6 |
-
from config import (CSV_FILE, RESPONSE_DIR, BASE_PORT, MAX_WORKERS)
|
| 7 |
-
|
| 8 |
-
def run_scraper():
    """
    Scrape every CSV URL that has no saved response yet.

    Each URL is submitted to a thread pool of at most MAX_WORKERS workers and
    gets its own debug port (BASE_PORT + index). The shared resource cache is
    saved when the pool is done, including when the run is interrupted or
    fails part-way.
    """
    # Print how many responses we already have
    existing_responses = load_existing_responses(RESPONSE_DIR)
    print(f"Total responses in '{RESPONSE_DIR}': {len(existing_responses)}")

    urls_to_scrape = find_missing_urls(CSV_FILE, RESPONSE_DIR)
    if not urls_to_scrape:
        print("No missing URLs to process.")
        return

    resource_cache = load_resource_cache()

    max_workers = min(MAX_WORKERS, len(urls_to_scrape))
    base_port = BASE_PORT

    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # NOTE(review): ports grow with the URL index and are never
            # reused; very long URL lists could exceed the valid port range.
            futures = [
                executor.submit(
                    scrape_url,
                    url,
                    base_port + i,
                    CSV_FILE,
                    RESPONSE_DIR,
                    resource_cache,
                )
                for i, url in enumerate(urls_to_scrape)
            ]

            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    print(f"[Thread] Exception: {exc}")
    finally:
        # Save in `finally` so resources fetched so far are kept even when
        # the run is interrupted (e.g. Ctrl+C) before all tasks complete.
        save_resource_cache(resource_cache)

    print("All scraping tasks completed.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|