Gemini CLI committed on
Commit
a96d5ce
·
1 Parent(s): c6def7d

Deploying project

Browse files
.env DELETED
@@ -1,7 +0,0 @@
1
- PROXY=socks5://geo.iproyal.com:51228
2
- CSV_FILE=/Users/arif/shopee_url.csv
3
- JSON_FILE=api_responses.json
4
- RESPONSE_DIR=responses
5
- RESOURCE_CACHE_FILE=resource_cache.json
6
- CHROME_PATH=/Applications/Google Chrome.app/Contents/MacOS/Google Chrome
7
- RETRIES=3
 
 
 
 
 
 
 
 
README.md DELETED
@@ -1 +0,0 @@
1
- # shopee-crawler
 
 
config.py DELETED
@@ -1,19 +0,0 @@
1
- # config.py
2
- import os
3
- from dotenv import load_dotenv
4
-
5
- # Load environment variables from .env
6
- load_dotenv()
7
-
8
- # Retrieve environment variables
9
- PROXY = os.getenv("PROXY")
10
- CSV_FILE = os.getenv("CSV_FILE")
11
- JSON_FILE = os.getenv("JSON_FILE")
12
- RESPONSE_DIR = os.getenv("RESPONSE_DIR")
13
- RESOURCE_CACHE_FILE = os.getenv("RESOURCE_CACHE_FILE")
14
- CHROME_PATH = os.getenv("CHROME_PATH")
15
- RETRIES = int(os.getenv("RETRIES", 3))
16
-
17
- # You can define additional constants or logic here
18
- MAX_WORKERS = 8
19
- BASE_PORT = 9222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py DELETED
@@ -1,5 +0,0 @@
1
- # main.py
2
- from modules.scraper_utils import run_scraper
3
-
4
- if __name__ == "__main__":
5
- run_scraper()
 
 
 
 
 
 
modules/__init__.py DELETED
File without changes
modules/chrome_utils.py DELETED
@@ -1,313 +0,0 @@
1
- import os
2
- import time
3
- import json
4
- import random
5
- import base64
6
- import subprocess
7
- import requests
8
- import mimetypes
9
- import shutil
10
- import tempfile
11
- from urllib.parse import urlparse
12
- import pychrome
13
- from config import RETRIES
14
-
15
- from modules.file_utils import save_response_to_file
16
- from config import (RESOURCE_CACHE_FILE, RESPONSE_DIR, CHROME_PATH)
17
-
18
- # Global for data usage
19
- total_network_data = 0
20
-
21
- def launch_chrome(debug_port, user_data_dir):
22
- """
23
- Launch Chrome with remote debugging and specified user data directory.
24
- """
25
- cmd = [
26
- CHROME_PATH,
27
- f"--remote-debugging-port={debug_port}",
28
- f"--user-data-dir={user_data_dir}",
29
- "--disable-web-security",
30
- "--no-first-run",
31
- # Add or remove additional flags as needed
32
- ]
33
- return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
34
-
35
- def load_resource_cache():
36
- """
37
- Load resource cache (for intercepting requests) from JSON.
38
- """
39
- if not os.path.exists(RESOURCE_CACHE_FILE):
40
- with open(RESOURCE_CACHE_FILE, "w", encoding="utf-8") as f:
41
- json.dump({}, f)
42
- return {}
43
- try:
44
- with open(RESOURCE_CACHE_FILE, "r", encoding="utf-8") as f:
45
- data = json.load(f)
46
- print(f"[CACHE] Loaded {len(data)} resources from {RESOURCE_CACHE_FILE}")
47
- return data
48
- except Exception as e:
49
- print(f"[CACHE] Error loading cache: {e}")
50
- return {}
51
-
52
- def save_resource_cache(resource_cache):
53
- """
54
- Save updated resource cache to JSON.
55
- """
56
- try:
57
- with open(RESOURCE_CACHE_FILE, "w", encoding="utf-8") as f:
58
- json.dump(resource_cache, f, indent=2)
59
- print(f"[CACHE] Saved {len(resource_cache)} resources to {RESOURCE_CACHE_FILE}")
60
- except Exception as e:
61
- print(f"[CACHE] Error saving cache: {e}")
62
-
63
- def setup_tab(browser, debug_port, resource_cache):
64
- """
65
- Create a new tab, attach PyChrome callbacks, enable network & set up intercepts.
66
- Returns the tab object.
67
- """
68
- tab = browser.new_tab()
69
- requests_dict = {}
70
-
71
- def on_request_will_be_sent(**kwargs):
72
- request_id = kwargs.get("requestId")
73
- request_obj = kwargs.get("request", {})
74
- url = request_obj.get("url", "")
75
- headers = request_obj.get("headers", {})
76
- # Remove possible undesired headers
77
- for h in ["X-Forwarded-For", "Via", "Forwarded", "X-Amzn-Trace-Id"]:
78
- headers.pop(h, None)
79
- requests_dict[request_id] = {
80
- "url": url,
81
- "fromDiskCache": False,
82
- "servedFromLocalCache": False,
83
- "status": None,
84
- "headers": {}
85
- }
86
- print(f"Request sent {request_id} => {url}")
87
-
88
- def on_response_received(**kwargs):
89
- request_id = kwargs.get("requestId")
90
- response = kwargs.get("response", {})
91
- from_disk_cache = response.get("fromDiskCache", False)
92
- if request_id in requests_dict:
93
- requests_dict[request_id]["fromDiskCache"] = from_disk_cache
94
- requests_dict[request_id]["status"] = response.get("status")
95
- requests_dict[request_id]["headers"] = response.get("headers", {})
96
-
97
- def on_loading_finished(**kwargs):
98
- nonlocal tab
99
- global total_network_data
100
-
101
- request_id = kwargs.get("requestId")
102
- if request_id not in requests_dict:
103
- return
104
-
105
- encoded_data_length = kwargs.get("encodedDataLength", 0)
106
- req_info = requests_dict[request_id]
107
- url = req_info.get("url", "")
108
- from_disk = req_info.get("fromDiskCache", False)
109
- from_local = req_info.get("servedFromLocalCache", False)
110
-
111
- if not from_disk and not from_local:
112
- total_network_data += encoded_data_length
113
- print(f"[DATA] Request {request_id} => {encoded_data_length} bytes => {url}")
114
- print(f"Total net data (not from cache): {total_network_data/1024:.2f} KB")
115
-
116
- # If this is the Shopee API for item detail:
117
- if "api/v4/pdp/get_pc" in url:
118
- try:
119
- result = tab.Network.getResponseBody(requestId=request_id)
120
- body = result.get("body", "")
121
- if result.get("base64Encoded", False):
122
- body = base64.b64decode(body).decode("utf-8", errors="replace")
123
-
124
- print("==========================================")
125
- print(f"Response for: {url}")
126
- print(body)
127
- print("==========================================\n")
128
-
129
- # Save valid responses
130
- if body.startswith('{"bff_meta":null'):
131
- save_response_to_file(body, url, RESPONSE_DIR)
132
- else:
133
- print("❌ Response does not match filter criteria. Skipping.")
134
- except Exception as e:
135
- print(f"Error retrieving body for {url}: {e}")
136
-
137
- def on_request_intercepted(interceptionId=None, authChallenge=None, request=None, **kwargs):
138
- """
139
- Intercept requests to serve from local cache or fetch them manually.
140
- """
141
- nonlocal resource_cache
142
- global total_network_data
143
-
144
- if not request:
145
- tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
146
- return
147
-
148
- request_id = kwargs.get("requestId")
149
- url = request.get("url", "")
150
- headers = request.get("headers", {})
151
- content_type = headers.get("Content-Type", "")
152
-
153
- # Handle Basic Auth or Proxy Auth
154
- if authChallenge:
155
- print("Auth challenge detected. Providing credentials (example).")
156
- tab.Network.continueInterceptedRequest(
157
- interceptionId=interceptionId,
158
- authChallengeResponse={
159
- "response": "ProvideCredentials",
160
- "username": "example_username",
161
- "password": "example_password",
162
- }
163
- )
164
- return
165
-
166
- parsed = urlparse(url)
167
- path = parsed.path.lower()
168
- exts_js = [".js", ".json"]
169
- exts_img = [".png", ".jpg", ".jpeg", ".gif", ".webp"]
170
- exts_css = [".css"]
171
- is_js = any(path.endswith(e) for e in exts_js) or "application/javascript" in content_type
172
- is_img = any(path.endswith(e) for e in exts_img) or content_type.startswith("image/")
173
- is_css = any(path.endswith(e) for e in exts_css) or content_type == "text/css"
174
-
175
- cache_this = is_js or is_img or is_css
176
-
177
- # If we want to cache static resources:
178
- if cache_this:
179
- # Cache hit
180
- if url in resource_cache:
181
- print(f"⚡ [CACHE-HIT] Request {request_id}: {url}")
182
- if request_id in requests_dict:
183
- requests_dict[request_id]["servedFromLocalCache"] = True
184
- cached_raw_response = resource_cache[url]
185
- try:
186
- tab.Network.continueInterceptedRequest(
187
- interceptionId=interceptionId,
188
- rawResponse=cached_raw_response
189
- )
190
- except Exception as e:
191
- print(f"⚠️ [CACHE ERROR] {e}")
192
- tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
193
- return
194
- # Cache miss - fetch manually
195
- else:
196
- print(f"🌐 [CACHE-MISS] {url}")
197
- try:
198
- r = requests.get(url, timeout=20)
199
- if r.status_code != 200:
200
- print(f"❌ Resource fetch status: {r.status_code}")
201
- tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
202
- return
203
- raw_data = r.content
204
- fetched_size = len(raw_data)
205
- total_network_data += fetched_size
206
- final_ct = r.headers.get("Content-Type", None)
207
- if not final_ct:
208
- guess_type, _ = mimetypes.guess_type(url)
209
- final_ct = guess_type if guess_type else "application/octet-stream"
210
- response_str = (
211
- "HTTP/1.1 200 OK\r\n"
212
- f"Content-Type: {final_ct}\r\n"
213
- "Cache-Control: public, max-age=31536000\r\n"
214
- f"Content-Length: {len(raw_data)}\r\n"
215
- "\r\n"
216
- )
217
- combined = response_str.encode("utf-8") + raw_data
218
- raw_response = base64.b64encode(combined).decode("utf-8")
219
-
220
- # Update cache
221
- resource_cache[url] = raw_response
222
- if len(resource_cache) % 20 == 0:
223
- save_resource_cache(resource_cache)
224
-
225
- if request_id in requests_dict:
226
- requests_dict[request_id]["servedFromLocalCache"] = True
227
- tab.Network.continueInterceptedRequest(
228
- interceptionId=interceptionId,
229
- rawResponse=raw_response
230
- )
231
- print("✅ Resource fetched & cached.")
232
- return
233
- except Exception as e:
234
- print(f"❌ Error fetching resource: {e}")
235
- tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
236
- return
237
- else:
238
- # Not caching, just continue
239
- tab.Network.continueInterceptedRequest(interceptionId=interceptionId, headers=headers)
240
-
241
- # Attach the callbacks
242
- tab.Network.requestWillBeSent = on_request_will_be_sent
243
- tab.Network.responseReceived = on_response_received
244
- tab.Network.loadingFinished = on_loading_finished
245
-
246
- tab.start()
247
- tab.Network.enable()
248
- # Clear cookies, caches
249
- tab.Network.clearBrowserCookies()
250
- tab.Network.clearBrowserCache()
251
- tab.Storage.clearDataForOrigin(
252
- origin="https://shopee.tw",
253
- storageTypes="all"
254
- )
255
- tab.Network.setBlockedURLs(urls=[
256
- "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.svg",
257
- ])
258
- # Intercept all requests
259
- tab.Network.setRequestInterception(patterns=[{"urlPattern": "*"}])
260
- tab.Network.requestIntercepted = on_request_intercepted
261
-
262
- return tab
263
-
264
- def scrape_url(url, debug_port, csv_file, response_dir, resource_cache):
265
- """
266
- Launch a separate Chrome instance, navigate, pick random URLs, do scraping.
267
- Retries if it fails, up to RETRIES times.
268
- """
269
- temp_dir = tempfile.mkdtemp(prefix="chrome_profile_")
270
- print(f"[Thread-{debug_port}] Launching Chrome for {url} | Profile: {temp_dir}")
271
-
272
- chrome_proc = launch_chrome(debug_port, temp_dir)
273
- time.sleep(5) # Wait for Chrome to launch
274
-
275
- browser_url = f"http://127.0.0.1:{debug_port}"
276
- attempt = 0
277
-
278
- try:
279
- while attempt < RETRIES:
280
- try:
281
- print(f"[Thread-{debug_port}] Attempt {attempt + 1}/{RETRIES} - Navigating to {url}")
282
-
283
- browser = pychrome.Browser(url=browser_url)
284
- tab = setup_tab(browser, debug_port, resource_cache)
285
-
286
- # Example random sleep before navigation
287
- time.sleep(random.uniform(20, 40))
288
- tab.Page.navigate(url=url)
289
- time.sleep(random.uniform(60, 110)) # Wait for page to load
290
-
291
- # If navigation succeeds, break the retry loop
292
- print(f"[Thread-{debug_port}] Successfully navigated to {url}")
293
- break
294
-
295
- except Exception as e:
296
- print(f"[Thread-{debug_port}] Error scraping {url}: {e}")
297
- attempt += 1
298
- if attempt < RETRIES:
299
- wait_time = random.uniform(30, 60)
300
- print(f"[Thread-{debug_port}] Retrying in {wait_time:.2f} seconds...")
301
- time.sleep(wait_time)
302
- else:
303
- print(f"[Thread-{debug_port}] ❌ Max retries reached for {url}. Skipping.")
304
-
305
- # Cleanup after finishing attempts
306
- finally:
307
- try:
308
- if chrome_proc:
309
- chrome_proc.kill()
310
- except Exception:
311
- pass
312
- shutil.rmtree(temp_dir, ignore_errors=True)
313
- print(f"[Thread-{debug_port}] Finished scraping {url}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/file_utils.py DELETED
@@ -1,89 +0,0 @@
1
- # modules/file_utils.py
2
- import os
3
- import re
4
- import json
5
- import pandas as pd
6
- from urllib.parse import urlparse, parse_qs
7
-
8
- def load_urls_from_csv(csv_file):
9
- """Read URLs from a CSV file (no header)."""
10
- if not os.path.exists(csv_file):
11
- print(f"❌ Error: CSV file '{csv_file}' not found.")
12
- return []
13
- try:
14
- df = pd.read_csv(csv_file, header=None)
15
- urls = df[0].dropna().tolist()
16
- print(f"✅ Loaded {len(urls)} URLs from {csv_file}")
17
- return urls
18
- except Exception as e:
19
- print(f"❌ Error reading CSV: {e}")
20
- return []
21
-
22
- def generate_filename_from_url_request(url):
23
- """Convert Shopee API URL to the expected filename format (shop_id, item_id)."""
24
- parsed_url = urlparse(url)
25
- domain = parsed_url.netloc
26
- query_params = parse_qs(parsed_url.query)
27
- item_id = query_params.get("item_id", ["unknown"])[0]
28
- shop_id = query_params.get("shop_id", ["unknown"])[0]
29
- filename = f"{domain}---i.{shop_id}.{item_id}.json"
30
- return filename
31
-
32
- def generate_filename_from_url(url):
33
- """Extract filename from the pattern '---i.shop_id.item_id' in the path."""
34
- parsed_url = urlparse(url)
35
- domain = parsed_url.netloc
36
- match = re.search(r"---i\.(\d+)\.(\d+)", parsed_url.path)
37
- if match:
38
- shop_id, item_id = match.groups()
39
- return f"{domain}---i.{shop_id}.{item_id}.json"
40
- else:
41
- print(f"⚠️ Unable to parse shop_id/item_id from URL: {url}")
42
- return None
43
-
44
- def load_existing_responses(response_dir):
45
- """Return a set of filenames already in the response directory."""
46
- if not os.path.exists(response_dir):
47
- os.makedirs(response_dir)
48
- return set()
49
- return set(os.listdir(response_dir))
50
-
51
- def save_response_to_file(response_text, url, response_dir):
52
- """
53
- Save API response as a JSON file in the response_dir.
54
- Skips if file already exists.
55
- """
56
- filename = generate_filename_from_url_request(url)
57
- if not filename:
58
- print(f"⚠️ Cannot generate filename for URL: {url}")
59
- return
60
- file_path = os.path.join(response_dir, filename)
61
- if os.path.exists(file_path):
62
- print(f"⚠️ Skipping duplicate: {filename} (Already exists)")
63
- return
64
- try:
65
- with open(file_path, "w", encoding="utf-8") as f:
66
- f.write(response_text)
67
- print(f"✅ Response saved: {file_path}")
68
- except Exception as e:
69
- print(f"❌ Error saving response for {url}: {e}")
70
-
71
- def find_missing_urls(csv_file, response_dir):
72
- """Find CSV URLs that do not yet exist in the response_dir."""
73
- urls = load_urls_from_csv(csv_file)
74
- existing_files = load_existing_responses(response_dir)
75
- missing_urls = []
76
- for url in urls:
77
- filename = generate_filename_from_url(url)
78
- if filename and filename not in existing_files:
79
- missing_urls.append(url)
80
- print(f"🔍 {len(missing_urls)} URLs are missing from {response_dir}.")
81
- return missing_urls
82
-
83
- def pick_random_url(csv_file, response_dir):
84
- """Pick one random URL from those that are missing."""
85
- import random
86
- missing = find_missing_urls(csv_file, response_dir)
87
- if missing:
88
- return random.choice(missing)
89
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/scraper_utils.py DELETED
@@ -1,46 +0,0 @@
1
- # modules/scraper_utils.py
2
- import time
3
- from concurrent.futures import ThreadPoolExecutor, as_completed
4
- from modules.file_utils import load_existing_responses, find_missing_urls
5
- from modules.chrome_utils import scrape_url, load_resource_cache, save_resource_cache
6
- from config import (CSV_FILE, RESPONSE_DIR, BASE_PORT, MAX_WORKERS)
7
-
8
- def run_scraper():
9
- # Print how many responses we already have
10
- existing_responses = load_existing_responses(RESPONSE_DIR)
11
- print(f"Total responses in '{RESPONSE_DIR}': {len(existing_responses)}")
12
-
13
- urls_to_scrape = find_missing_urls(CSV_FILE, RESPONSE_DIR)
14
- if not urls_to_scrape:
15
- print("No missing URLs to process.")
16
- return
17
-
18
- resource_cache = load_resource_cache()
19
-
20
- max_workers = min(MAX_WORKERS, len(urls_to_scrape))
21
- base_port = BASE_PORT
22
-
23
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
24
- futures = []
25
- for i, url in enumerate(urls_to_scrape):
26
- debug_port = base_port + i
27
- # Check if we already have a file for this URL
28
- # to skip duplicates if you want:
29
- futures.append(executor.submit(
30
- scrape_url,
31
- url,
32
- debug_port,
33
- CSV_FILE,
34
- RESPONSE_DIR,
35
- resource_cache
36
- ))
37
-
38
- for future in as_completed(futures):
39
- try:
40
- future.result()
41
- except Exception as exc:
42
- print(f"[Thread] Exception: {exc}")
43
-
44
- # After all threads complete, save updated resource cache
45
- save_resource_cache(resource_cache)
46
- print("All scraping tasks completed.")