Arif committed on
Commit
c6def7d
·
1 Parent(s): 2c9bf12

Added scraper

Browse files
.env ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ PROXY=socks5://geo.iproyal.com:51228
2
+ CSV_FILE=/Users/arif/shopee_url.csv
3
+ JSON_FILE=api_responses.json
4
+ RESPONSE_DIR=responses
5
+ RESOURCE_CACHE_FILE=resource_cache.json
6
+ CHROME_PATH=/Applications/Google Chrome.app/Contents/MacOS/Google Chrome
7
+ RETRIES=3
README.md CHANGED
@@ -1,2 +1 @@
1
  # shopee-crawler
2
- # shopee-crawler
 
1
  # shopee-crawler
 
config.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config.py
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables from .env
6
+ load_dotenv()
7
+
8
# Settings read from the process environment (load_dotenv() above has already
# merged the .env file into it). A value is None when its variable is unset.
PROXY = os.environ.get("PROXY")
CSV_FILE = os.environ.get("CSV_FILE")
JSON_FILE = os.environ.get("JSON_FILE")
RESPONSE_DIR = os.environ.get("RESPONSE_DIR")
RESOURCE_CACHE_FILE = os.environ.get("RESOURCE_CACHE_FILE")
CHROME_PATH = os.environ.get("CHROME_PATH")
# Converted to int; falls back to 3 when RETRIES is not set.
RETRIES = int(os.environ.get("RETRIES", 3))

# Constants that are not driven by the environment.
MAX_WORKERS = 8
BASE_PORT = 9222
main.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # main.py
2
+ from modules.scraper_utils import run_scraper
3
+
4
# Start the scraper only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    run_scraper()
modules/__init__.py ADDED
File without changes
modules/chrome_utils.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import json
4
+ import random
5
+ import base64
6
+ import subprocess
7
+ import requests
8
+ import mimetypes
9
+ import shutil
10
+ import tempfile
11
+ from urllib.parse import urlparse
12
+ import pychrome
13
+ from config import RETRIES
14
+
15
+ from modules.file_utils import save_response_to_file
16
+ from config import (RESOURCE_CACHE_FILE, RESPONSE_DIR, CHROME_PATH)
17
+
18
+ # Global for data usage
19
+ total_network_data = 0
20
+
21
def launch_chrome(debug_port, user_data_dir):
    """
    Launch Chrome with remote debugging and specified user data directory.

    Args:
        debug_port: Port number passed to ``--remote-debugging-port``.
        user_data_dir: Profile directory passed to ``--user-data-dir``.

    Returns:
        The ``subprocess.Popen`` handle of the started Chrome process.
    """
    cmd = [
        CHROME_PATH,
        f"--remote-debugging-port={debug_port}",
        f"--user-data-dir={user_data_dir}",
        "--disable-web-security",
        "--no-first-run",
        # Add or remove additional flags as needed
    ]
    # Nothing in this module reads Chrome's stdout/stderr. Capturing them with
    # PIPE and never draining would let the child block once the OS pipe
    # buffer fills, so the output is discarded instead.
    return subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
34
+
35
def load_resource_cache():
    """
    Read the intercepted-resource cache from RESOURCE_CACHE_FILE.

    When the file does not exist yet it is created containing an empty JSON
    object and an empty dict is returned. Any error while reading or parsing
    an existing file is printed and also results in an empty dict.
    """
    if os.path.exists(RESOURCE_CACHE_FILE):
        try:
            with open(RESOURCE_CACHE_FILE, "r", encoding="utf-8") as handle:
                cached = json.load(handle)
            print(f"[CACHE] Loaded {len(cached)} resources from {RESOURCE_CACHE_FILE}")
        except Exception as e:
            print(f"[CACHE] Error loading cache: {e}")
            return {}
        return cached

    # No cache file yet: seed it with an empty object.
    with open(RESOURCE_CACHE_FILE, "w", encoding="utf-8") as handle:
        json.dump({}, handle)
    return {}
51
+
52
def save_resource_cache(resource_cache):
    """
    Save updated resource cache to JSON.

    The cache is serialised from a snapshot and written to a temporary file
    in the same directory, which is then moved over RESOURCE_CACHE_FILE with
    ``os.replace``. A failure part-way through therefore leaves the previous
    cache file intact instead of a truncated one.

    Args:
        resource_cache: Mapping of URL to cached raw response (JSON-serialisable).

    Errors are printed, never raised.
    """
    tmp_path = None
    try:
        # Snapshot first: worker threads may insert into the shared dict while
        # it is being serialised.
        payload = json.dumps(dict(resource_cache), indent=2)

        # The temp file must live next to the target so os.replace stays on
        # one filesystem.
        target_dir = os.path.dirname(os.path.abspath(RESOURCE_CACHE_FILE))
        fd, tmp_path = tempfile.mkstemp(
            dir=target_dir, prefix=".resource_cache_", suffix=".tmp"
        )
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(payload)

        os.replace(tmp_path, RESOURCE_CACHE_FILE)
        tmp_path = None  # moved into place; nothing left to clean up
        print(f"[CACHE] Saved {len(resource_cache)} resources to {RESOURCE_CACHE_FILE}")
    except Exception as e:
        print(f"[CACHE] Error saving cache: {e}")
    finally:
        if tmp_path is not None:
            # Best-effort removal of a leftover temp file after a failure.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
62
+
63
def setup_tab(browser, debug_port, resource_cache):
    """
    Create a new tab, attach PyChrome callbacks, enable network & set up intercepts.

    Args:
        browser: Object providing ``new_tab()`` (a ``pychrome.Browser`` in this project).
        debug_port: Not used inside this function; kept so existing callers keep working.
        resource_cache: Dict mapping a resource URL to its base64-encoded raw
            HTTP response. It is read on cache hits and updated in place on
            cache misses.

    Returns the tab object.
    """
    tab = browser.new_tab()
    # Per-tab bookkeeping, keyed by the requestId reported in the events.
    requests_dict = {}

    def on_request_will_be_sent(**kwargs):
        request_id = kwargs.get("requestId")
        request_obj = kwargs.get("request", {})
        url = request_obj.get("url", "")
        headers = request_obj.get("headers", {})
        # Remove possible undesired headers
        # NOTE(review): this only mutates the dict handed to this callback and
        # the result is not used afterwards, so it does not appear to change
        # what is actually sent - confirm the intent.
        for h in ["X-Forwarded-For", "Via", "Forwarded", "X-Amzn-Trace-Id"]:
            headers.pop(h, None)
        requests_dict[request_id] = {
            "url": url,
            "fromDiskCache": False,
            "servedFromLocalCache": False,
            "status": None,
            "headers": {}
        }
        print(f"Request sent {request_id} => {url}")

    def on_response_received(**kwargs):
        request_id = kwargs.get("requestId")
        response = kwargs.get("response", {})
        from_disk_cache = response.get("fromDiskCache", False)
        if request_id in requests_dict:
            requests_dict[request_id]["fromDiskCache"] = from_disk_cache
            requests_dict[request_id]["status"] = response.get("status")
            requests_dict[request_id]["headers"] = response.get("headers", {})

    def on_loading_finished(**kwargs):
        # Only the module-level byte counter is rebound here; `tab` is merely
        # read, so no nonlocal declaration is needed for it.
        global total_network_data

        request_id = kwargs.get("requestId")
        if request_id not in requests_dict:
            return

        encoded_data_length = kwargs.get("encodedDataLength", 0)
        req_info = requests_dict[request_id]
        url = req_info.get("url", "")
        from_disk = req_info.get("fromDiskCache", False)
        from_local = req_info.get("servedFromLocalCache", False)

        # Count only bytes that were not served from a cache.
        if not from_disk and not from_local:
            total_network_data += encoded_data_length
            print(f"[DATA] Request {request_id} => {encoded_data_length} bytes => {url}")
            print(f"Total net data (not from cache): {total_network_data/1024:.2f} KB")

        # If this is the Shopee API for item detail:
        if "api/v4/pdp/get_pc" in url:
            try:
                result = tab.Network.getResponseBody(requestId=request_id)
                body = result.get("body", "")
                if result.get("base64Encoded", False):
                    body = base64.b64decode(body).decode("utf-8", errors="replace")

                print("==========================================")
                print(f"Response for: {url}")
                print(body)
                print("==========================================\n")

                # Save valid responses
                if body.startswith('{"bff_meta":null'):
                    save_response_to_file(body, url, RESPONSE_DIR)
                else:
                    print("❌ Response does not match filter criteria. Skipping.")
            except Exception as e:
                print(f"Error retrieving body for {url}: {e}")

    def on_request_intercepted(interceptionId=None, authChallenge=None, request=None, **kwargs):
        """
        Intercept requests to serve from local cache or fetch them manually.
        """
        # resource_cache is only mutated in place (never rebound), so no
        # nonlocal declaration is required.
        global total_network_data

        if not request:
            tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
            return

        request_id = kwargs.get("requestId")
        url = request.get("url", "")
        headers = request.get("headers", {})
        content_type = headers.get("Content-Type", "")

        # Handle Basic Auth or Proxy Auth
        if authChallenge:
            print("Auth challenge detected. Providing credentials (example).")
            tab.Network.continueInterceptedRequest(
                interceptionId=interceptionId,
                authChallengeResponse={
                    "response": "ProvideCredentials",
                    "username": "example_username",
                    "password": "example_password",
                }
            )
            return

        # Classify the request as a static resource by path extension or by
        # the Content-Type header carried on the request.
        parsed = urlparse(url)
        path = parsed.path.lower()
        exts_js = [".js", ".json"]
        exts_img = [".png", ".jpg", ".jpeg", ".gif", ".webp"]
        exts_css = [".css"]
        is_js = any(path.endswith(e) for e in exts_js) or "application/javascript" in content_type
        is_img = any(path.endswith(e) for e in exts_img) or content_type.startswith("image/")
        is_css = any(path.endswith(e) for e in exts_css) or content_type == "text/css"

        cache_this = is_js or is_img or is_css

        # If we want to cache static resources:
        if cache_this:
            # Cache hit
            if url in resource_cache:
                print(f"⚡ [CACHE-HIT] Request {request_id}: {url}")
                if request_id in requests_dict:
                    requests_dict[request_id]["servedFromLocalCache"] = True
                cached_raw_response = resource_cache[url]
                try:
                    tab.Network.continueInterceptedRequest(
                        interceptionId=interceptionId,
                        rawResponse=cached_raw_response
                    )
                except Exception as e:
                    print(f"⚠️ [CACHE ERROR] {e}")
                    # Fall back to letting the browser perform the request.
                    tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
                return
            # Cache miss - fetch manually
            else:
                print(f"🌐 [CACHE-MISS] {url}")
                try:
                    r = requests.get(url, timeout=20)
                    if r.status_code != 200:
                        print(f"❌ Resource fetch status: {r.status_code}")
                        tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
                        return
                    raw_data = r.content
                    fetched_size = len(raw_data)
                    total_network_data += fetched_size
                    final_ct = r.headers.get("Content-Type", None)
                    if not final_ct:
                        guess_type, _ = mimetypes.guess_type(url)
                        final_ct = guess_type if guess_type else "application/octet-stream"
                    # Build a raw HTTP response (status line + headers + body)
                    # and base64-encode it for continueInterceptedRequest.
                    response_str = (
                        "HTTP/1.1 200 OK\r\n"
                        f"Content-Type: {final_ct}\r\n"
                        "Cache-Control: public, max-age=31536000\r\n"
                        f"Content-Length: {len(raw_data)}\r\n"
                        "\r\n"
                    )
                    combined = response_str.encode("utf-8") + raw_data
                    raw_response = base64.b64encode(combined).decode("utf-8")

                    # Update cache
                    resource_cache[url] = raw_response
                    # Persist whenever the entry count is a multiple of 20.
                    if len(resource_cache) % 20 == 0:
                        save_resource_cache(resource_cache)

                    if request_id in requests_dict:
                        requests_dict[request_id]["servedFromLocalCache"] = True
                    tab.Network.continueInterceptedRequest(
                        interceptionId=interceptionId,
                        rawResponse=raw_response
                    )
                    print("✅ Resource fetched & cached.")
                    return
                except Exception as e:
                    print(f"❌ Error fetching resource: {e}")
                    tab.Network.continueInterceptedRequest(interceptionId=interceptionId)
                    return
        else:
            # Not caching, just continue
            tab.Network.continueInterceptedRequest(interceptionId=interceptionId, headers=headers)

    # Attach the callbacks. The interception handler is registered here, before
    # interception is switched on below, so that no paused request can arrive
    # while there is no handler to resume it.
    tab.Network.requestWillBeSent = on_request_will_be_sent
    tab.Network.responseReceived = on_response_received
    tab.Network.loadingFinished = on_loading_finished
    tab.Network.requestIntercepted = on_request_intercepted

    tab.start()
    tab.Network.enable()
    # Clear cookies, caches
    tab.Network.clearBrowserCookies()
    tab.Network.clearBrowserCache()
    tab.Storage.clearDataForOrigin(
        origin="https://shopee.tw",
        storageTypes="all"
    )
    tab.Network.setBlockedURLs(urls=[
        "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.svg",
    ])
    # Intercept all requests
    tab.Network.setRequestInterception(patterns=[{"urlPattern": "*"}])

    return tab
263
+
264
def scrape_url(url, debug_port, csv_file, response_dir, resource_cache):
    """
    Launch a dedicated Chrome instance with a throw-away profile and navigate to `url`.

    Navigation is retried up to RETRIES times. Responses are captured by the
    callbacks that `setup_tab` installs on the tab. The Chrome process and its
    temporary profile directory are always cleaned up.

    Args:
        url: Page to open.
        debug_port: Remote-debugging port for this Chrome instance; also used
            as the label in log lines.
        csv_file: Not used inside this function; kept for caller compatibility.
        response_dir: Not used inside this function; kept for caller compatibility.
        resource_cache: Shared resource cache dict forwarded to `setup_tab`.
    """
    temp_dir = tempfile.mkdtemp(prefix="chrome_profile_")
    print(f"[Thread-{debug_port}] Launching Chrome for {url} | Profile: {temp_dir}")

    browser_url = f"http://127.0.0.1:{debug_port}"
    attempt = 0
    chrome_proc = None

    try:
        # Launched inside the try so the temp profile is removed even when
        # Chrome cannot be started.
        chrome_proc = launch_chrome(debug_port, temp_dir)
        time.sleep(5)  # Wait for Chrome to launch

        while attempt < RETRIES:
            tab = None
            try:
                print(f"[Thread-{debug_port}] Attempt {attempt + 1}/{RETRIES} - Navigating to {url}")

                browser = pychrome.Browser(url=browser_url)
                tab = setup_tab(browser, debug_port, resource_cache)

                # Example random sleep before navigation
                time.sleep(random.uniform(20, 40))
                tab.Page.navigate(url=url)
                time.sleep(random.uniform(60, 110))  # Wait for page to load

                # If navigation succeeds, break the retry loop
                print(f"[Thread-{debug_port}] Successfully navigated to {url}")
                break

            except Exception as e:
                print(f"[Thread-{debug_port}] Error scraping {url}: {e}")
                attempt += 1
                if attempt < RETRIES:
                    wait_time = random.uniform(30, 60)
                    print(f"[Thread-{debug_port}] Retrying in {wait_time:.2f} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"[Thread-{debug_port}] ❌ Max retries reached for {url}. Skipping.")

            finally:
                # Stop the tab created for this attempt (success or failure)
                # so that retries do not accumulate running tabs.
                if tab is not None:
                    try:
                        tab.stop()
                    except Exception as e:
                        print(f"[Thread-{debug_port}] Error stopping tab: {e}")

    # Cleanup after finishing attempts
    finally:
        if chrome_proc is not None:
            try:
                chrome_proc.kill()
                # Reap the process so it is gone before its profile directory
                # is deleted and no zombie is left behind.
                chrome_proc.wait(timeout=10)
            except Exception as e:
                print(f"[Thread-{debug_port}] Error terminating Chrome: {e}")
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"[Thread-{debug_port}] Finished scraping {url}")
modules/file_utils.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modules/file_utils.py
2
+ import os
3
+ import re
4
+ import json
5
+ import pandas as pd
6
+ from urllib.parse import urlparse, parse_qs
7
+
8
def load_urls_from_csv(csv_file):
    """
    Return the non-null values of the first column of a header-less CSV file.

    An empty list is returned (and a message printed) when the file does not
    exist or cannot be read.
    """
    if os.path.exists(csv_file):
        try:
            frame = pd.read_csv(csv_file, header=None)
            first_column = frame[0]
            found = first_column.dropna().tolist()
            print(f"✅ Loaded {len(found)} URLs from {csv_file}")
        except Exception as e:
            print(f"❌ Error reading CSV: {e}")
            return []
        return found

    print(f"❌ Error: CSV file '{csv_file}' not found.")
    return []
21
+
22
def generate_filename_from_url_request(url):
    """
    Build '<host>---i.<shop_id>.<item_id>.json' from an API URL's query string.

    The first value of each of the ``shop_id`` and ``item_id`` query
    parameters is used; a parameter that is absent becomes 'unknown'.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)

    def first_value(key):
        # parse_qs maps each key to a list of values; take the first one.
        return params.get(key, ["unknown"])[0]

    host = parts.netloc
    item = first_value("item_id")
    shop = first_value("shop_id")
    return f"{host}---i.{shop}.{item}.json"
31
+
32
def generate_filename_from_url(url):
    """
    Build '<host>---i.<shop_id>.<item_id>.json' from a product page URL.

    The ids are taken from the first '---i.<digits>.<digits>' occurrence in
    the URL path. Returns None (after printing a warning) when the path does
    not contain that pattern.
    """
    parts = urlparse(url)
    found = re.search(r"---i\.(\d+)\.(\d+)", parts.path)
    if found is None:
        print(f"⚠️ Unable to parse shop_id/item_id from URL: {url}")
        return None

    shop_id = found.group(1)
    item_id = found.group(2)
    return f"{parts.netloc}---i.{shop_id}.{item_id}.json"
43
+
44
def load_existing_responses(response_dir):
    """
    Return the names of the entries in `response_dir` as a set.

    When the directory does not exist it is created and an empty set is returned.
    """
    if os.path.exists(response_dir):
        return set(os.listdir(response_dir))

    os.makedirs(response_dir)
    return set()
50
+
51
def save_response_to_file(response_text, url, response_dir):
    """
    Save API response as a JSON file in the response_dir.
    Skips if file already exists.

    Args:
        response_text: Response body to write (text).
        url: API URL the response came from; determines the file name.
        response_dir: Target directory; created if it does not exist.

    Errors are printed, never raised.
    """
    filename = generate_filename_from_url_request(url)
    if not filename:
        print(f"⚠️ Cannot generate filename for URL: {url}")
        return
    file_path = os.path.join(response_dir, filename)

    try:
        os.makedirs(response_dir, exist_ok=True)
    except OSError as e:
        print(f"❌ Error saving response for {url}: {e}")
        return

    try:
        # Exclusive-create mode makes "skip if it already exists" a single
        # atomic step, so two workers saving the same item cannot both write.
        with open(file_path, "x", encoding="utf-8") as f:
            f.write(response_text)
    except FileExistsError:
        print(f"⚠️ Skipping duplicate: {filename} (Already exists)")
        return
    except Exception as e:
        print(f"❌ Error saving response for {url}: {e}")
        return

    print(f"✅ Response saved: {file_path}")
70
+
71
def find_missing_urls(csv_file, response_dir):
    """
    Return the CSV URLs whose expected response file is not in `response_dir`.

    URLs for which no file name can be derived are left out of the result.
    """
    urls = load_urls_from_csv(csv_file)
    already_saved = load_existing_responses(response_dir)

    # Pair every URL with the file name its response would be stored under.
    named = ((candidate, generate_filename_from_url(candidate)) for candidate in urls)
    missing_urls = [
        candidate
        for candidate, expected_name in named
        if expected_name and expected_name not in already_saved
    ]

    print(f"🔍 {len(missing_urls)} URLs are missing from {response_dir}.")
    return missing_urls
82
+
83
def pick_random_url(csv_file, response_dir):
    """Return a randomly chosen missing URL, or None when nothing is missing."""
    import random

    candidates = find_missing_urls(csv_file, response_dir)
    return random.choice(candidates) if candidates else None
modules/scraper_utils.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modules/scraper_utils.py
2
+ import time
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from modules.file_utils import load_existing_responses, find_missing_urls
5
+ from modules.chrome_utils import scrape_url, load_resource_cache, save_resource_cache
6
+ from config import (CSV_FILE, RESPONSE_DIR, BASE_PORT, MAX_WORKERS)
7
+
8
def run_scraper():
    """
    Scrape every CSV URL that has no saved response yet.

    Each missing URL is submitted to a thread pool as one `scrape_url` task
    with its own debug port (BASE_PORT plus the URL's index). The shared
    resource cache is saved after all tasks have finished.
    """
    # Report how many responses are already on disk.
    already_saved = load_existing_responses(RESPONSE_DIR)
    print(f"Total responses in '{RESPONSE_DIR}': {len(already_saved)}")

    pending = find_missing_urls(CSV_FILE, RESPONSE_DIR)
    if not pending:
        print("No missing URLs to process.")
        return

    resource_cache = load_resource_cache()
    pool_size = min(MAX_WORKERS, len(pending))

    with ThreadPoolExecutor(max_workers=pool_size) as executor:
        # enumerate(start=BASE_PORT) yields BASE_PORT + index for each URL.
        futures = [
            executor.submit(
                scrape_url,
                url,
                debug_port,
                CSV_FILE,
                RESPONSE_DIR,
                resource_cache,
            )
            for debug_port, url in enumerate(pending, start=BASE_PORT)
        ]

        for finished in as_completed(futures):
            try:
                finished.result()
            except Exception as exc:
                print(f"[Thread] Exception: {exc}")

    # After all threads complete, save updated resource cache
    save_resource_cache(resource_cache)
    print("All scraping tasks completed.")