Muttered3 committed on
Commit
ebcc930
·
verified ·
1 Parent(s): b33ebac

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +41 -105
scraper.py CHANGED
@@ -9,6 +9,7 @@ from parser import parse_html
9
 
10
  log = get_logger()
11
 
 
12
  USER_AGENTS = [
13
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
14
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
@@ -16,138 +17,73 @@ USER_AGENTS = [
16
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
17
  ]
18
 
19
- # Singleton synchronization variables for proxy-free execution
20
- _global_api_hash = None
21
- _global_cookies = None
22
- _hash_fetched_at = 0.0
23
- _hash_lock = asyncio.Lock()
24
  _rate_limited_until = 0.0
25
 
26
- async def _get_valid_api_context() -> tuple:
27
- """
28
- Centralized Gateway Handshake: Extracts session initialization context
29
- safely while ensuring worker threads don't collide.
30
- """
31
- global _global_api_hash, _global_cookies, _hash_fetched_at
32
-
33
- if _global_api_hash and _global_cookies and (time.time() - _hash_fetched_at < 900):
34
- return _global_api_hash, _global_cookies
35
-
36
- async with _hash_lock:
37
- now = time.time()
38
- if _global_api_hash and _global_cookies and (now - _hash_fetched_at < 900):
39
- return _global_api_hash, _global_cookies
40
-
41
- headers = {
42
- "User-Agent": random.choice(USER_AGENTS),
43
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
44
- "Accept-Language": "en-US,en;q=0.5",
45
- }
46
-
47
- try:
48
- timeout = aiohttp.ClientTimeout(total=10)
49
- async with aiohttp.ClientSession(timeout=timeout) as session:
50
- async with session.get("https://fragment.com/", headers=headers, allow_redirects=True) as resp:
51
- html = await resp.text()
52
-
53
- match = (
54
- re.search(r'"hash"\s*:\s*"([a-f0-9]+)"', html, re.IGNORECASE) or
55
- re.search(r'hash\s*=\s*["\']([a-f0-9]+)["\']', html, re.IGNORECASE) or
56
- re.search(r'href="/api\?hash=([a-f0-9]+)"', html, re.IGNORECASE)
57
- )
58
-
59
- if match:
60
- _global_api_hash = match.group(1)
61
- _global_cookies = session.cookie_jar.filter_cookies(resp.url)
62
- _hash_fetched_at = now
63
- log.info(f"🌐 [Handshake Match] Session Context Stabilized: {_global_api_hash}")
64
- return _global_api_hash, _global_cookies
65
- else:
66
- log.error("❌ Token context match fault. Security block may be active.")
67
- except Exception as e:
68
- log.error(f"Handshake connection break: {str(e)}")
69
-
70
- return (_global_api_hash if _global_api_hash else "FAILED_TOKEN"), _global_cookies
71
-
72
  async def check_fragment(word: str, proxy_url: str = None) -> str:
73
  """
74
- High-Throughput Endpoint Scraper incorporating exponential fallback
75
- and adaptive sleep windows for proxy-free environments.
76
  """
77
  global _rate_limited_until
78
  word = word.strip().replace("@", "").lower()
 
79
 
 
80
  for attempt in range(1, 5):
81
  current_time = time.time()
 
 
82
  if current_time < _rate_limited_until:
83
- # 2.5s structural padding allows Fragment's server-side leaky bucket pool to clear cleanly
84
  await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.5, 1.5))
85
 
86
- api_hash, session_cookies = await _get_valid_api_context()
87
- if api_hash == "FAILED_TOKEN":
88
- await asyncio.sleep(3.0)
89
- return "ERROR"
 
 
 
 
90
 
91
  try:
92
- headers = {
93
- "User-Agent": random.choice(USER_AGENTS),
94
- "X-Requested-With": "XMLHttpRequest",
95
- "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
96
- "Origin": "https://fragment.com",
97
- "Referer": "https://fragment.com/"
98
- }
99
-
100
- payload = {
101
- "query": word,
102
- "type": "usernames",
103
- "method": "searchAuctions"
104
- }
105
-
106
- timeout = aiohttp.ClientTimeout(total=10)
107
- async with aiohttp.ClientSession(timeout=timeout, cookie_jar=aiohttp.DummyCookieJar()) as session:
108
- if session_cookies:
109
- session.cookie_jar.update_cookies(session_cookies)
110
-
111
- async with session.post(f"https://fragment.com/api?hash={api_hash}", data=payload, headers=headers) as resp:
112
 
113
- if resp.status in [429, 403]:
114
- backoff = 5 + (3 ** attempt) + random.uniform(1.0, 2.5)
 
115
  _rate_limited_until = time.time() + backoff
116
- log.warning(f"⚠️ [HTTP {resp.status}] Server boundary challenge. Cooling pool {backoff:.1f}s...")
117
  continue
118
 
119
- if resp.status != 200:
 
120
  continue
121
 
122
- try:
123
- response_json = await resp.json()
124
- except ValueError:
125
- # Intercept Cloudflare JS Engine text injections
126
- backoff = 10 + random.uniform(2.0, 5.0)
127
- _rate_limited_until = time.time() + backoff
128
- log.error("❌ JavaScript Browser Challenge Intercepted. Pacing main loops...")
129
- continue
130
 
131
- # INTERCEPT INLINE ARRAY RATE LIMIT SHIFTS
132
- if isinstance(response_json, list):
133
- backoff = 6 + (2 ** attempt) + random.uniform(1.5, 3.5)
134
  _rate_limited_until = time.time() + backoff
135
- log.warning(f"🛑 [Telemetry Capture] RateLimit Array Payload hit checking '{word}'. Backing off {backoff:.1f}s...")
136
- continue
137
-
138
- if not response_json.get("ok"):
139
- global _global_api_hash
140
- _global_api_hash = None # Instantly wipe cached registry signature to trigger context rebuild
141
  continue
142
 
143
- html_fragment = response_json.get("html", "")
144
-
145
- # Offload the structural processing cleanly to your parser module
146
- decision = parse_html(html_fragment, str(resp.url), word)
 
147
  return decision
148
 
149
  except Exception as e:
150
- log.error(f"Internal endpoint transaction pipeline failure for '{word}': {str(e)}")
151
  await asyncio.sleep(1.5 * attempt)
152
 
153
- return "ERROR"
 
9
 
10
  log = get_logger()
11
 
12
+ # Raw browser fingerprint headers to emulate real user navigation paths
13
  USER_AGENTS = [
14
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
15
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
 
17
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
18
  ]
19
 
20
+ # Shared global pacing timeline to protect single IP reputation windows
 
 
 
 
21
  _rate_limited_until = 0.0
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  async def check_fragment(word: str, proxy_url: str = None) -> str:
24
  """
25
+ Direct Public DOM Extraction Engine.
26
+ Restructured for proxy-free operation to bypass token handshake verification entirely.
27
  """
28
  global _rate_limited_until
29
  word = word.strip().replace("@", "").lower()
30
+ url = f"https://fragment.com/username/{word}"
31
 
32
+ # 4 Retries incorporating adaptive exponential backoff pacing
33
  for attempt in range(1, 5):
34
  current_time = time.time()
35
+
36
+ # Enforce rate-limit pacing window dynamically
37
  if current_time < _rate_limited_until:
 
38
  await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.5, 1.5))
39
 
40
+ headers = {
41
+ "User-Agent": random.choice(USER_AGENTS),
42
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
43
+ "Accept-Language": "en-US,en;q=0.5",
44
+ "Referer": "https://fragment.com/",
45
+ "Cache-Control": "no-cache",
46
+ "Pragma": "no-cache"
47
+ }
48
 
49
  try:
50
+ # Short connect timeout boundaries drop dead sockets cleanly
51
+ timeout = aiohttp.ClientTimeout(total=12, connect=4)
52
+ async with aiohttp.ClientSession(timeout=timeout) as session:
53
+
54
+ # allow_redirects=True is critical to resolve unassigned usernames cleanly
55
+ async with session.get(url, headers=headers, allow_redirects=True) as resp:
56
+ status = resp.status
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
+ if status in [429, 403]:
59
+ # Back off exponentially to preserve single-IP request pools
60
+ backoff = 5 + (3 ** attempt) + random.uniform(1.0, 3.0)
61
  _rate_limited_until = time.time() + backoff
62
+ log.warning(f"⚠️ Single IP throttled (HTTP {status}) on '{word}'. Cooling pool for {backoff:.1f}s...")
63
  continue
64
 
65
+ if status != 200:
66
+ await asyncio.sleep(1.0)
67
  continue
68
 
69
+ html = await resp.text()
 
 
 
 
 
 
 
70
 
71
+ # Trap Cloudflare anti-bot challenge scripts directly
72
+ if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
73
+ backoff = 15 + random.uniform(2.0, 5.0)
74
  _rate_limited_until = time.time() + backoff
75
+ log.error(" Cloudflare Browser Challenge engaged. Slowing worker loops down...")
 
 
 
 
 
76
  continue
77
 
78
+ # Capture the resolved destination URL metadata signature
79
+ final_url = str(resp.url)
80
+
81
+ # Offload the raw page layout data directly to your processing parser
82
+ decision = parse_html(html, final_url, word)
83
  return decision
84
 
85
  except Exception as e:
86
+ log.error(f"Network pipe transaction failure for '{word}': {str(e)}")
87
  await asyncio.sleep(1.5 * attempt)
88
 
89
+ return "ERROR"