Muttered3 committed on
Commit
2e6f7cf
·
verified ·
1 Parent(s): 253b732

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +51 -48
scraper.py CHANGED
@@ -15,47 +15,38 @@ USER_AGENTS = [
15
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
16
  ]
17
 
18
- # Thread-safe global synchronization boundaries for token balancing
19
  _global_api_hash = None
 
20
  _hash_fetched_at = 0.0
21
  _hash_lock = asyncio.Lock()
22
  _rate_limited_until = 0.0
23
 
24
- async def _get_valid_api_hash(proxy_url: str = None) -> str:
25
  """
26
- Ensures a singular valid session context hash token is distributed across
27
- all background threads simultaneously, utilizing proxy routing to bypass
28
- initial Cloudflare blocks on the server IP range.
29
  """
30
- global _global_api_hash, _hash_fetched_at
31
 
32
  async with _hash_lock:
33
  now = time.time()
34
- # Recycle token cache window for 15 minutes before executing refreshing sequences
35
- if _global_api_hash and (now - _hash_fetched_at < 900):
36
- return _global_api_hash
37
-
38
- # Force proxy utilization on handshake layer to mask server footprint
39
- active_proxy = proxy_url
40
- if not active_proxy and state.proxies:
41
- active_proxy = random.choice(state.proxies)
42
 
43
  headers = {
44
  "User-Agent": random.choice(USER_AGENTS),
45
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
46
  "Accept-Language": "en-US,en;q=0.5",
47
- "Cache-Control": "no-cache",
48
- "Pragma": "no-cache"
49
  }
50
 
51
  try:
52
  timeout = aiohttp.ClientTimeout(total=10)
53
  async with aiohttp.ClientSession(timeout=timeout) as session:
54
- # Route the home page request through the active proxy tunnel
55
- async with session.get("https://fragment.com/", headers=headers, proxy=active_proxy, allow_redirects=True) as resp:
56
  html = await resp.text()
57
 
58
- # Target matching layout configurations capturing nested App.init configurations
59
  match = (
60
  re.search(r'"hash"\s*:\s*"([a-f0-9]+)"', html, re.IGNORECASE) or
61
  re.search(r'hash\s*=\s*["\']([a-f0-9]+)["\']', html, re.IGNORECASE) or
@@ -64,34 +55,32 @@ async def _get_valid_api_hash(proxy_url: str = None) -> str:
64
 
65
  if match:
66
  _global_api_hash = match.group(1)
 
67
  _hash_fetched_at = now
68
- log.info(f"🌐 Successfully Stabilized Global API Hash Token: {_global_api_hash} (via Proxy)")
69
- return _global_api_hash
70
  else:
71
- log.error("❌ Token extraction fault. DOM format structure layout changed or proxy blocked.")
72
- with open("token_dom_error.log", "w", encoding="utf-8") as f:
73
- f.write(html[:2000])
74
-
75
  except Exception as e:
76
- log.error(f"Global Handshake Sync Failure: {str(e)}")
77
 
78
- return _global_api_hash if _global_api_hash else "FAILED_TOKEN"
79
 
80
  async def check_fragment(word: str, proxy_url: str = None) -> str:
81
  """
82
- Connects directly to Fragment's internal AJAX endpoint using XMLHttpRequest signatures,
83
- bypassing front-end web layout redirection problems entirely.
84
  """
85
  global _rate_limited_until
86
  word = word.strip().replace("@", "").lower()
87
 
88
- # 4 Retries matching your advanced optimization matrix
89
  for attempt in range(1, 5):
90
  current_time = time.time()
91
  if current_time < _rate_limited_until:
92
- await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.1, 0.4))
 
93
 
94
- api_hash = await _get_valid_api_hash(proxy_url)
95
  if api_hash == "FAILED_TOKEN":
96
  return "ERROR"
97
 
@@ -101,8 +90,7 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
101
  "X-Requested-With": "XMLHttpRequest",
102
  "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
103
  "Origin": "https://fragment.com",
104
- "Referer": "https://fragment.com/",
105
- "Accept": "application/json, text/javascript, */*; q=0.01"
106
  }
107
 
108
  payload = {
@@ -112,36 +100,53 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
112
  }
113
 
114
  timeout = aiohttp.ClientTimeout(total=12)
115
- async with aiohttp.ClientSession(timeout=timeout) as session:
 
 
 
 
116
  async with session.post(f"https://fragment.com/api?hash={api_hash}", data=payload, headers=headers, proxy=proxy_url) as resp:
117
 
118
  if resp.status in [429, 403]:
119
- backoff = 4 + (2 ** attempt) + random.uniform(0.5, 1.5)
120
  _rate_limited_until = time.time() + backoff
121
- log.warning(f"⚠️ Proxy soft-banned on '{word}'. Backing off {backoff:.2f}s...")
122
  continue
123
 
124
  if resp.status != 200:
 
 
 
 
 
 
 
 
 
 
125
  continue
126
 
127
- response_json = await resp.json()
 
 
 
 
 
128
 
 
129
  if not response_json.get("ok"):
130
- # Force token reload sequence if current register gets dropped by endpoint engine
131
  global _global_api_hash
132
- _global_api_hash = None
133
  continue
134
 
135
  html_fragment = response_json.get("html", "")
136
  if not html_fragment.strip():
137
  return "AVAILABLE"
138
 
139
- # Clean whitespace structure to allow seamless single-line tracking
140
  clean_html = re.sub(r'\s+', ' ', html_fragment)
141
 
142
- # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
143
- # ENGINE 1: STRUCTURAL TABLE LAYER SCANNING
144
- # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
145
  search_regex = re.compile(rf'>@{word}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<', re.IGNORECASE)
146
  match = search_regex.search(clean_html)
147
 
@@ -154,9 +159,7 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
154
  if "sale" in s or "purchase" in s: return "FOR_SALE"
155
  return s.upper()
156
 
157
- # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
158
- # ENGINE 2: GLOBAL FALLBACK LAYER MATCHING
159
- # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
160
  fallback_text = clean_html.lower()
161
  if "on auction" in fallback_text: return "ON_AUCTION"
162
  if "sold" in fallback_text: return "SOLD"
@@ -167,7 +170,7 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
167
  return "AVAILABLE"
168
 
169
  except Exception as e:
170
- log.error(f"Internal API pipeline fault for '{word}' via proxy: {str(e)}")
171
  await asyncio.sleep(1.0 * attempt)
172
 
173
  return "ERROR"
 
15
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
16
  ]
17
 
18
+ # Thread-safe global telemetry registers and cookie state stores
19
  _global_api_hash = None
20
+ _global_cookies = None
21
  _hash_fetched_at = 0.0
22
  _hash_lock = asyncio.Lock()
23
  _rate_limited_until = 0.0
24
 
25
+ async def _get_valid_api_context(proxy_url: str = None) -> tuple:
26
  """
27
+ Synchronizes handshakes across concurrent worker processes, caching security
28
+ tokens and structural cookie strings synchronously.
 
29
  """
30
+ global _global_api_hash, _global_cookies, _hash_fetched_at
31
 
32
  async with _hash_lock:
33
  now = time.time()
34
+ # Keep context valid in memory for 15 minutes before executing renewal
35
+ if _global_api_hash and _global_cookies and (now - _hash_fetched_at < 900):
36
+ return _global_api_hash, _global_cookies
 
 
 
 
 
37
 
38
  headers = {
39
  "User-Agent": random.choice(USER_AGENTS),
40
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
41
  "Accept-Language": "en-US,en;q=0.5",
 
 
42
  }
43
 
44
  try:
45
  timeout = aiohttp.ClientTimeout(total=10)
46
  async with aiohttp.ClientSession(timeout=timeout) as session:
47
+ async with session.get("https://fragment.com/", headers=headers, proxy=proxy_url, allow_redirects=True) as resp:
 
48
  html = await resp.text()
49
 
 
50
  match = (
51
  re.search(r'"hash"\s*:\s*"([a-f0-9]+)"', html, re.IGNORECASE) or
52
  re.search(r'hash\s*=\s*["\']([a-f0-9]+)["\']', html, re.IGNORECASE) or
 
55
 
56
  if match:
57
  _global_api_hash = match.group(1)
58
+ _global_cookies = session.cookie_jar.filter_cookies(resp.url)
59
  _hash_fetched_at = now
60
+ log.info(f"🌐 Synchronized Security Tokens. Base Hash: {_global_api_hash}")
61
+ return _global_api_hash, _global_cookies
62
  else:
63
+ log.error("❌ Handshake payload parse failure. Initializing emergency drop context.")
 
 
 
64
  except Exception as e:
65
+ log.error(f"Handshake connection exception: {str(e)}")
66
 
67
+ return (_global_api_hash if _global_api_hash else "FAILED_TOKEN"), _global_cookies
68
 
69
  async def check_fragment(word: str, proxy_url: str = None) -> str:
70
  """
71
+ Direct endpoint query module processing explicit payload variants
72
+ and capturing hidden type mutations.
73
  """
74
  global _rate_limited_until
75
  word = word.strip().replace("@", "").lower()
76
 
 
77
  for attempt in range(1, 5):
78
  current_time = time.time()
79
  if current_time < _rate_limited_until:
80
+ # 2.5s sweet-spot padding allows Fragment's leaky bucket queue to clear cleanly
81
+ await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.5, 1.5))
82
 
83
+ api_hash, session_cookies = await _get_valid_api_context(proxy_url)
84
  if api_hash == "FAILED_TOKEN":
85
  return "ERROR"
86
 
 
90
  "X-Requested-With": "XMLHttpRequest",
91
  "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
92
  "Origin": "https://fragment.com",
93
+ "Referer": "https://fragment.com/"
 
94
  }
95
 
96
  payload = {
 
100
  }
101
 
102
  timeout = aiohttp.ClientTimeout(total=12)
103
+ # Share cookies explicitly across global sessions to avoid context tracking errors
104
+ async with aiohttp.ClientSession(timeout=timeout, cookie_jar=aiohttp.DummyCookieJar()) as session:
105
+ if session_cookies:
106
+ session.cookie_jar.update_cookies(session_cookies)
107
+
108
  async with session.post(f"https://fragment.com/api?hash={api_hash}", data=payload, headers=headers, proxy=proxy_url) as resp:
109
 
110
  if resp.status in [429, 403]:
111
+ backoff = 4 + (2 ** attempt) + random.uniform(1.0, 2.5)
112
  _rate_limited_until = time.time() + backoff
113
+ log.warning(f"⚠️ Hard HTTP {resp.status} intercept on '{word}'. Pacing pool...")
114
  continue
115
 
116
  if resp.status != 200:
117
+ await asyncio.sleep(1.0)
118
+ continue
119
+
120
+ # ⚑ PRODUCTION USE CASE CRITICAL VARIANT INTERCEPT
121
+ try:
122
+ response_json = await resp.json()
123
+ except ValueError:
124
+ # Catch Cloudflare challenge scripts hidden in standard text strings
125
+ backoff = 5 + random.uniform(1.0, 3.0)
126
+ _rate_limited_until = time.time() + backoff
127
  continue
128
 
129
+ # Intercept Fragment Array Payload RateLimit variation
130
+ if isinstance(response_json, list):
131
+ backoff = 6 + (2 ** attempt) + random.uniform(1.5, 3.5)
132
+ _rate_limited_until = time.time() + backoff
133
+ log.warning(f"🛑 [Telemetry Capture] RateLimit Array Payload engaged checking '{word}'. Backing off {backoff:.2f}s...")
134
+ continue
135
 
136
+ # Intercept internal engine errors or tracking token drops
137
  if not response_json.get("ok"):
 
138
  global _global_api_hash
139
+ _global_api_hash = None # Clear invalid signature cache register instantly
140
  continue
141
 
142
  html_fragment = response_json.get("html", "")
143
  if not html_fragment.strip():
144
  return "AVAILABLE"
145
 
146
+ # Flatten spatial formatting structures to allow linear execution lookups
147
  clean_html = re.sub(r'\s+', ' ', html_fragment)
148
 
149
+ # Structural string boundary lookups
 
 
150
  search_regex = re.compile(rf'>@{word}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<', re.IGNORECASE)
151
  match = search_regex.search(clean_html)
152
 
 
159
  if "sale" in s or "purchase" in s: return "FOR_SALE"
160
  return s.upper()
161
 
162
+ # System text-layer validation block failsafes
 
 
163
  fallback_text = clean_html.lower()
164
  if "on auction" in fallback_text: return "ON_AUCTION"
165
  if "sold" in fallback_text: return "SOLD"
 
170
  return "AVAILABLE"
171
 
172
  except Exception as e:
173
+ log.error(f"Internal endpoint transaction pipeline failure for '{word}': {str(e)}")
174
  await asyncio.sleep(1.0 * attempt)
175
 
176
  return "ERROR"