Muttered3 committed on
Commit
b33ebac
·
verified ·
1 Parent(s): 02a8ebc

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +126 -82
scraper.py CHANGED
@@ -2,108 +2,152 @@ import asyncio
2
  import random
3
  import re
4
  import time
5
- from curl_cffi.requests import AsyncSession
6
  from logger import get_logger
 
 
7
 
8
  log = get_logger()
9
 
10
- # Shared global pacing throttle to manage single-IP footprint
 
 
 
 
 
 
 
 
 
 
 
11
  _rate_limited_until = 0.0
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
# Ordered (keyword, verdict) rules applied to the status badge text of a
# /username/ page. "unavailable" must be tested before "available" because the
# latter is a substring of the former.
_PROFILE_STATUS_RULES = (
    ("sold", "SOLD"),
    ("taken", "TAKEN"),
    ("auction", "ON_AUCTION"),
    ("unavailable", "UNAVAILABLE"),
    ("available", "AVAILABLE"),
    ("sale", "FOR_SALE"),
    ("purchase", "FOR_SALE"),
)

# Same idea for the status cell found on a search-results page.
_SEARCH_STATUS_RULES = (
    ("taken", "TAKEN"),
    ("unavailable", "UNAVAILABLE"),
    ("available", "AVAILABLE"),
    ("sold", "SOLD"),
    ("auction", "ON_AUCTION"),
)


def _classify_status(text: str, rules) -> str:
    """Return the verdict of the first rule whose keyword occurs in *text*.

    Falls back to the upper-cased, stripped status text when no rule matches.
    """
    lowered = text.strip().lower()
    for keyword, verdict in rules:
        if keyword in lowered:
            return verdict
    return lowered.upper()


def _parse_profile_page(html: str) -> str:
    """Derive a verdict from the HTML of a direct /username/ page."""
    status_match = re.search(
        r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>',
        html,
        re.IGNORECASE,
    )
    if status_match:
        return _classify_status(status_match.group(1), _PROFILE_STATUS_RULES)

    # No status badge text: fall back to CSS class markers.
    if 'class="tm-status-taken"' in html:
        return "TAKEN"
    if 'class="tm-status-unavail"' in html or 'tm-section-header-status tm-status-unavail' in html:
        lowered = html.lower()
        if "sold for" in lowered or "tm-username-usable" in lowered or "recently sold" in lowered:
            return "SOLD"
        return "UNAVAILABLE"

    return "AVAILABLE"


def _parse_search_page(html: str, word: str) -> str:
    """Derive a verdict from a search-results page the request was redirected to."""
    clean_html = re.sub(r'\s+', ' ', html)
    # re.escape keeps regex metacharacters in the username from altering the pattern.
    search_regex = re.compile(
        rf'>@{re.escape(word)}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<',
        re.IGNORECASE,
    )
    match = search_regex.search(clean_html)
    if match:
        return _classify_status(match.group(1), _SEARCH_STATUS_RULES)

    # No row for this exact username: scan the whole page text for status words.
    fallback_text = clean_html.lower()
    if "on auction" in fallback_text:
        return "ON_AUCTION"
    if "sold" in fallback_text:
        return "SOLD"
    if "unavailable" in fallback_text:
        return "UNAVAILABLE"
    if "taken" in fallback_text or "make an offer" in fallback_text:
        return "TAKEN"

    return "UNAVAILABLE"


async def check_fragment(word: str, proxy_url: str = None) -> str:
    """
    Look up a username on fragment.com and return its status string.

    The page is fetched with a curl_cffi session impersonating Chrome. Up to
    four attempts are made; HTTP 429/403 and challenge pages push the shared
    module-level ``_rate_limited_until`` deadline forward so later attempts
    (and other callers) wait before retrying.

    Args:
        word: Username to check; surrounding whitespace and "@" are removed
            and the result is lower-cased.
        proxy_url: Accepted for interface compatibility; not used here.

    Returns:
        A verdict such as "AVAILABLE", "TAKEN", "SOLD", "ON_AUCTION",
        "FOR_SALE" or "UNAVAILABLE", the upper-cased raw status text when it
        matches no known keyword, or "ERROR" when every attempt failed.
    """
    global _rate_limited_until
    word = word.strip().replace("@", "").lower()
    url = f"https://fragment.com/username/{word}"

    for attempt in range(1, 5):
        current_time = time.time()

        # Honour the shared cooldown, plus jitter so waiters do not wake together.
        if current_time < _rate_limited_until:
            await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.5, 1.5))

        try:
            # A fresh session per attempt, closed by the context manager.
            async with AsyncSession(impersonate="chrome120", timeout=12) as session:
                resp = await session.get(url, allow_redirects=True)

                if resp.status_code in (429, 403):
                    backoff = 6 + (2 ** attempt) + random.uniform(1.0, 2.5)
                    _rate_limited_until = time.time() + backoff
                    log.warning(f"⚠️ IP Pacing Throttle Engaged (HTTP {resp.status_code}) on '{word}'. Cooling...")
                    continue

                if resp.status_code != 200:
                    # Previously retried silently; record the status for diagnosis.
                    log.warning(f"Unexpected HTTP {resp.status_code} for '{word}' (attempt {attempt}).")
                    continue

                html = resp.text

                # A challenge/interstitial page instead of real content.
                if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
                    backoff = 12 + random.uniform(2.0, 5.0)
                    _rate_limited_until = time.time() + backoff
                    log.error("❌ Emulated validation gate triggered. Expanding cooldown window...")
                    continue

                final_url = str(resp.url)

                # Still on the profile page → parse it; otherwise we were
                # redirected to search results.
                if "/username/" in final_url:
                    return _parse_profile_page(html)
                return _parse_search_page(html, word)

        except Exception as e:
            log.error(f"Impersonation channel error for '{word}': {str(e)}")
            await asyncio.sleep(1.5 * attempt)

    return "ERROR"
 
2
  import random
3
  import re
4
  import time
5
+ import aiohttp
6
  from logger import get_logger
7
+ from state import state
8
+ from parser import parse_html
9
 
10
  log = get_logger()
11
 
12
+ USER_AGENTS = [
13
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
14
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
15
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36",
16
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
17
+ ]
18
+
19
+ # Singleton synchronization variables for proxy-free execution
20
+ _global_api_hash = None
21
+ _global_cookies = None
22
+ _hash_fetched_at = 0.0
23
+ _hash_lock = asyncio.Lock()
24
  _rate_limited_until = 0.0
25
 
26
async def _get_valid_api_context() -> tuple:
    """
    Return the cached ``(api_hash, cookies)`` pair, refreshing it when stale.

    The pair is kept in module-level globals and reused for 900 seconds. On a
    miss, the fragment.com homepage is fetched under ``_hash_lock`` so only one
    caller refreshes at a time; the others re-check the cache once they get
    the lock.

    Returns:
        ``(api_hash, cookies)``. If the refresh fails, the previously cached
        hash is returned when one exists; otherwise the hash is the sentinel
        string ``"FAILED_TOKEN"``. ``cookies`` may be ``None`` if no handshake
        has ever succeeded.
    """
    global _global_api_hash, _global_cookies, _hash_fetched_at

    def _cache_is_fresh(at: float) -> bool:
        # Cookies are tested with `is not None`, not truthiness: an empty
        # cookie mapping is falsy, which would make the cache never hit and
        # force a homepage fetch on every call.
        return (
            bool(_global_api_hash)
            and _global_cookies is not None
            and at - _hash_fetched_at < 900
        )

    # Fast path without taking the lock.
    if _cache_is_fresh(time.time()):
        return _global_api_hash, _global_cookies

    async with _hash_lock:
        now = time.time()
        # Another caller may have refreshed while we waited for the lock.
        if _cache_is_fresh(now):
            return _global_api_hash, _global_cookies

        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }

        try:
            timeout = aiohttp.ClientTimeout(total=10)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get("https://fragment.com/", headers=headers, allow_redirects=True) as resp:
                    html = await resp.text()

                    # Try the known places the hash appears in the page, in order.
                    match = (
                        re.search(r'"hash"\s*:\s*"([a-f0-9]+)"', html, re.IGNORECASE) or
                        re.search(r'hash\s*=\s*["\']([a-f0-9]+)["\']', html, re.IGNORECASE) or
                        re.search(r'href="/api\?hash=([a-f0-9]+)"', html, re.IGNORECASE)
                    )

                    if match:
                        _global_api_hash = match.group(1)
                        # Cookies the jar would send back to the final URL.
                        _global_cookies = session.cookie_jar.filter_cookies(resp.url)
                        _hash_fetched_at = now
                        log.info(f"🌐 [Handshake Match] Session Context Stabilized: {_global_api_hash}")
                        return _global_api_hash, _global_cookies

                    log.error("❌ Token context match fault. Security block may be active.")
        except Exception as e:
            log.error(f"Handshake connection break: {str(e)}")

        # Refresh failed: fall back to any previously cached hash.
        return (_global_api_hash if _global_api_hash else "FAILED_TOKEN"), _global_cookies
71
+
72
async def check_fragment(word: str, proxy_url: str = None) -> str:
    """
    Query fragment.com's search API for a username and return its status.

    Each of up to four attempts obtains the shared ``(api_hash, cookies)``
    context from ``_get_valid_api_context()``, POSTs a ``searchAuctions``
    request, and passes the returned HTML fragment to ``parse_html``.
    Rate-limit responses and non-JSON bodies push the shared module-level
    ``_rate_limited_until`` deadline forward before the next attempt.

    Args:
        word: Username to check; surrounding whitespace and "@" are removed
            and the result is lower-cased.
        proxy_url: Accepted for interface compatibility; not used here.

    Returns:
        Whatever ``parse_html`` returns for a successful response, or
        "ERROR" when no API context is available or every attempt failed.
    """
    # Declared up front rather than mid-loop so the globals this function
    # mutates are visible at a glance.
    global _rate_limited_until, _global_api_hash
    word = word.strip().replace("@", "").lower()

    for attempt in range(1, 5):
        current_time = time.time()
        if current_time < _rate_limited_until:
            # Wait out the shared cooldown plus 0.5-1.5 s of random jitter.
            await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.5, 1.5))

        api_hash, session_cookies = await _get_valid_api_context()
        if api_hash == "FAILED_TOKEN":
            await asyncio.sleep(3.0)
            return "ERROR"

        try:
            headers = {
                "User-Agent": random.choice(USER_AGENTS),
                "X-Requested-With": "XMLHttpRequest",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "Origin": "https://fragment.com",
                "Referer": "https://fragment.com/",
            }

            payload = {
                "query": word,
                "type": "usernames",
                "method": "searchAuctions",
            }

            timeout = aiohttp.ClientTimeout(total=10)
            # Use the default cookie jar: DummyCookieJar discards everything
            # given to update_cookies(), so the handshake cookies were never
            # sent with the API call.
            async with aiohttp.ClientSession(timeout=timeout, cookies=session_cookies or None) as session:
                async with session.post(f"https://fragment.com/api?hash={api_hash}", data=payload, headers=headers) as resp:

                    if resp.status in (429, 403):
                        backoff = 5 + (3 ** attempt) + random.uniform(1.0, 2.5)
                        _rate_limited_until = time.time() + backoff
                        log.warning(f"⚠️ [HTTP {resp.status}] Server boundary challenge. Cooling pool {backoff:.1f}s...")
                        continue

                    if resp.status != 200:
                        # Previously retried silently; record the status for diagnosis.
                        log.warning(f"Unexpected HTTP {resp.status} checking '{word}' (attempt {attempt}).")
                        continue

                    try:
                        response_json = await resp.json()
                    except (aiohttp.ContentTypeError, ValueError):
                        # resp.json() raises ContentTypeError (not a ValueError)
                        # when the body is not served as JSON, e.g. an HTML
                        # challenge page; ValueError covers malformed JSON.
                        backoff = 10 + random.uniform(2.0, 5.0)
                        _rate_limited_until = time.time() + backoff
                        log.error("❌ JavaScript Browser Challenge Intercepted. Pacing main loops...")
                        continue

                    # A JSON array instead of an object is treated as a rate-limit signal.
                    if isinstance(response_json, list):
                        backoff = 6 + (2 ** attempt) + random.uniform(1.5, 3.5)
                        _rate_limited_until = time.time() + backoff
                        log.warning(f"🛑 [Telemetry Capture] RateLimit Array Payload hit checking '{word}'. Backing off {backoff:.1f}s...")
                        continue

                    if not response_json.get("ok"):
                        # Drop the cached hash so the next attempt re-runs the handshake.
                        _global_api_hash = None
                        continue

                    html_fragment = response_json.get("html", "")

                    # Interpretation of the HTML is delegated to the parser module.
                    return parse_html(html_fragment, str(resp.url), word)

        except Exception as e:
            log.error(f"Internal endpoint transaction pipeline failure for '{word}': {str(e)}")
            await asyncio.sleep(1.5 * attempt)

    return "ERROR"