Muttered3 committed on
Commit
8309889
·
verified ·
1 Parent(s): 2328e00

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +81 -132
scraper.py CHANGED
@@ -1,175 +1,124 @@
1
  import asyncio
2
  import random
3
  import re
4
- import time
5
  import aiohttp
6
  from logger import get_logger
7
- from state import state
8
 
9
  log = get_logger()
10
 
 
11
  USER_AGENTS = [
12
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
13
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
14
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36"
15
  ]
16
 
17
- # Thread-safe global synchronization gates
18
- _global_api_hash = None
19
- _global_cookies = None
20
- _hash_fetched_at = 0.0
21
- _hash_lock = asyncio.Lock()
22
  _rate_limited_until = 0.0
23
 
24
- async def _get_valid_api_context(proxy_url: str = None) -> tuple:
25
  """
26
- Thread-Safe Concurrency Gatekeeper: Prevents simultaneous worker requests from
27
- clashing on the token endpoint, handling cookie contexts seamlessly.
28
  """
29
- global _global_api_hash, _global_cookies, _hash_fetched_at
30
-
31
- # 1. Check if an active worker already safely grabbed a valid token while we were waiting
32
- if _global_api_hash and _global_cookies and (time.time() - _hash_fetched_at < 900):
33
- return _global_api_hash, _global_cookies
34
-
35
- # 2. Gatekeeper Lock: Forces workers 1-9 to wait until worker 0 finishes its execution
36
- async with _hash_lock:
37
- # Double-check inside the lock to see if worker 0 just finished writing it
38
- now = time.time()
39
- if _global_api_hash and _global_cookies and (now - _hash_fetched_at < 900):
40
- return _global_api_hash, _global_cookies
41
-
42
- # Allocate active proxy cleanly
43
- active_proxy = proxy_url
44
- if not active_proxy and state.proxies:
45
- active_proxy = random.choice(state.proxies)
46
-
47
- headers = {
48
- "User-Agent": random.choice(USER_AGENTS),
49
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
50
- "Accept-Language": "en-US,en;q=0.5",
51
- }
52
-
53
- # Stagger/Jitter handshake layer requests to prevent simultaneous pool blocking
54
- await asyncio.sleep(random.uniform(0.5, 2.0))
55
-
56
- try:
57
- timeout = aiohttp.ClientTimeout(total=12)
58
- async with aiohttp.ClientSession(timeout=timeout) as session:
59
- async with session.get("https://fragment.com/", headers=headers, proxy=active_proxy, allow_redirects=True) as resp:
60
- html = await resp.text()
61
-
62
- match = (
63
- re.search(r'"hash"\s*:\s*"([a-f0-9]+)"', html, re.IGNORECASE) or
64
- re.search(r'hash\s*=\s*["\']([a-f0-9]+)["\']', html, re.IGNORECASE) or
65
- re.search(r'href="/api\?hash=([a-f0-9]+)"', html, re.IGNORECASE)
66
- )
67
-
68
- if match:
69
- _global_api_hash = match.group(1)
70
- _global_cookies = session.cookie_jar.filter_cookies(resp.url)
71
- _hash_fetched_at = now
72
- log.info(f"🌐 [Gatekeeper Match] Token Stabilized: {_global_api_hash} (via {active_proxy.split('@')[-1] if '@' in active_proxy else active_proxy})")
73
- return _global_api_hash, _global_cookies
74
- else:
75
- log.error("❌ Handshake parse failure. Falling back to previous cache state.")
76
- except Exception as e:
77
- log.error(f"Handshake network failure: {str(e)}")
78
-
79
- return (_global_api_hash if _global_api_hash else "FAILED_TOKEN"), _global_cookies
80
-
81
- async def check_fragment(word: str, proxy_url: str = None) -> str:
82
  global _rate_limited_until
83
  word = word.strip().replace("@", "").lower()
 
84
 
 
85
  for attempt in range(1, 5):
86
- current_time = time.time()
 
 
87
  if current_time < _rate_limited_until:
88
  await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.5, 1.5))
89
 
90
- api_hash, session_cookies = await _get_valid_api_context(proxy_url)
91
- if api_hash == "FAILED_TOKEN":
92
- # If token cannot be retrieved, wait a bit before retrying the loop
93
- await asyncio.sleep(2.0)
94
- return "ERROR"
 
95
 
96
  try:
97
- headers = {
98
- "User-Agent": random.choice(USER_AGENTS),
99
- "X-Requested-With": "XMLHttpRequest",
100
- "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
101
- "Origin": "https://fragment.com",
102
- "Referer": "https://fragment.com/"
103
- }
104
-
105
- payload = {
106
- "query": word,
107
- "type": "usernames",
108
- "method": "searchAuctions"
109
- }
110
-
111
- timeout = aiohttp.ClientTimeout(total=12)
112
- async with aiohttp.ClientSession(timeout=timeout, cookie_jar=aiohttp.DummyCookieJar()) as session:
113
- if session_cookies:
114
- session.cookie_jar.update_cookies(session_cookies)
115
-
116
- async with session.post(f"https://fragment.com/api?hash={api_hash}", data=payload, headers=headers, proxy=proxy_url) as resp:
117
 
 
118
  if resp.status in [429, 403]:
119
- backoff = 4 + (2 ** attempt) + random.uniform(1.0, 2.5)
120
- _rate_limited_until = time.time() + backoff
 
121
  continue
122
 
123
  if resp.status != 200:
124
  continue
125
 
126
- try:
127
- response_json = await resp.json()
128
- except ValueError:
129
- backoff = 5 + random.uniform(1.0, 3.0)
130
- _rate_limited_until = time.time() + backoff
131
- continue
132
 
133
- if isinstance(response_json, list):
134
- backoff = 6 + (2 ** attempt) + random.uniform(1.5, 3.5)
135
- _rate_limited_until = time.time() + backoff
136
- log.warning(f"πŸ›‘ Array RateLimit Intercept. Pacing pool for {backoff:.2f}s...")
137
- continue
138
-
139
- if not response_json.get("ok"):
140
- global _global_api_hash
141
- _global_api_hash = None # Reset register immediately to force a single fresh update
142
  continue
143
 
144
- html_fragment = response_json.get("html", "")
145
- if not html_fragment.strip():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  return "AVAILABLE"
147
 
148
- clean_html = re.sub(r'\s+', ' ', html_fragment)
149
-
150
- search_regex = re.compile(rf'>@{word}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<', re.IGNORECASE)
151
- match = search_regex.search(clean_html)
152
-
153
- if match:
154
- s = match.group(1).strip().lower()
155
- if "auction" in s or "bidding" in s: return "ON_AUCTION"
156
- if "sold" in s: return "SOLD"
157
- if "unavailable" in s: return "UNAVAILABLE"
158
- if "taken" in s or "offer" in s: return "TAKEN"
159
- if "sale" in s or "purchase" in s: return "FOR_SALE"
160
- return s.upper()
161
-
162
- fallback_text = clean_html.lower()
163
- if "on auction" in fallback_text: return "ON_AUCTION"
164
- if "sold" in fallback_text: return "SOLD"
165
- if "unavailable" in fallback_text: return "UNAVAILABLE"
166
- if "taken" in fallback_text or "make an offer" in fallback_text: return "TAKEN"
167
- if "for sale" in fallback_text: return "FOR_SALE"
168
-
169
- return "AVAILABLE"
 
 
 
170
 
171
  except Exception as e:
172
- log.error(f"Internal API transaction fault for '{word}': {str(e)}")
173
  await asyncio.sleep(1.0 * attempt)
174
 
175
  return "ERROR"
 
1
  import asyncio
2
  import random
3
  import re
 
4
  import aiohttp
5
  from logger import get_logger
 
6
 
7
  log = get_logger()
8
 
9
+ # Stabilized native browser headers to simulate real organic search traffic
10
  USER_AGENTS = [
11
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
12
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
13
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
14
  ]
15
 
16
+ # Shared pacing registry to slow down the single IP footprint
 
 
 
 
17
  _rate_limited_until = 0.0
18
 
19
async def check_fragment(word: str, proxy_url: str = None) -> str:
    """
    Look up the Fragment status of a username by scraping its public page.

    Requests ``https://fragment.com/username/<word>`` directly (no API token)
    and classifies the returned HTML. Up to four attempts are made; HTTP
    429/403 responses and Cloudflare challenge pages push back a module-wide
    "rate limited until" timestamp that every subsequent attempt waits on.

    Args:
        word: Username to check. Surrounding whitespace and any "@" are
            removed and the result is lower-cased before the lookup.
        proxy_url: Accepted for backward compatibility with existing callers;
            this implementation does not route traffic through it.

    Returns:
        "AVAILABLE", "TAKEN", "SOLD", "ON_AUCTION", "FOR_SALE" or
        "UNAVAILABLE"; the upper-cased raw status label when it matches none
        of the known keywords; or "ERROR" when every attempt fails.
    """
    global _rate_limited_until
    word = word.strip().replace("@", "").lower()
    url = f"https://fragment.com/username/{word}"

    # This module does not import `time`, so the event-loop clock is the one
    # clock in use; read it through a single callable instead of probing
    # globals() on every call site.
    clock = asyncio.get_running_loop().time

    # Compiled once per call rather than once per attempt. The username is
    # escaped so that stray regex metacharacters in the input cannot corrupt
    # the pattern.
    profile_status_re = re.compile(
        r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>',
        re.IGNORECASE,
    )
    search_status_re = re.compile(
        rf'>@{re.escape(word)}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<',
        re.IGNORECASE,
    )

    # 4 attempts with progressive pacing windows
    for attempt in range(1, 5):
        now = clock()

        # Honour the shared cool-down before touching the network again.
        if now < _rate_limited_until:
            await asyncio.sleep(_rate_limited_until - now + random.uniform(0.5, 1.5))

        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://fragment.com/"
        }

        try:
            # Short timeouts keep a stalled socket from blocking the worker.
            timeout = aiohttp.ClientTimeout(total=8, connect=3)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                # Redirects are followed so a bounce to the search page can be
                # handled by the search-table branch below.
                async with session.get(url, headers=headers, allow_redirects=True) as resp:

                    # Throttled / blocked: extend the shared cool-down and retry.
                    if resp.status in [429, 403]:
                        backoff = 5 + (3 ** attempt) + random.uniform(1.0, 3.0)
                        _rate_limited_until = clock() + backoff
                        log.warning(f"⚠️ Datacenter IP Throttled (HTTP {resp.status}) on '{word}'. Cooling pool for {backoff:.1f}s...")
                        continue

                    if resp.status != 200:
                        continue

                    html = await resp.text()
                    html_lower = html.lower()

                    # Cloudflare challenge page instead of real content.
                    # NOTE(review): the bare "cloudflare" substring also matches
                    # any page that merely references a Cloudflare-hosted asset;
                    # confirm this does not reject legitimate responses.
                    if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html_lower:
                        backoff = 10 + random.uniform(2.0, 5.0)
                        _rate_limited_until = clock() + backoff
                        log.error("❌ Cloudflare Intercept Encountered. Slowing down engine loops...")
                        continue

                    final_url = str(resp.url)

                    # ---- Engine 1: direct profile page ----
                    if "/username/" in final_url:
                        status_match = profile_status_re.search(html)
                        if status_match:
                            s = status_match.group(1).strip().lower()
                            if "sold" in s: return "SOLD"
                            if "taken" in s: return "TAKEN"
                            if "auction" in s: return "ON_AUCTION"
                            # "unavailable" contains "available", so it must be
                            # tested first or it is reported as AVAILABLE.
                            if "unavailable" in s: return "UNAVAILABLE"
                            if "available" in s: return "AVAILABLE"
                            if "sale" in s or "purchase" in s: return "FOR_SALE"
                            return s.upper()

                        # No status label found: fall back to CSS class markers.
                        if 'class="tm-status-taken"' in html:
                            return "TAKEN"
                        if 'class="tm-status-unavail"' in html or 'tm-section-header-status tm-status-unavail' in html:
                            if "sold for" in html_lower or "tm-username-usable" in html_lower or "recently sold" in html_lower:
                                return "SOLD"
                            return "UNAVAILABLE"

                        return "AVAILABLE"

                    # ---- Engine 2: redirected search lookup fallback ----
                    else:
                        clean_html = re.sub(r'\s+', ' ', html)
                        match = search_status_re.search(clean_html)

                        if match:
                            s = match.group(1).strip().lower()
                            if "taken" in s: return "TAKEN"
                            # Same substring trap as above: check the longer
                            # keyword before the shorter one it contains.
                            if "unavailable" in s: return "UNAVAILABLE"
                            if "available" in s: return "AVAILABLE"
                            if "sold" in s: return "SOLD"
                            if "auction" in s: return "ON_AUCTION"
                            return s.upper()

                        # Failsafe textual scan when the row structure is not found.
                        fallback_text = clean_html.lower()
                        if "on auction" in fallback_text: return "ON_AUCTION"
                        if "sold" in fallback_text: return "SOLD"
                        if "unavailable" in fallback_text: return "UNAVAILABLE"
                        if "taken" in fallback_text or "make an offer" in fallback_text: return "TAKEN"

                        return "UNAVAILABLE"

        except Exception as e:
            # Best-effort scraper: log, back off linearly, and try again.
            log.error(f"Network processing bottleneck for '{word}': {str(e)}")
            await asyncio.sleep(1.0 * attempt)

    return "ERROR"