Files changed (1) hide show
  1. scraper.py +40 -55
scraper.py CHANGED
@@ -2,88 +2,73 @@ import asyncio
2
  import random
3
  import re
4
  import time
5
- import aiohttp
6
  from logger import get_logger
7
- from state import state
8
  from parser import parse_html
9
 
10
log = get_logger()

# Desktop browser User-Agent strings. check_fragment picks one at random
# for the "User-Agent" header of every request attempt.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
]

# Module-wide cooldown deadline, expressed as a time.time() timestamp.
# check_fragment sleeps until this moment before sending a request and moves
# it forward whenever the site throttles or challenges a request.
_rate_limited_until = 0.0
22
 
23
async def check_fragment(word: str, proxy_url: str = None) -> str:
    """
    Fetch the public Fragment page for a username and classify it.

    The word is normalised (surrounding whitespace stripped, "@" removed,
    lower-cased) and requested from ``https://fragment.com/username/<word>``
    with aiohttp, following redirects. Up to four attempts are made.
    Throttling responses (HTTP 429/403) and Cloudflare challenge pages push
    back the module-wide ``_rate_limited_until`` deadline, which every call
    waits out before sending its next request.

    Args:
        word: Username to look up, with or without "@".
        proxy_url: Accepted for call compatibility only. This implementation
            never reads it and always connects directly.

    Returns:
        The value produced by ``parse_html(html, final_url, word)`` for the
        first usable HTTP 200 page, or the string ``"ERROR"`` when every
        attempt fails.
    """
    global _rate_limited_until
    word = word.strip().replace("@", "").lower()
    url = f"https://fragment.com/username/{word}"

    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        # Wait out the shared cooldown. The deadline is re-read after every
        # sleep because another call may have extended it in the meantime.
        while True:
            remaining = _rate_limited_until - time.time()
            if remaining <= 0:
                break
            await asyncio.sleep(remaining + random.uniform(0.5, 1.5))

        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://fragment.com/",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
        }

        try:
            # 12 s overall budget per request, 4 s of it for connecting.
            timeout = aiohttp.ClientTimeout(total=12, connect=4)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=headers, allow_redirects=True) as resp:
                    status = resp.status

                    if status in (429, 403):
                        # Backoff grows as 3 ** attempt. max() keeps this
                        # call from shortening a longer cooldown that another
                        # call has already scheduled.
                        backoff = 5 + (3 ** attempt) + random.uniform(1.0, 3.0)
                        _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
                        log.warning(f"⚠️ Single IP throttled (HTTP {status}) on '{word}'. Cooling pool for {backoff:.1f}s...")
                        continue

                    if status != 200:
                        await asyncio.sleep(1.0)
                        continue

                    html = await resp.text()

                    # NOTE(review): the bare "cloudflare" substring also
                    # matches any ordinary page that merely mentions
                    # Cloudflare (e.g. an asset URL); confirm it cannot hit
                    # normal Fragment pages.
                    if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
                        backoff = 15 + random.uniform(2.0, 5.0)
                        _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
                        log.error("❌ Cloudflare Browser Challenge engaged. Slowing worker loops down...")
                        continue

                    # URL after redirects; handed to the parser together
                    # with the page body.
                    final_url = str(resp.url)
                    decision = parse_html(html, final_url, word)
                    return decision

        except Exception as e:
            # Any failure in the request or the parser is logged and retried
            # after a pause that grows with the attempt number.
            log.error(f"Network pipe transaction failure for '{word}': {str(e)}")
            await asyncio.sleep(1.5 * attempt)

    return "ERROR"
 
2
  import random
3
  import re
4
  import time
5
+ from curl_cffi.requests import AsyncSession
6
  from logger import get_logger
 
7
  from parser import parse_html
8
 
9
log = get_logger()

# Module-wide cooldown deadline, expressed as a time.time() timestamp.
# check_fragment sleeps until this moment before sending a request and moves
# it forward whenever the site throttles or challenges a request.
_rate_limited_until = 0.0
13
 
14
async def check_fragment(word: str, proxy_url: str = None) -> str:
    """
    Fetch the public Fragment page for a username and classify it.

    The word is normalised (surrounding whitespace stripped, "@" removed,
    lower-cased) and requested from ``https://fragment.com/username/<word>``
    through a curl_cffi ``AsyncSession`` created with
    ``impersonate="chrome120"``, following redirects. Up to four attempts are
    made, each preceded by a random 0.5-2.5 s pause. Throttling responses
    (HTTP 429/403) and Cloudflare challenge pages push back the module-wide
    ``_rate_limited_until`` deadline, which every call waits out before
    sending its next request.

    Args:
        word: Username to look up, with or without "@".
        proxy_url: Accepted for call compatibility only. This implementation
            never reads it and always connects directly.

    Returns:
        The value produced by ``parse_html(html, final_url, word)`` for the
        first usable HTTP 200 page, or the string ``"ERROR"`` when every
        attempt fails.
    """
    global _rate_limited_until
    word = word.strip().replace("@", "").lower()
    url = f"https://fragment.com/username/{word}"

    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        # Wait out the shared cooldown. The deadline is re-read after every
        # sleep because another call may have extended it in the meantime.
        while True:
            remaining = _rate_limited_until - time.time()
            if remaining <= 0:
                break
            wait_duration = remaining + random.uniform(1.0, 3.0)
            log.info(f"⏳ Thread waiting layout triggered for '{word}'. Dynamic sleep: {wait_duration:.2f}s")
            await asyncio.sleep(wait_duration)

        # Random pause before every attempt so requests are not evenly spaced.
        jitter = random.uniform(0.5, 2.5)
        await asyncio.sleep(jitter)

        try:
            log.info(f"🛰️ Requesting public DOM data grid for: @{word} [Attempt {attempt}/{max_attempts}]")

            async with AsyncSession(impersonate="chrome120", timeout=10) as session:
                resp = await session.get(url, allow_redirects=True)
                # URL after redirects; handed to the parser with the body.
                final_url = str(resp.url)

                if resp.status_code in (429, 403):
                    # max() keeps this call from shortening a longer cooldown
                    # that another call has already scheduled.
                    backoff = 15 + (5 * attempt) + random.uniform(2.0, 5.0)
                    _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
                    log.warning(f"⚠️ Single IP Throttled (HTTP {resp.status_code}) on '{word}'. Cooling pool for {backoff:.1f}s...")
                    continue

                if resp.status_code != 200:
                    log.warning(f"⚠️ Non-200 Response status encountered for '{word}': HTTP {resp.status_code}")
                    continue

                html = resp.text

                # NOTE(review): the bare "cloudflare" substring also matches
                # any ordinary page that merely mentions Cloudflare (e.g. an
                # asset URL); confirm it cannot hit normal Fragment pages.
                if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
                    backoff = 30 + (10 * attempt) + random.uniform(5.0, 10.0)
                    _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
                    log.error(f"❌ Cloudflare Challenge Intercepted on '{word}'. Slowing workers down for {backoff:.1f}s...")
                    continue

                decision = parse_html(html, final_url, word)
                log.info(f"🎯 Resilient Parse Map Complete for '{word}' -> Outcome Classification: {decision}")
                return decision

        except Exception as e:
            # Any failure in the request or the parser is logged and retried
            # after a pause that grows with the attempt number.
            log.error(f"Network transaction fault for '{word}' during connection lookup: {str(e)}")
            await asyncio.sleep(2.0 * attempt)

    return "ERROR"