Spaces:

Muttered3
/

Dvs

Paused

App Files Community

Update scraper.py

by UNUSUALxd - opened Jun 1

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+40

-55

Files changed (1) hide show

scraper.py +40 -55

scraper.py CHANGED Viewed

@@ -2,88 +2,73 @@ import asyncio
 import random
 import re
 import time
-import aiohttp
 from logger import get_logger
-from state import state
 from parser import parse_html
 log = get_logger()
-# Desktop browser User-Agent strings; check_fragment picks one at random for
-# the request headers of each attempt.
-USER_AGENTS = [
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
-]
-# time.time() timestamp until which check_fragment sleeps before sending a
-# request. It is pushed forward after an HTTP 429/403 response or a detected
-# Cloudflare challenge page; 0.0 means no cooldown is active.
 _rate_limited_until = 0.0
 async def check_fragment(word: str, proxy_url: str = None) -> str:
     """
-    Fetch the public fragment.com page for a username with aiohttp and classify it.
-
-    The word is normalised (whitespace stripped, every "@" removed, lower-cased)
-    and requested from https://fragment.com/username/<word>, following
-    redirects. Up to four attempts are made, each with a randomly chosen
-    User-Agent and a freshly created ClientSession.
-
-    Args:
-        word: Username to look up.
-        proxy_url: Accepted but never read by this implementation.
-
-    Returns:
-        The value of parse_html(html, final_url, word) for the first HTTP 200
-        response that is not treated as a Cloudflare challenge, or the string
-        "ERROR" when all four attempts fail.
-
-    Exception subclasses raised inside an attempt (including by parse_html,
-    which is called inside the try block) are logged and followed by a
-    1.5 * attempt second sleep instead of propagating.
     """
     global _rate_limited_until
     word = word.strip().replace("@", "").lower()
     url = f"https://fragment.com/username/{word}"
-    # range(1, 5) yields attempts 1-4; the attempt number also scales the
-    # backoff delays computed below.
     for attempt in range(1, 5):
         current_time = time.time()
-        # If a cooldown deadline is still in the future, sleep until it passes
-        # plus a random 0.5-1.5s so the next request does not land exactly on it.
         if current_time < _rate_limited_until:
-            await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.5, 1.5))
-        headers = {
-            "User-Agent": random.choice(USER_AGENTS),
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
-            "Accept-Language": "en-US,en;q=0.5",
-            "Referer": "https://fragment.com/",
-            "Cache-Control": "no-cache",
-            "Pragma": "no-cache"
-        }
         try:
-            # 12s overall budget per request, of which at most 4s for connecting.
-            timeout = aiohttp.ClientTimeout(total=12, connect=4)
-            async with aiohttp.ClientSession(timeout=timeout) as session:
-                # Redirects are followed; the final URL is handed to parse_html below.
-                async with session.get(url, headers=headers, allow_redirects=True) as resp:
-                    status = resp.status
-                    if status in [429, 403]:
-                        # Cooldown grows with the attempt number: 5 + 3**attempt
-                        # seconds plus 1-3s of random jitter.
-                        backoff = 5 + (3 ** attempt) + random.uniform(1.0, 3.0)
-                        _rate_limited_until = time.time() + backoff
-                        log.warning(f"⚠️ Single IP throttled (HTTP {status}) on '{word}'. Cooling pool for {backoff:.1f}s...")
-                        continue
-                    if status != 200:
-                        # Any other non-200 status: short fixed pause, then retry.
-                        await asyncio.sleep(1.0)
-                        continue
-                    html = await resp.text()
-                    # Treat the page as a Cloudflare challenge if it contains one of
-                    # these markers. NOTE(review): the case-insensitive "cloudflare"
-                    # test matches any page that merely mentions the word.
-                    if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
-                        backoff = 15 + random.uniform(2.0, 5.0)
-                        _rate_limited_until = time.time() + backoff
-                        log.error("❌ Cloudflare Browser Challenge engaged. Slowing worker loops down...")
-                        continue
-                    # URL after redirects, passed to the parser alongside the HTML.
-                    final_url = str(resp.url)
-                    # Classification is delegated entirely to parse_html.
-                    decision = parse_html(html, final_url, word)
-                    return decision
         except Exception as e:
-            log.error(f"Network pipe transaction failure for '{word}': {str(e)}")
-            # NOTE(review): this sleep also runs after the fourth failure, just
-            # before "ERROR" is returned.
-            await asyncio.sleep(1.5 * attempt)
     return "ERROR"

 import random
 import re
 import time
+from curl_cffi.requests import AsyncSession
 from logger import get_logger
 from parser import parse_html
 log = get_logger()
+# time.time() timestamp until which check_fragment sleeps before sending a
+# request. It is a module-level global, so every check_fragment call in this
+# process reads and updates the same deadline; 0.0 means no cooldown is active.
 _rate_limited_until = 0.0
 async def check_fragment(word: str, proxy_url: str = None) -> str:
     """
+    Fetch the public fragment.com page for a username and classify it.
+
+    The word is normalised (whitespace stripped, every "@" removed, lower-cased)
+    and requested from https://fragment.com/username/<word> through a
+    curl_cffi AsyncSession created with impersonate="chrome120". Up to four
+    attempts are made. Every attempt is preceded by a random 0.5-2.5s pause,
+    plus a longer wait while the module-level _rate_limited_until deadline is
+    still in the future.
+
+    Args:
+        word: Username to look up.
+        proxy_url: Accepted but never read by this implementation.
+
+    Returns:
+        The value of parse_html(html, final_url, word) for the first HTTP 200
+        response that is not treated as a Cloudflare challenge, or the string
+        "ERROR" when all four attempts fail.
+
+    Side effects:
+        HTTP 429/403 responses and detected Cloudflare challenge pages move
+        _rate_limited_until into the future, which delays every later attempt
+        that consults it. Exception subclasses raised inside an attempt
+        (including by parse_html, which is called inside the try block) are
+        logged and retried instead of propagating.
     """
     global _rate_limited_until
     word = word.strip().replace("@", "").lower()
     url = f"https://fragment.com/username/{word}"
+    # range(1, 5) yields attempts 1-4; the attempt number also scales the
+    # backoff delays computed below.
     for attempt in range(1, 5):
         current_time = time.time()
+        # If a cooldown deadline is still in the future, sleep until it passes
+        # plus a random 1-3s. The deadline is checked once per attempt and is
+        # not re-read after the sleep.
         if current_time < _rate_limited_until:
+            wait_duration = _rate_limited_until - current_time + random.uniform(1.0, 3.0)
+            log.info(f"⏳ Thread waiting layout triggered for '{word}'. Dynamic sleep: {wait_duration:.2f}s")
+            await asyncio.sleep(wait_duration)
+        # Unconditional random 0.5-2.5s pause before every attempt, including
+        # the first one and any attempt that already waited out a cooldown.
+        jitter = random.uniform(0.5, 2.5)
+        await asyncio.sleep(jitter)
         try:
+            log.info(f"🛰️ Requesting public DOM data grid for: @{word} [Attempt {attempt}/4]")
+            # A new session with timeout=10 is opened for each attempt and closed
+            # when the async with block exits.
+            async with AsyncSession(impersonate="chrome120", timeout=10) as session:
+                resp = await session.get(url, allow_redirects=True)
+                # URL after redirects; only used if the response reaches parse_html.
+                final_url = str(resp.url)
+                # 429/403: set a shared cooldown of 15 + 5 * attempt seconds plus
+                # 2-5s of random jitter, then retry.
+                if resp.status_code in [429, 403]:
+                    backoff = 15 + (5 * attempt) + random.uniform(2.0, 5.0)
+                    _rate_limited_until = time.time() + backoff
+                    log.warning(f"⚠️ Single IP Throttled (HTTP {resp.status_code}) on '{word}'. Cooling pool for {backoff:.1f}s...")
+                    continue
+                # Any other non-200 status is retried with no extra delay beyond
+                # the jitter at the top of the next attempt.
+                if resp.status_code != 200:
+                    log.warning(f"⚠️ Non-200 Response status encountered for '{word}': HTTP {resp.status_code}")
+                    continue
+                html = resp.text
+                # Treat the page as a Cloudflare challenge if it contains one of
+                # these markers, and set a longer shared cooldown (30 + 10 * attempt
+                # seconds plus 5-10s of jitter). NOTE(review): the case-insensitive
+                # "cloudflare" test matches any page that merely mentions the word.
+                if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
+                    backoff = 30 + (10 * attempt) + random.uniform(5.0, 10.0)
+                    _rate_limited_until = time.time() + backoff
+                    log.error(f"❌ Cloudflare Challenge Intercepted on '{word}'. Slowing workers down for {backoff:.1f}s...")
+                    continue
+                # Classification is delegated entirely to parse_html; its result is
+                # logged and returned unchanged.
+                decision = parse_html(html, final_url, word)
+                log.info(f"🎯 Resilient Parse Map Complete for '{word}' -> Outcome Classification: {decision}")
+                return decision
         except Exception as e:
+            log.error(f"Network transaction fault for '{word}' during connection lookup: {str(e)}")
+            # NOTE(review): this sleep also runs after the fourth failure, so the
+            # caller waits 8s before "ERROR" is returned.
+            await asyncio.sleep(2.0 * attempt)
     return "ERROR"