# Dvs / scraper.py
# Muttered3's picture
# Update scraper.py (#2)
# fe67d53
import asyncio
import random
import re
import time
from curl_cffi.requests import AsyncSession
from logger import get_logger
from parser import parse_html
# Module-wide logger used by check_fragment for progress and failure messages.
log = get_logger()
# Shared cooldown deadline, expressed in time.time() seconds, used by every
# check_fragment call in this process. check_fragment pushes it into the
# future after an HTTP 429/403 or a Cloudflare challenge, and sleeps until it
# has passed before sending a request, so throttling observed by one call
# slows down all of them. 0.0 means "no cooldown active".
_rate_limited_until = 0.0
async def check_fragment(word: str, proxy_url: str = None) -> str:
    """
    Fetch the fragment.com page for a username and classify it with parse_html.

    The username is normalised (whitespace stripped, "@" removed, lower-cased)
    and requested through a curl_cffi AsyncSession impersonating Chrome 120.
    Up to four attempts are made. Every attempt first honours the module-level
    cooldown ``_rate_limited_until`` and then sleeps a random 0.5-2.5 s jitter.

    Args:
        word: Username to look up, with or without a leading "@".
        proxy_url: Accepted for interface compatibility but not used by this
            implementation; it is never passed to the HTTP session.

    Returns:
        The value returned by ``parse_html(html, final_url, word)`` for the
        first attempt that yields an HTTP 200 page that is not detected as a
        Cloudflare challenge, or the string "ERROR" once all attempts fail.

    Side effects:
        Extends the shared ``_rate_limited_until`` deadline when the server
        answers 429/403 or serves a Cloudflare challenge page.
    """
    global _rate_limited_until

    max_attempts = 4
    word = word.strip().replace("@", "").lower()
    url = f"https://fragment.com/username/{word}"

    for attempt in range(1, max_attempts + 1):
        # Wait out the shared cooldown. The deadline is re-read after every
        # sleep because another call may have extended it in the meantime.
        while True:
            current_time = time.time()
            if current_time >= _rate_limited_until:
                break
            wait_duration = _rate_limited_until - current_time + random.uniform(1.0, 3.0)
            log.info(f"⏳ Thread waiting layout triggered for '{word}'. Dynamic sleep: {wait_duration:.2f}s")
            await asyncio.sleep(wait_duration)

        # Random pause so consecutive requests are not evenly spaced.
        jitter = random.uniform(0.5, 2.5)
        await asyncio.sleep(jitter)

        try:
            log.info(f"🛰️ Requesting public DOM data grid for: @{word} [Attempt {attempt}/{max_attempts}]")
            # A fresh session per attempt with a 10-second timeout, so a
            # stalled connection is abandoned instead of blocking the retry.
            async with AsyncSession(impersonate="chrome120", timeout=10) as session:
                resp = await session.get(url, allow_redirects=True)
                final_url = str(resp.url)

                # Explicit throttling / firewall block: back off everyone.
                if resp.status_code in [429, 403]:
                    backoff = 15 + (5 * attempt) + random.uniform(2.0, 5.0)
                    # max() keeps a longer cooldown set by another call from
                    # being shortened by this (shorter) one.
                    _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
                    log.warning(f"⚠️ Single IP Throttled (HTTP {resp.status_code}) on '{word}'. Cooling pool for {backoff:.1f}s...")
                    continue

                # Any other non-200 status is retried without a shared cooldown.
                if resp.status_code != 200:
                    log.warning(f"⚠️ Non-200 Response status encountered for '{word}': HTTP {resp.status_code}")
                    continue

                html = resp.text

                # Detect a Cloudflare challenge page before handing it to the parser.
                # NOTE(review): the last test matches any page that merely
                # mentions "cloudflare" (e.g. a CDN script URL), which could
                # flag ordinary pages as challenges — confirm this is intended.
                if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
                    backoff = 30 + (10 * attempt) + random.uniform(5.0, 10.0)
                    _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
                    log.error(f"❌ Cloudflare Challenge Intercepted on '{word}'. Slowing workers down for {backoff:.1f}s...")
                    continue

                # Usable page: classification is delegated to the parser module.
                decision = parse_html(html, final_url, word)
                log.info(f"🎯 Resilient Parse Map Complete for '{word}' -> Outcome Classification: {decision}")
                return decision

        except Exception as e:
            # Best-effort retry: request failures are logged and retried
            # rather than propagated to the caller.
            log.error(f"Network transaction fault for '{word}' during connection lookup: {str(e)}")
            # Linear backoff between attempts; pointless after the last one,
            # where it would only delay returning "ERROR".
            if attempt < max_attempts:
                await asyncio.sleep(2.0 * attempt)

    return "ERROR"