import asyncio
import random
import re
import time
from curl_cffi.requests import AsyncSession
from logger import get_logger
from parser import parse_html
# Module-wide logger obtained from the project's `logger` helper.
log = get_logger()
# Cool-down deadline as a time.time() timestamp (seconds). It is module-level
# state, so every check_fragment() call in this process reads and overwrites the
# same value: once one request is throttled or challenged, all later attempts
# wait until this moment has passed before contacting the site again.
_rate_limited_until = 0.0
async def check_fragment(word: str, proxy_url: str = None) -> str:
    """
    Fetch the public Fragment page for a username and classify it.

    The word is normalised (surrounding whitespace stripped, every "@"
    removed, lower-cased) and requested from
    ``https://fragment.com/username/<word>`` through a curl_cffi session that
    impersonates Chrome 120. Up to four attempts are made. Each attempt first
    honours the module-wide cool-down deadline ``_rate_limited_until`` and then
    sleeps a random 0.5-2.5 s before connecting.

    An HTTP 429/403 response or a page that looks like a Cloudflare challenge
    pushes the shared cool-down deadline forward and triggers a retry. Any
    other non-200 status triggers a retry without touching the deadline. Any
    exception raised while fetching or parsing is logged and followed by a
    linear back-off.

    Args:
        word: Username to look up; may include "@" and surrounding whitespace.
        proxy_url: Optional proxy URL. When provided it is applied to both
            HTTP and HTTPS traffic of the session; when omitted or empty the
            connection is made directly.

    Returns:
        The value returned by ``parse_html(html, final_url, word)`` for the
        first successfully fetched page, or the literal string ``"ERROR"`` when
        every attempt fails.
    """
    global _rate_limited_until

    max_attempts = 4
    word = word.strip().replace("@", "").lower()
    url = f"https://fragment.com/username/{word}"

    # Session options are identical for every attempt, so build them once.
    # A short timeout makes blocked connections fail quickly instead of hanging.
    session_kwargs = {"impersonate": "chrome120", "timeout": 10}
    if proxy_url:
        # Previously the proxy argument was accepted but never used, so all
        # traffic left from the caller's own IP regardless of this setting.
        session_kwargs["proxies"] = {"http": proxy_url, "https": proxy_url}

    for attempt in range(1, max_attempts + 1):
        # Respect a cool-down window set by this or any other concurrent call.
        current_time = time.time()
        if current_time < _rate_limited_until:
            wait_duration = _rate_limited_until - current_time + random.uniform(1.0, 3.0)
            log.info(f"⏳ Thread waiting layout triggered for '{word}'. Dynamic sleep: {wait_duration:.2f}s")
            await asyncio.sleep(wait_duration)

        # Random pause so consecutive requests are not evenly spaced.
        jitter = random.uniform(0.5, 2.5)
        await asyncio.sleep(jitter)

        try:
            log.info(f"🛰️ Requesting public DOM data grid for: @{word} [Attempt {attempt}/{max_attempts}]")
            async with AsyncSession(**session_kwargs) as session:
                resp = await session.get(url, allow_redirects=True)
                final_url = str(resp.url)

                # Explicit throttling / firewall block: cool everything down.
                if resp.status_code in (429, 403):
                    backoff = 15 + (5 * attempt) + random.uniform(2.0, 5.0)
                    _rate_limited_until = time.time() + backoff
                    log.warning(f"⚠️ Single IP Throttled (HTTP {resp.status_code}) on '{word}'. Cooling pool for {backoff:.1f}s...")
                    continue

                if resp.status_code != 200:
                    log.warning(f"⚠️ Non-200 Response status encountered for '{word}': HTTP {resp.status_code}")
                    continue

                html = resp.text

                # A 200 response can still be a Cloudflare interstitial; catch
                # it before handing the markup to the parser.
                # NOTE(review): the bare "cloudflare" substring also matches any
                # ordinary page that merely mentions Cloudflare (e.g. a CDN
                # script URL) - confirm this breadth is intended.
                if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
                    backoff = 30 + (10 * attempt) + random.uniform(5.0, 10.0)
                    _rate_limited_until = time.time() + backoff
                    log.error(f"❌ Cloudflare Challenge Intercepted on '{word}'. Slowing workers down for {backoff:.1f}s...")
                    continue

                # Classification itself is delegated to the parser module.
                decision = parse_html(html, final_url, word)
                log.info(f"🎯 Resilient Parse Map Complete for '{word}' -> Outcome Classification: {decision}")
                return decision
        except Exception as e:
            # Deliberate best-effort boundary: any fetch/parse failure is
            # logged and retried rather than propagated to the caller.
            log.error(f"Network transaction fault for '{word}' during connection lookup: {e}")
            # Linear back-off between retries; sleeping after the final attempt
            # would only delay the "ERROR" result.
            if attempt < max_attempts:
                await asyncio.sleep(2.0 * attempt)

    return "ERROR"