Spaces:
Paused
Paused
| import asyncio | |
| import random | |
| import re | |
| import time | |
| from curl_cffi.requests import AsyncSession | |
| from logger import get_logger | |
| from parser import parse_html | |
| log = get_logger() | |
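| # Shared cooldown deadline (time.time() epoch seconds): while the clock is before this value every check_fragment call waits, so one throttled response slows all concurrent workers on the same IP | |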
| _rate_limited_until = 0.0 | |
async def check_fragment(word: str, proxy_url: str = None) -> str:
    """
    Fetch the public Fragment page for a username and classify it.

    The username is normalised (surrounding whitespace stripped, "@" removed,
    lower-cased) and requested from ``https://fragment.com/username/<word>``
    through a ``curl_cffi`` ``AsyncSession`` impersonating Chrome 120 with
    ``timeout=10``. The resulting HTML is handed to ``parse_html`` together
    with the final (post-redirect) URL and the normalised username.

    Up to four attempts are made. Between attempts the function honours the
    module-level ``_rate_limited_until`` deadline, which it also extends when
    it sees an HTTP 403/429 response or a Cloudflare challenge page, so that
    every concurrent caller slows down together.

    Args:
        word: Username to look up; a leading "@" and surrounding whitespace
            are tolerated.
        proxy_url: Optional proxy URL. When given it is used for both HTTP
            and HTTPS traffic; when ``None`` (or empty) the request is sent
            directly.

    Returns:
        Whatever ``parse_html`` returns for a successfully fetched page, or
        the string ``"ERROR"`` when all attempts fail.
    """
    global _rate_limited_until

    max_attempts = 4
    word = word.strip().replace("@", "").lower()
    url = f"https://fragment.com/username/{word}"

    # Honour the caller-supplied proxy; without this the parameter was
    # accepted but every request still left from the local IP.
    request_kwargs = {"allow_redirects": True}
    if proxy_url:
        request_kwargs["proxies"] = {"http": proxy_url, "https": proxy_url}

    for attempt in range(1, max_attempts + 1):
        # Wait out the shared cooldown. Re-check after each sleep because
        # another task may have pushed the deadline further while we slept.
        while True:
            remaining = _rate_limited_until - time.time()
            if remaining <= 0:
                break
            wait_duration = remaining + random.uniform(1.0, 3.0)
            log.info(f"⏳ Thread waiting layout triggered for '{word}'. Dynamic sleep: {wait_duration:.2f}s")
            await asyncio.sleep(wait_duration)

        # Random pause so requests are not fired at a fixed cadence.
        jitter = random.uniform(0.5, 2.5)
        await asyncio.sleep(jitter)

        try:
            log.info(f"🛰️ Requesting public DOM data grid for: @{word} [Attempt {attempt}/{max_attempts}]")
            # A fresh session per attempt with a short timeout, so a blocked
            # connection is dropped rather than reused.
            async with AsyncSession(impersonate="chrome120", timeout=10) as session:
                resp = await session.get(url, **request_kwargs)

            final_url = str(resp.url)

            # Explicit throttling / firewall block: extend the shared cooldown.
            if resp.status_code in [429, 403]:
                backoff = 15 + (5 * attempt) + random.uniform(2.0, 5.0)
                # max() keeps a longer cooldown set by a concurrent task from
                # being shortened by this one.
                _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
                log.warning(f"⚠️ Single IP Throttled (HTTP {resp.status_code}) on '{word}'. Cooling pool for {backoff:.1f}s...")
                continue

            if resp.status_code != 200:
                log.warning(f"⚠️ Non-200 Response status encountered for '{word}': HTTP {resp.status_code}")
                continue

            html = resp.text

            # Cloudflare challenge pages come back as HTTP 200, so detect them
            # by content before parsing.
            # NOTE(review): the bare "cloudflare" substring match is broad and
            # would also fire on a normal page that merely references a
            # Cloudflare-hosted asset - confirm this cannot happen here.
            if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
                backoff = 30 + (10 * attempt) + random.uniform(5.0, 10.0)
                _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
                log.error(f"❌ Cloudflare Challenge Intercepted on '{word}'. Slowing workers down for {backoff:.1f}s...")
                continue

            # Hand the page to the shared parser for classification.
            decision = parse_html(html, final_url, word)
            log.info(f"🎯 Resilient Parse Map Complete for '{word}' -> Outcome Classification: {decision}")
            return decision

        except Exception as e:
            log.error(f"Network transaction fault for '{word}' during connection lookup: {str(e)}")
            # Linear backoff before the next attempt; there is nothing to
            # wait for once the final attempt has failed.
            if attempt < max_attempts:
                await asyncio.sleep(2.0 * attempt)

    return "ERROR"