Dvs / scraper.py
Muttered3's picture
Update scraper.py
ebcc930 verified
raw
history blame
3.89 kB
import asyncio
import random
import re
import time
import aiohttp
from logger import get_logger
from state import state
from parser import parse_html
# Module-level logger obtained from the project's `logger` helper; used below
# to report throttling, anti-bot challenges and request failures.
log = get_logger()
# Pool of desktop browser User-Agent strings. check_fragment() picks one at
# random for every request attempt so consecutive requests do not all carry
# the same browser signature.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
]

# Shared cooldown deadline, expressed as a time.time() timestamp. Requests
# wait until this moment has passed; 0.0 means no cooldown is in effect.
# It is module-global so that every check_fragment() call paces itself
# against the same deadline.
_rate_limited_until: float = 0.0
def _extend_cooldown(seconds: float) -> None:
    """Move the shared cooldown deadline to at least *seconds* from now.

    The deadline is only ever pushed later, so a short back-off requested by
    one coroutine cannot cut short a longer one already set by another.
    """
    global _rate_limited_until
    _rate_limited_until = max(_rate_limited_until, time.time() + seconds)


async def check_fragment(word: str, proxy_url: str = None) -> str:
    """Fetch the public fragment.com page for a username and classify it.

    The username is normalised (surrounding whitespace and every "@" removed,
    lower-cased), then ``https://fragment.com/username/<word>`` is requested
    up to four times. A successful page is handed to ``parse_html`` together
    with the final (post-redirect) URL and the normalised username.

    Args:
        word: Username to look up; may include a leading "@".
        proxy_url: Accepted for interface compatibility only. It is currently
            ignored: every request is sent directly, without a proxy.

    Returns:
        Whatever ``parse_html`` returns for the fetched page (presumably a
        decision string; confirm against the parser), or ``"ERROR"`` when the
        username is empty after normalisation or all attempts fail.
    """
    word = word.strip().replace("@", "").lower()
    if not word:
        # Nothing to look up: avoid spending four requests on the bare
        # ".../username/" URL and report the usual failure sentinel instead.
        return "ERROR"

    url = f"https://fragment.com/username/{word}"
    # Short connect timeout so dead sockets are dropped quickly; the timeout
    # object is the same for every attempt, so build it once.
    timeout = aiohttp.ClientTimeout(total=12, connect=4)

    # Up to 4 attempts, with back-off between them.
    for attempt in range(1, 5):
        # Honour the shared cooldown. Re-check after every sleep because
        # another coroutine may have pushed the deadline further out while
        # this one was waiting.
        while True:
            remaining = _rate_limited_until - time.time()
            if remaining <= 0:
                break
            await asyncio.sleep(remaining + random.uniform(0.5, 1.5))

        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://fragment.com/",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
        }

        try:
            # A fresh session per attempt means no cookies are carried over
            # between attempts, each of which uses a newly chosen User-Agent.
            async with aiohttp.ClientSession(timeout=timeout) as session:
                # Redirects are followed so the final URL can be passed to
                # the parser along with the page body.
                async with session.get(url, headers=headers, allow_redirects=True) as resp:
                    status = resp.status

                    if status in (429, 403):
                        # Throttled: back off exponentially with the attempt
                        # number, plus jitter, and pause all callers.
                        backoff = 5 + (3 ** attempt) + random.uniform(1.0, 3.0)
                        _extend_cooldown(backoff)
                        log.warning(f"⚠️ Single IP throttled (HTTP {status}) on '{word}'. Cooling pool for {backoff:.1f}s...")
                        continue

                    if status != 200:
                        await asyncio.sleep(1.0)
                        continue

                    html = await resp.text()

                    # Detect a Cloudflare anti-bot interstitial served with
                    # HTTP 200.
                    # NOTE(review): the case-insensitive "cloudflare" match is
                    # very broad and would also fire on a normal page that
                    # merely references Cloudflare assets. Confirm this is
                    # intended.
                    if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
                        backoff = 15 + random.uniform(2.0, 5.0)
                        _extend_cooldown(backoff)
                        log.error("❌ Cloudflare Browser Challenge engaged. Slowing worker loops down...")
                        continue

                    final_url = str(resp.url)
                    # Parsing stays inside the try block on purpose: a parser
                    # failure is treated like a failed attempt, not raised.
                    return parse_html(html, final_url, word)
        except Exception as e:
            # Deliberate best-effort boundary: any failure counts as a failed
            # attempt. The exception type is logged because some errors
            # (e.g. timeouts) have an empty message.
            log.error(f"Network pipe transaction failure for '{word}': {type(e).__name__}: {e}")
            await asyncio.sleep(1.5 * attempt)

    return "ERROR"