Spaces:

Muttered3
/

Dvs

Paused

App Files Files Community

Dvs / scraper.py

Muttered3

Update scraper.py

ebcc930 verified 10 days ago

raw

history blame

3.89 kB

	import asyncio
	import random
	import re
	import time
	import aiohttp
	from logger import get_logger
	from state import state
	from parser import parse_html

	# Module-wide logger obtained from the project's logger helper.
	log = get_logger()

	# Pool of desktop browser User-Agent strings; check_fragment picks one at
	# random for every request attempt.
	USER_AGENTS = [
	    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
	    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
	    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36",
	    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
	]

	# Shared cooldown deadline as a time.time() timestamp. check_fragment sleeps
	# until it has passed and moves it forward when it sees HTTP 429/403 or a
	# Cloudflare challenge page. 0.0 means no cooldown is active.
	_rate_limited_until = 0.0

	async def check_fragment(word: str, proxy_url: str = None) -> str:
	    """Fetch the public Fragment page for a username and hand it to ``parse_html``.

	    The username is normalised (trimmed, ``@`` removed, lower-cased) and
	    requested from ``https://fragment.com/username/<word>``. Up to four
	    attempts are made. Throttling responses (HTTP 429/403) and Cloudflare
	    challenge pages push back the module-wide ``_rate_limited_until``
	    deadline, which every attempt waits out before sending its request.

	    Args:
	        word: Username to look up, with or without ``@``.
	        proxy_url: Accepted for call compatibility only; this function never
	            reads it and always connects directly.

	    Returns:
	        Whatever ``parse_html(html, final_url, word)`` returns for the first
	        usable HTTP 200 page (presumably a decision string -- confirm against
	        the parser), or ``"ERROR"`` once every attempt has failed.
	    """
	    global _rate_limited_until

	    max_attempts = 4
	    word = word.strip().replace("@", "").lower()
	    url = f"https://fragment.com/username/{word}"

	    # Loop-invariant: 12s overall budget per request, 4s to establish the connection.
	    timeout = aiohttp.ClientTimeout(total=12, connect=4)

	    for attempt in range(1, max_attempts + 1):
	        # Nothing is retried after the final attempt, so sleeping there would
	        # only delay the "ERROR" result.
	        is_last_attempt = attempt == max_attempts

	        # Wait out the shared cooldown. Re-check after each sleep because
	        # another coroutine may have moved the deadline while this one slept.
	        while True:
	            remaining = _rate_limited_until - time.time()
	            if remaining <= 0:
	                break
	            await asyncio.sleep(remaining + random.uniform(0.5, 1.5))

	        headers = {
	            "User-Agent": random.choice(USER_AGENTS),
	            # The catch-all media range must be "*/*"; a bare "/" is not valid.
	            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
	            "Accept-Language": "en-US,en;q=0.5",
	            "Referer": "https://fragment.com/",
	            "Cache-Control": "no-cache",
	            "Pragma": "no-cache",
	        }

	        try:
	            async with aiohttp.ClientSession(timeout=timeout) as session:
	                # Redirects are followed so that resp.url is the final
	                # destination, which is passed to the parser below.
	                async with session.get(url, headers=headers, allow_redirects=True) as resp:
	                    status = resp.status

	                    if status in (429, 403):
	                        # The cooldown grows with 3 ** attempt. max() keeps a
	                        # longer cooldown set by another coroutine from
	                        # being shortened by this one.
	                        backoff = 5 + (3 ** attempt) + random.uniform(1.0, 3.0)
	                        _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
	                        log.warning(f"⚠️ Single IP throttled (HTTP {status}) on '{word}'. Cooling pool for {backoff:.1f}s...")
	                        continue

	                    if status != 200:
	                        if not is_last_attempt:
	                            await asyncio.sleep(1.0)
	                        continue

	                    html = await resp.text()

	                    # NOTE(review): the bare "cloudflare" substring also matches
	                    # ordinary pages that merely mention Cloudflare (e.g. a CDN
	                    # script URL) -- confirm it is not rejecting valid pages.
	                    if "Just a moment..." in html or "cf-browser-verification" in html or "cloudflare" in html.lower():
	                        backoff = 15 + random.uniform(2.0, 5.0)
	                        _rate_limited_until = max(_rate_limited_until, time.time() + backoff)
	                        log.error("❌ Cloudflare Browser Challenge engaged. Slowing worker loops down...")
	                        continue

	                    final_url = str(resp.url)
	                    return parse_html(html, final_url, word)

	        except Exception as e:
	            # Deliberately broad: any failure in this attempt (including one
	            # raised by parse_html) is logged and retried. The type name is
	            # included because some exceptions, such as asyncio.TimeoutError,
	            # have an empty str().
	            log.error(f"Network pipe transaction failure for '{word}': {type(e).__name__}: {e}")
	            if not is_last_attempt:
	                await asyncio.sleep(1.5 * attempt)

	    return "ERROR"