Spaces:
Running
Running
import asyncio
import itertools
import json
import logging
import random

import httpx
logger = logging.getLogger(__name__)

# Rotate through realistic browser User-Agent strings so requests look organic.
_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
]

# Infinite round-robin over _USER_AGENTS; _next_headers() pulls one per request.
_ua_cycle = itertools.cycle(_USER_AGENTS)

# Headers sent on every request; the rotating User-Agent is merged in
# per request by _next_headers().
_BASE_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
}

# Serial fetching (1 in-flight request) + generous gap — avoids triggering
# per-IP rate limiting on AoPS and Paul's Online Math Notes.
_CONCURRENCY = 1
_MIN_DELAY = 2.0  # seconds between requests
_MAX_DELAY = 4.0  # upper bound of random jitter window

# Codes that warrant a retry with backoff (server-side transient errors).
_RETRYABLE_CODES = frozenset({429, 500, 502, 503, 504})

# Max retry attempts per URL.
_MAX_ATTEMPTS = 5

# Lazy semaphore — created on first use inside a running event loop.
_semaphore: asyncio.Semaphore | None = None

# Shared client — single connection pool for the lifetime of the crawl run.
_client: httpx.AsyncClient | None = None
def _next_headers() -> dict[str, str]:
    """Build the headers for one request: base headers plus the next
    User-Agent in the rotation."""
    headers = dict(_BASE_HEADERS)
    headers["User-Agent"] = next(_ua_cycle)
    return headers
def get_client() -> httpx.AsyncClient:
    """Return the shared AsyncClient, (re)creating it when absent or closed.

    The pool is capped at _CONCURRENCY connections so the client itself
    can never exceed the crawl's serial-fetch policy.
    """
    global _client
    needs_new = _client is None or _client.is_closed
    if needs_new:
        pool_limits = httpx.Limits(
            max_connections=_CONCURRENCY,
            max_keepalive_connections=_CONCURRENCY,
        )
        _client = httpx.AsyncClient(
            headers=_next_headers(),
            timeout=30,
            follow_redirects=True,
            limits=pool_limits,
        )
    return _client
def safe_text(resp: httpx.Response) -> str:
    """Decode the response body, replacing un-decodable bytes instead of raising.

    Uses the response's declared encoding, falling back to UTF-8 when no
    encoding is declared. Also falls back to UTF-8 when the declared
    charset is one Python's codec registry does not know (a bogus
    Content-Type charset previously raised LookupError here, breaking
    the "never raises" contract).
    """
    encoding = resp.encoding or "utf-8"
    try:
        return resp.content.decode(encoding, errors="replace")
    except LookupError:
        # Server advertised an unknown/bogus charset — decode as UTF-8.
        return resp.content.decode("utf-8", errors="replace")
def safe_json(resp: httpx.Response) -> dict:
    """Parse JSON from the response body.

    Returns {} (with a warning logged) when the body is empty or is not
    valid JSON, instead of raising. NOTE(review): a valid non-object
    body (e.g. a top-level JSON list) is returned as-is despite the
    ``dict`` annotation — callers should not rely on the type then.
    """
    text = safe_text(resp)
    if not text.strip():
        logger.warning("safe_json: empty body from %s", resp.url)
        return {}
    try:
        return json.loads(text)
    except json.JSONDecodeError as exc:
        logger.warning("safe_json: invalid JSON from %s: %s", resp.url, exc)
        return {}
async def close_client() -> None:
    """Close the shared HTTP client and drop the module-level reference
    so a later get_client() call builds a fresh client/pool.

    No-op when the client was never created or is already closed.
    """
    global _client
    if _client and not _client.is_closed:
        await _client.aclose()
        _client = None
def _get_semaphore() -> asyncio.Semaphore:
    """Lazily build the crawl-wide semaphore on first use.

    Created on demand (rather than at import time) so it is bound inside
    a running event loop.
    """
    global _semaphore
    sem = _semaphore
    if sem is None:
        sem = _semaphore = asyncio.Semaphore(_CONCURRENCY)
    return sem
async def fetch_with_retry(
    url: str,
    params: dict | None = None,
    max_attempts: int = _MAX_ATTEMPTS,
) -> httpx.Response:
    """GET with exponential backoff + jitter on transient 5xx/429.

    Rotates the User-Agent on each request. Honours a numeric Retry-After
    header when present; otherwise waits

        uniform(cap / 2, cap)   with   cap = min(60, 2 * 2**attempt)

    Raises:
        ValueError: if ``max_attempts`` < 1.
        httpx.HTTPStatusError: on a non-retryable error status, or when
            the final attempt still returned a retryable status.
        httpx.TransportError: the last network-level error, when every
            attempt failed before a response was received (previously this
            path died on ``assert resp is not None`` — an AssertionError,
            and stripped entirely under ``-O``).
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be >= 1")
    sem = _get_semaphore()
    client = get_client()
    async with sem:
        resp: httpx.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(max_attempts):
            final_attempt = attempt == max_attempts - 1
            headers = _next_headers()
            try:
                resp = await client.get(url, params=params, headers=headers)
            except (httpx.ConnectError, httpx.TimeoutException, httpx.RemoteProtocolError) as exc:
                # Network-level failure — back off and retry. TimeoutException
                # covers connect/read/write/pool timeouts (the old code only
                # caught ReadTimeout, letting ConnectTimeout escape un-retried).
                last_exc = exc
                if final_attempt:
                    break  # no retry left — don't sleep pointlessly
                wait = random.uniform(2.0, min(60.0, 2.0 * (2 ** attempt)))
                logger.warning(
                    "Network error on attempt %d/%d for %s: %s — retrying in %.1fs",
                    attempt + 1, max_attempts, url, exc, wait,
                )
                await asyncio.sleep(wait)
                continue
            last_exc = None
            if resp.status_code in _RETRYABLE_CODES:
                if final_attempt:
                    break  # raise_for_status below reports the failure
                retry_after_raw = resp.headers.get("retry-after")
                if retry_after_raw and retry_after_raw.isdigit():
                    wait = float(retry_after_raw)
                else:
                    # Full-jitter: uniform within the upper half of a cap
                    # that doubles with each attempt.
                    cap = min(60.0, 2.0 * (2 ** attempt))
                    wait = random.uniform(cap / 2, cap)
                logger.warning(
                    "HTTP %d on attempt %d/%d for %s — retrying in %.1fs",
                    resp.status_code, attempt + 1, max_attempts, url, wait,
                )
                await asyncio.sleep(wait)
                continue
            resp.raise_for_status()
            # Politeness delay with jitter so the inter-request gap looks natural.
            await asyncio.sleep(random.uniform(_MIN_DELAY, _MAX_DELAY))
            return resp
        # All attempts exhausted.
        if last_exc is not None:
            # The final attempt died at the network layer — surface the
            # real cause instead of a stale response or an AssertionError.
            raise last_exc
        # max_attempts >= 1 and last_exc is None imply resp is bound and
        # holds a retryable-status response; raise_for_status() raises.
        assert resp is not None
        resp.raise_for_status()