import asyncio
import itertools
import logging
import random
import httpx
# Module-level logger, named after this module per the stdlib convention.
logger = logging.getLogger(__name__)

# Rotate through realistic browser User-Agent strings so requests look organic.
_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
]
# Infinite round-robin iterator over _USER_AGENTS; each request takes the next one.
_ua_cycle = itertools.cycle(_USER_AGENTS)

# Headers attached to every request; the rotating User-Agent is merged in
# per request by _next_headers().
_BASE_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
}

# Serial fetching (1 in-flight request) + generous gap — avoids triggering
# per-IP rate limiting on AoPS and Paul's Online Math Notes.
_CONCURRENCY = 1
_MIN_DELAY = 2.0  # seconds between requests
_MAX_DELAY = 4.0  # upper bound of random jitter window

# Codes that warrant a retry with backoff (server-side transient errors).
_RETRYABLE_CODES = frozenset({429, 500, 502, 503, 504})

# Max retry attempts per URL.
_MAX_ATTEMPTS = 5

# Lazy semaphore — created on first use inside a running event loop.
_semaphore: asyncio.Semaphore | None = None

# Shared client — single connection pool for the lifetime of the crawl run.
_client: httpx.AsyncClient | None = None
def _next_headers() -> dict[str, str]:
    """Build the header set for one request: base headers plus the next UA in rotation."""
    headers = dict(_BASE_HEADERS)
    headers["User-Agent"] = next(_ua_cycle)
    return headers
def get_client() -> httpx.AsyncClient:
    """Return the shared AsyncClient, (re)creating it when absent or already closed."""
    global _client
    if _client is not None and not _client.is_closed:
        return _client
    # Pool sized to the crawl concurrency so the client itself cannot
    # open more simultaneous connections than the semaphore allows.
    pool = httpx.Limits(
        max_connections=_CONCURRENCY,
        max_keepalive_connections=_CONCURRENCY,
    )
    _client = httpx.AsyncClient(
        headers=_next_headers(),
        timeout=30,
        follow_redirects=True,
        limits=pool,
    )
    return _client
def safe_text(resp: httpx.Response) -> str:
    """Decode response body, replacing un-decodable bytes instead of raising.

    Falls back to UTF-8 when the response carries no declared encoding.
    """
    return resp.content.decode(resp.encoding or "utf-8", errors="replace")
def safe_json(resp: httpx.Response) -> dict:
    """Parse JSON from response; returns {} on empty or invalid body."""
    import json

    body = safe_text(resp)
    if not body.strip():
        logger.warning("safe_json: empty body from %s", resp.url)
        return {}
    try:
        parsed = json.loads(body)
    except json.JSONDecodeError as exc:
        logger.warning("safe_json: invalid JSON from %s: %s", resp.url, exc)
        return {}
    return parsed
async def close_client() -> None:
    """Dispose of the shared client (if any open) and clear the module-level reference."""
    global _client
    if _client is None or _client.is_closed:
        _client = None
        return
    await _client.aclose()
    _client = None
def _get_semaphore() -> asyncio.Semaphore:
    """Return the crawl-wide semaphore, lazily creating it on first call.

    Creation is deferred so the semaphore is built inside a running event loop.
    """
    global _semaphore
    sem = _semaphore
    if sem is None:
        sem = asyncio.Semaphore(_CONCURRENCY)
        _semaphore = sem
    return sem
async def fetch_with_retry(
    url: str,
    params: dict | None = None,
    max_attempts: int = _MAX_ATTEMPTS,
) -> httpx.Response:
    """GET with exponential backoff + jitter on transient network errors and 5xx/429.

    Rotates the User-Agent on each request and sleeps a jittered politeness
    delay after every successful fetch. On a retryable status the Retry-After
    header is honoured when it is an integer number of seconds; otherwise the
    wait is drawn from an equal-jitter window uniform(cap/2, cap) with
    cap = min(60, 2 * 2**attempt). Network-level failures back off with
    uniform(2, cap) over the same cap.

    Args:
        url: Absolute URL to fetch.
        params: Optional query parameters forwarded to the client.
        max_attempts: Total attempts before giving up.

    Returns:
        The successful httpx.Response.

    Raises:
        httpx.TransportError: the final attempt failed at the network level.
        httpx.HTTPStatusError: the final attempt returned an error status.
        RuntimeError: max_attempts was not positive.
    """
    sem = _get_semaphore()
    client = get_client()
    async with sem:
        resp: httpx.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(max_attempts):
            headers = _next_headers()
            try:
                resp = await client.get(url, params=params, headers=headers)
            except (httpx.ConnectError, httpx.ReadTimeout, httpx.RemoteProtocolError) as exc:
                # Network-level failure — remember it so it can be re-raised
                # on exhaustion, then back off and retry.
                last_exc = exc
                wait = random.uniform(2.0, min(60.0, 2.0 * (2 ** attempt)))
                logger.warning(
                    "Network error on attempt %d/%d for %s: %s — retrying in %.1fs",
                    attempt + 1, max_attempts, url, exc, wait,
                )
                await asyncio.sleep(wait)
                continue
            # This attempt reached the server; any earlier network error is stale.
            last_exc = None
            if resp.status_code in _RETRYABLE_CODES:
                retry_after_raw = resp.headers.get("retry-after")
                if retry_after_raw and retry_after_raw.isdigit():
                    wait = float(retry_after_raw)
                else:
                    # Equal-jitter backoff: uniform(cap/2, cap), cap doubling per attempt.
                    cap = min(60.0, 2.0 * (2 ** attempt))
                    wait = random.uniform(cap / 2, cap)
                logger.warning(
                    "HTTP %d on attempt %d/%d for %s — retrying in %.1fs",
                    resp.status_code, attempt + 1, max_attempts, url, wait,
                )
                await asyncio.sleep(wait)
                continue
            resp.raise_for_status()
            # Politeness delay with jitter so the inter-request gap looks natural.
            await asyncio.sleep(random.uniform(_MIN_DELAY, _MAX_DELAY))
            return resp
        # All attempts exhausted. Surface the most recent failure explicitly
        # instead of via `assert` (which is stripped under `python -O` and
        # would previously mask pure-network failures as AssertionError).
        if last_exc is not None:
            raise last_exc
        if resp is not None:
            resp.raise_for_status()
        raise RuntimeError(f"fetch_with_retry: no attempts made for {url} (max_attempts={max_attempts})")