import asyncio
import itertools
import json
import logging
import random

import httpx

logger = logging.getLogger(__name__)

# Rotate through realistic browser User-Agent strings so requests look organic.
_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
]
_ua_cycle = itertools.cycle(_USER_AGENTS)

_BASE_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
}

# Serial fetching (1 in-flight request) + generous gap — avoids triggering
# per-IP rate limiting on AoPS and Paul's Online Math Notes.
_CONCURRENCY = 1
_MIN_DELAY = 2.0   # seconds between requests
_MAX_DELAY = 4.0   # upper bound of random jitter window

# Codes that warrant a retry with backoff (server-side transient errors).
_RETRYABLE_CODES = frozenset({429, 500, 502, 503, 504})

# Max retry attempts per URL.
_MAX_ATTEMPTS = 5

# Lazy semaphore — created on first use inside a running event loop.
_semaphore: asyncio.Semaphore | None = None

# Shared client — single connection pool for the lifetime of the crawl run.
_client: httpx.AsyncClient | None = None


def _next_headers() -> dict[str, str]:
    """Return the base headers plus the next User-Agent in the rotation."""
    return {**_BASE_HEADERS, "User-Agent": next(_ua_cycle)}


def get_client() -> httpx.AsyncClient:
    """Return the shared AsyncClient, creating a fresh one if absent or closed."""
    global _client
    if _client is None or _client.is_closed:
        _client = httpx.AsyncClient(
            headers=_next_headers(),
            timeout=30,
            follow_redirects=True,
            limits=httpx.Limits(
                max_connections=_CONCURRENCY,
                max_keepalive_connections=_CONCURRENCY,
            ),
        )
    return _client


def safe_text(resp: httpx.Response) -> str:
    """Decode response body, replacing un-decodable bytes instead of raising."""
    encoding = resp.encoding or "utf-8"
    try:
        return resp.content.decode(encoding, errors="replace")
    except LookupError:
        # The server declared a charset Python does not recognise; fall back to UTF-8.
        return resp.content.decode("utf-8", errors="replace")


def safe_json(resp: httpx.Response) -> dict:
    """Parse JSON from response; returns {} on empty or invalid body."""
    text = safe_text(resp)
    if not text.strip():
        logger.warning("safe_json: empty body from %s", resp.url)
        return {}
    try:
        return json.loads(text)
    except json.JSONDecodeError as exc:
        logger.warning("safe_json: invalid JSON from %s: %s", resp.url, exc)
        return {}
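
# Typical defensive-decode flow these helpers enable (illustrative sketch:
# the URL and the "total" key below are placeholders, not real endpoints):
#
#     resp = await fetch_with_retry("https://example.com/api/items")
#     data = safe_json(resp)        # {} instead of an exception on a bad body
#     count = data.get("total", 0)  # so missing keys stay non-fatal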


async def close_client() -> None:
    """Close the shared client and drop the reference so the next call recreates it."""
    global _client
    if _client and not _client.is_closed:
        await _client.aclose()
    _client = None


def _get_semaphore() -> asyncio.Semaphore:
    """Return the module-wide semaphore, creating it lazily on first use."""
    global _semaphore
    if _semaphore is None:
        _semaphore = asyncio.Semaphore(_CONCURRENCY)
    return _semaphore


async def fetch_with_retry(
    url: str,
    params: dict | None = None,
    max_attempts: int = _MAX_ATTEMPTS,
) -> httpx.Response:
    """GET with exponential backoff + jitter on transient 5xx/429.

    Rotates the User-Agent on each request.  Honours Retry-After when
    present; otherwise uses full-jitter exponential back-off:
        delay = uniform(0, min(cap, base * 2**attempt))
    with cap=60 s and base=2 s.
    """
    sem = _get_semaphore()
    client = get_client()
    async with sem:
        resp: httpx.Response | None = None
        for attempt in range(max_attempts):
            headers = _next_headers()
            try:
                resp = await client.get(url, params=params, headers=headers)
            except (httpx.ConnectError, httpx.ReadTimeout, httpx.RemoteProtocolError) as exc:
                # Network-level failure: back off with the same equal-jitter
                # schedule as the HTTP-status path, then retry.
                cap = min(60.0, 2.0 * (2 ** attempt))
                wait = random.uniform(cap / 2, cap)
                logger.warning(
                    "Network error on attempt %d/%d for %s: %s — retrying in %.1fs",
                    attempt + 1, max_attempts, url, exc, wait,
                )
                await asyncio.sleep(wait)
                continue

            if resp.status_code in _RETRYABLE_CODES:
                retry_after_raw = resp.headers.get("retry-after")
                if retry_after_raw and retry_after_raw.isdigit():
                    wait = float(retry_after_raw)
                else:
                    # Equal-jitter: sleep in the upper half of a window whose
                    # cap doubles with each attempt (max 60 s).
                    cap = min(60.0, 2.0 * (2 ** attempt))
                    wait = random.uniform(cap / 2, cap)
                logger.warning(
                    "HTTP %d on attempt %d/%d for %s — retrying in %.1fs",
                    resp.status_code, attempt + 1, max_attempts, url, wait,
                )
                await asyncio.sleep(wait)
                continue

            resp.raise_for_status()
            # Politeness delay with jitter so the inter-request gap looks natural.
            await asyncio.sleep(random.uniform(_MIN_DELAY, _MAX_DELAY))
            return resp

        # All attempts exhausted.
        if resp is None:
            # Every attempt failed at the network layer; raise a clear error
            # instead of tripping an assertion.
            raise httpx.ConnectError(f"all {max_attempts} attempts failed for {url}")
        resp.raise_for_status()
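

if __name__ == "__main__":
    # Minimal usage sketch: fetch a single page, print a snippet, then
    # release the shared pool.  The URL is a placeholder (example.com),
    # not one of the real crawl targets.
    async def _demo() -> None:
        try:
            resp = await fetch_with_retry("https://example.com/")
            print(safe_text(resp)[:200])
        finally:
            await close_client()

    asyncio.run(_demo())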