"""
Shared HTTP client: pooled session + auto-retry + optional on-disk cache.

All fetchers should go through ``get_session()`` instead of bare
``requests.get``. This gives them a consistent retry/backoff policy on
transient 5xx errors, a polite-pool User-Agent, and (when enabled)
SQLite-backed response caching that skips re-querying the same URL on
re-runs.

The application-level circuit breaker (``is_open`` / ``record_failure``) is
the primary defense against bad networks: any source that fails twice gets
skipped for the rest of the run. urllib3's own retry is intentionally
narrow (5xx only, no connect/read retries) so the breaker can trip fast.

For deploys where you know certain sources won't work (e.g. HF Spaces
egress IPs are routinely blocked by DBLP and arXiv), set
``BIBGUARD_DISABLE_SOURCES=dblp,arxiv`` to mark those breakers as open at
the start of every run (see ``reset_breakers``) so we never even try them.
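
Typical wiring (a sketch; the import path ``bibguard.http_client`` and the
URL are illustrative, not guaranteed by this module)::

    from bibguard import http_client

    http_client.configure(contact_email="ops@example.org")
    resp = http_client.get_session().get("https://example.org/api", timeout=10)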
"""
from __future__ import annotations

import logging
import os
import threading
from pathlib import Path
from typing import Optional

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

logger = logging.getLogger(__name__)

# Global per-process state
_lock = threading.Lock()
_settings: dict = {
    "contact_email": "",
    "cache_enabled": True,
    "cache_ttl_hours": 24,
    "retry_total": 5,
    "retry_backoff_factor": 1.5,
    "cache_dir": None,  # Path or None
}
_session: Optional[requests.Session] = None


def configure(
    contact_email: str = "",
    cache_enabled: bool = True,
    cache_ttl_hours: int = 24,
    retry_total: int = 5,
    retry_backoff_factor: float = 1.5,
    cache_dir: Optional[Path] = None,
) -> None:
    """Configure HTTP layer. Call once at startup before any fetcher is used."""
    global _session
    with _lock:
        _settings.update({
            "contact_email": contact_email or "",
            "cache_enabled": cache_enabled,
            "cache_ttl_hours": int(cache_ttl_hours),
            "retry_total": int(retry_total),
            "retry_backoff_factor": float(retry_backoff_factor),
            "cache_dir": cache_dir,
        })
        # Force rebuild on next get_session()
        _session = None


def user_agent() -> str:
    """Build a polite User-Agent string. Includes contact email if configured."""
    email = _settings.get("contact_email") or ""
    if email:
        return f"BibGuard/1.0 (+https://github.com/thinkwee/BibGuard; mailto:{email})"
    return "BibGuard/1.0 (+https://github.com/thinkwee/BibGuard)"


def _build_session() -> requests.Session:
    """Construct a Session with retry and (optionally) caching."""
    cache_enabled = _settings["cache_enabled"]
    ttl = _settings["cache_ttl_hours"] * 3600

    if cache_enabled:
        try:
            from requests_cache import CachedSession  # type: ignore
            cache_dir = _settings.get("cache_dir")
            if cache_dir is None:
                cache_dir = Path.home() / ".cache" / "bibguard"
            cache_dir.mkdir(parents=True, exist_ok=True)
            session = CachedSession(
                cache_name=str(cache_dir / "http_cache"),
                backend="sqlite",
                expire_after=ttl,
                allowable_methods=("GET", "HEAD"),
                allowable_codes=(200, 203, 300, 301, 308),
                stale_if_error=True,
            )
            logger.debug("HTTP cache enabled: %s (ttl=%ss)", cache_dir, ttl)
        except ImportError:
            logger.info(
                "requests-cache not installed; running without HTTP cache. "
                "Install via `pip install requests-cache` for big speedups on re-runs."
            )
            session = requests.Session()
    else:
        session = requests.Session()

    # Retry policy is deliberately surgical:
    #   - 429 NOT in status_forcelist: rate-limit means "back off", not "retry";
    #     retrying just blocks the thread while a parallel source could answer.
    #   - connect=0, read=0: do NOT retry on ConnectionReset / ReadTimeout /
    #     ConnectError. On hostile-network deploys (e.g. HF Spaces' egress IPs
    #     are sometimes blocked by DBLP / arxiv export), these errors persist
    #     across retries — retries just multiply the wall-clock penalty
    #     before our application-level circuit breaker can trip the source.
    #   - status retries are capped at min(retry_total, 2) for genuine 5xx,
    #     which are usually transient.
    # The application-level circuit breaker (below) is the source-of-truth
    # for "stop hitting this host"; urllib3's job is just one fast attempt.
    status_retries = min(int(_settings["retry_total"]), 2)
    retry = Retry(
        total=status_retries,
        connect=0,
        read=0,
        status=status_retries,
        backoff_factor=_settings["retry_backoff_factor"],
        status_forcelist=(500, 502, 503, 504),
        allowed_methods=("GET", "HEAD"),
        raise_on_status=False,
        respect_retry_after_header=False,
    )
    adapter = HTTPAdapter(max_retries=retry, pool_connections=20, pool_maxsize=20)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    session.headers.update({"User-Agent": user_agent()})
    return session


def get_session() -> requests.Session:
    """Return the shared, configured Session. Thread-safe."""
    global _session
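    # Double-checked locking: the unlocked read is the cheap fast path; the
    # re-check under the lock keeps two racing threads from each building a
    # session.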
    if _session is None:
        with _lock:
            if _session is None:
                _session = _build_session()
    return _session


def reset_for_tests() -> None:
    """Drop the shared session. Used by tests to force a rebuild."""
    global _session
    with _lock:
        _session = None


# ---------------------------------------------------------------------------
# Circuit breaker: trip a source after N consecutive failures so the rest of
# the run skips it instead of paying its rate-limit/timeout penalty per entry.
# ---------------------------------------------------------------------------
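#
# Intended call pattern in a fetcher (a sketch; the source name "dblp" and
# `url` are illustrative):
#
#   if is_open("dblp"):
#       return None                        # tripped earlier; skip this source
#   try:
#       resp = get_session().get(url, timeout=10)
#       resp.raise_for_status()
#       record_success("dblp")             # resets the failure counter
#   except requests.RequestException:
#       record_failure("dblp")             # 2nd consecutive failure trips it
#       return None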
_breakers: dict[str, dict] = {}
_breakers_lock = threading.Lock()


def is_open(source: str) -> bool:
    """True if the source's circuit is currently tripped (skip it)."""
    with _breakers_lock:
        b = _breakers.get(source)
        return bool(b and b.get("open"))


def record_failure(source: str, threshold: int = 2) -> bool:
    """Note a failure for `source`; trip the breaker after `threshold`.

    The default of 2 is intentionally aggressive: with urllib3 retries on
    connection/read errors disabled (see ``_build_session``), each failure
    completes in 1-3 seconds. Two quick fails ≈ 2-6 s wasted before the
    source is shut off for the rest of the run, which is far cheaper than
    paying the timeout per entry on bad networks (e.g. HF Spaces' egress
    IPs being blocked by DBLP).

    Returns True if the breaker is now (or was already) open.
    """
    with _breakers_lock:
        b = _breakers.setdefault(source, {"failures": 0, "open": False})
        b["failures"] += 1
        if b["failures"] >= threshold:
            if not b["open"]:
                logger.warning(
                    "Circuit breaker tripped for %s after %d failures; "
                    "skipping for the rest of this run.",
                    source, b["failures"],
                )
            b["open"] = True
        return b["open"]


def record_success(source: str) -> None:
    """Reset the failure counter on a success."""
    with _breakers_lock:
        b = _breakers.get(source)
        if b:
            b["failures"] = 0
            b["open"] = False


def reset_breakers() -> None:
    """Clear all breaker state (called at the start of a fresh run).

    After clearing, sources listed in ``BIBGUARD_DISABLE_SOURCES`` (comma- or
    space-separated, case-insensitive) are immediately re-marked as open so
    the run never even attempts them. Useful on hostile-network deploys.
    """
    with _breakers_lock:
        _breakers.clear()
    disabled = os.environ.get("BIBGUARD_DISABLE_SOURCES", "")
    for raw in disabled.replace(",", " ").split():
        name = raw.strip().lower()
        if not name:
            continue
        with _breakers_lock:
            _breakers[name] = {"failures": 9999, "open": True, "disabled": True}
        logger.info("Source %r pre-disabled via BIBGUARD_DISABLE_SOURCES", name)