Spaces:

thinkwee
/

BibGuard

Running

App Files Files Community

thinkwee committed 17 days ago

Commit

f58a6b2

1 Parent(s): fcffa22

fix retry storm

Browse files

Files changed (5) hide show

README.md +16 -0
main.py +2 -0
src/fetchers/arxiv_fetcher.py +2 -2
src/fetchers/crossref_fetcher.py +2 -2
src/utils/http.py +50 -8

README.md CHANGED Viewed

@@ -245,6 +245,22 @@ BibGuard is strict, but false positives happen:
 - **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers.
 - **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with cache cold.
 ## 🤝 Contributing
 Contributions welcome. Open an issue or pull request.

 - **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers.
 - **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with cache cold.
+### Hostile networks (HF Spaces, restricted egress)
+BibGuard's networking is tuned for "fail fast, then circuit-break":
+- urllib3 retries are restricted to genuine HTTP 5xx — connection resets and read timeouts are **not** retried, so a blocked source fails in 1-3 s instead of 20+ s.
+- The application-level circuit breaker trips after **2** consecutive failures and skips that source for the rest of the run.
+If you know in advance that a source won't work from your deploy (e.g. HF Spaces' egress IPs are routinely blocked by DBLP and `export.arxiv.org`), pre-disable them so the run never even tries:
+```bash
+export BIBGUARD_DISABLE_SOURCES="dblp,arxiv"
+python app.py     # or main.py
+```
+Comma- or space-separated, case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
 ## 🤝 Contributing
 Contributions welcome. Open an issue or pull request.

main.py CHANGED Viewed

@@ -139,6 +139,8 @@ Usage Examples:
         retry_total=config.network.retry_total,
         retry_backoff_factor=config.network.retry_backoff_factor,
     )
     # Validate required fields
     mode_dir = bool(config.files.input_dir)

         retry_total=config.network.retry_total,
         retry_backoff_factor=config.network.retry_backoff_factor,
     )
+    # Apply BIBGUARD_DISABLE_SOURCES (if set) by pre-tripping breakers.
+    http_layer.reset_breakers()
     # Validate required fields
     mode_dir = bool(config.files.input_dir)

src/fetchers/arxiv_fetcher.py CHANGED Viewed

@@ -72,7 +72,7 @@ class ArxivFetcher:
         params = {'id_list': arxiv_id, 'max_results': 1}
         try:
-            response = get_session().get(self.API_BASE, params=params, timeout=12)
             response.raise_for_status()
             record_success(_SOURCE)
         except requests.RequestException as e:
@@ -100,7 +100,7 @@ class ArxivFetcher:
         }
         try:
-            response = get_session().get(self.API_BASE, params=params, timeout=12)
             response.raise_for_status()
             record_success(_SOURCE)
         except requests.RequestException as e:

         params = {'id_list': arxiv_id, 'max_results': 1}
         try:
+            response = get_session().get(self.API_BASE, params=params, timeout=(5, 8))
             response.raise_for_status()
             record_success(_SOURCE)
         except requests.RequestException as e:
         }
         try:
+            response = get_session().get(self.API_BASE, params=params, timeout=(5, 8))
             response.raise_for_status()
             record_success(_SOURCE)
         except requests.RequestException as e:

src/fetchers/crossref_fetcher.py CHANGED Viewed

@@ -83,7 +83,7 @@ class CrossRefFetcher:
                 self.BASE_URL,
                 params=params,
                 headers=self._get_headers(),
-                timeout=12,
             )
             response.raise_for_status()
@@ -117,7 +117,7 @@ class CrossRefFetcher:
             response = get_session().get(
                 f"{self.BASE_URL}/{doi}",
                 headers=self._get_headers(),
-                timeout=12,
             )
             response.raise_for_status()

                 self.BASE_URL,
                 params=params,
                 headers=self._get_headers(),
+                timeout=(5, 8),
             )
             response.raise_for_status()
             response = get_session().get(
                 f"{self.BASE_URL}/{doi}",
                 headers=self._get_headers(),
+                timeout=(5, 8),
             )
             response.raise_for_status()

src/utils/http.py CHANGED Viewed

@@ -5,10 +5,21 @@ All fetchers should go through `get_session()` instead of bare `requests.get`.
 This gives them consistent retry/backoff on 5xx (not 429), polite-pool User-Agent,
 and (when enabled) SQLite-backed response caching to skip re-querying the
 same URL on re-runs.
 """
 from __future__ import annotations
 import logging
 import threading
 from pathlib import Path
 from typing import Optional
@@ -93,13 +104,24 @@ def _build_session() -> requests.Session:
     else:
         session = requests.Session()
-    # Important: 429 is NOT in status_forcelist. A 429 means "you're being
-    # rate-limited" — retrying just blocks the calling thread for tens of
-    # seconds while another parallel source could already have answered.
-    # We let the caller see the 429 immediately and move on; the circuit
-    # breaker (below) will skip the offending source for the rest of the run.
     retry = Retry(
-        total=_settings["retry_total"],
         backoff_factor=_settings["retry_backoff_factor"],
         status_forcelist=(500, 502, 503, 504),
         allowed_methods=("GET", "HEAD"),
@@ -145,9 +167,16 @@ def is_open(source: str) -> bool:
         return bool(b and b.get("open"))
-def record_failure(source: str, threshold: int = 3) -> bool:
     """Note a failure for `source`; trip the breaker after `threshold`.
     Returns True if the breaker is now (or was already) open.
     """
     with _breakers_lock:
@@ -174,6 +203,19 @@ def record_success(source: str) -> None:
 def reset_breakers() -> None:
-    """Clear all breaker state (called at the start of a fresh run)."""
     with _breakers_lock:
         _breakers.clear()

 This gives them consistent retry/backoff on 5xx (not 429), polite-pool User-Agent,
 and (when enabled) SQLite-backed response caching to skip re-querying the
 same URL on re-runs.
+The application-level circuit breaker (``is_open`` / ``record_failure``) is
+the primary defense against bad networks: any source that fails twice gets
+skipped for the rest of the run. urllib3's own retry is intentionally
+narrow (5xx only, no connect/read retries) so the breaker can trip fast.
+For deploys where you know certain sources won't work (e.g. HF Spaces
+egress IPs are routinely blocked by DBLP and arxiv), set
+``BIBGUARD_DISABLE_SOURCES=dblp,arxiv`` to permanently mark those breakers
+as open at startup so we never even try them.
 """
 from __future__ import annotations
 import logging
+import os
 import threading
 from pathlib import Path
 from typing import Optional
     else:
         session = requests.Session()
+    # Retry policy is deliberately surgical:
+    #   - 429 NOT in status_forcelist: rate-limit means "back off", not "retry";
+    #     retrying just blocks the thread while a parallel source could answer.
+    #   - connect=0, read=0: do NOT retry on ConnectionReset / ReadTimeout /
+    #     ConnectError. On hostile-network deploys (e.g. HF Spaces' egress IPs
+    #     are sometimes blocked by DBLP / arxiv export), these errors persist
+    #     across retries — retries just multiply the wall-clock penalty
+    #     before our application-level circuit breaker can trip the source.
+    #   - status retries are capped at min(retry_total, 2) for genuine 5xx,
+    #     which are usually transient.
+    # The application-level circuit breaker (below) is the source-of-truth
+    # for "stop hitting this host"; urllib3's job is just one fast attempt.
+    status_retries = min(int(_settings["retry_total"]), 2)
     retry = Retry(
+        total=status_retries,
+        connect=0,
+        read=0,
+        status=status_retries,
         backoff_factor=_settings["retry_backoff_factor"],
         status_forcelist=(500, 502, 503, 504),
         allowed_methods=("GET", "HEAD"),
         return bool(b and b.get("open"))
+def record_failure(source: str, threshold: int = 2) -> bool:
     """Note a failure for `source`; trip the breaker after `threshold`.
+    The default of 2 is intentionally aggressive: with urllib3 retries on
+    connection/read errors disabled (see ``_build_session``), each failure
+    completes in 1-3 seconds. Two quick fails ≈ 2-6 s wasted before the
+    source is shut off for the rest of the run, which is far cheaper than
+    the alternative of paying the timeout-per-entry on bad networks (HF
+    Spaces' egress IP being blocked by DBLP, e.g.).
     Returns True if the breaker is now (or was already) open.
     """
     with _breakers_lock:
 def reset_breakers() -> None:
+    """Clear all breaker state (called at the start of a fresh run).
+    After clearing, sources listed in ``BIBGUARD_DISABLE_SOURCES`` (comma- or
+    space-separated, case-insensitive) are immediately re-marked as open so
+    the run never even attempts them. Useful on hostile-network deploys.
+    """
     with _breakers_lock:
         _breakers.clear()
+    disabled = os.environ.get("BIBGUARD_DISABLE_SOURCES", "")
+    for raw in disabled.replace(",", " ").split():
+        name = raw.strip().lower()
+        if not name:
+            continue
+        with _breakers_lock:
+            _breakers[name] = {"failures": 9999, "open": True, "disabled": True}
+        logger.info("Source %r pre-disabled via BIBGUARD_DISABLE_SOURCES", name)