thinkwee committed on
Commit ·
f58a6b2
1
Parent(s): fcffa22
fix retry storm
Browse files
- README.md +16 -0
- main.py +2 -0
- src/fetchers/arxiv_fetcher.py +2 -2
- src/fetchers/crossref_fetcher.py +2 -2
- src/utils/http.py +50 -8
README.md
CHANGED
|
@@ -245,6 +245,22 @@ BibGuard is strict, but false positives happen:
|
|
| 245 |
- **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers.
|
| 246 |
- **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with cache cold.
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
## 🤝 Contributing
|
| 249 |
|
| 250 |
Contributions welcome. Open an issue or pull request.
|
|
|
|
| 245 |
- **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers.
|
| 246 |
- **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with cache cold.
|
| 247 |
|
| 248 |
+
### Hostile networks (HF Spaces, restricted egress)
|
| 249 |
+
|
| 250 |
+
BibGuard's networking is tuned for "fail fast, then circuit-break":
|
| 251 |
+
|
| 252 |
+
- urllib3 retries are restricted to genuine HTTP 5xx responses — connection resets and read timeouts are **not** retried, so a blocked source fails after a single attempt (typically 1-3 s, at worst one connect/read timeout) instead of 20+ s.
|
| 253 |
+
- The application-level circuit breaker trips after **2** consecutive failures and skips that source for the rest of the run.
|
| 254 |
+
|
| 255 |
+
If you know in advance that a source won't work from your deploy (e.g. HF Spaces' egress IPs are routinely blocked by DBLP and `export.arxiv.org`), pre-disable them so the run never even tries:
|
| 256 |
+
|
| 257 |
+
```bash
|
| 258 |
+
export BIBGUARD_DISABLE_SOURCES="dblp,arxiv"
|
| 259 |
+
python app.py # or main.py
|
| 260 |
+
```
|
| 261 |
+
|
| 262 |
+
The list is comma- or space-separated and case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
|
| 263 |
+
|
| 264 |
## 🤝 Contributing
|
| 265 |
|
| 266 |
Contributions welcome. Open an issue or pull request.
|
main.py
CHANGED
|
@@ -139,6 +139,8 @@ Usage Examples:
|
|
| 139 |
retry_total=config.network.retry_total,
|
| 140 |
retry_backoff_factor=config.network.retry_backoff_factor,
|
| 141 |
)
|
|
|
|
|
|
|
| 142 |
|
| 143 |
# Validate required fields
|
| 144 |
mode_dir = bool(config.files.input_dir)
|
|
|
|
| 139 |
retry_total=config.network.retry_total,
|
| 140 |
retry_backoff_factor=config.network.retry_backoff_factor,
|
| 141 |
)
|
| 142 |
+
# Apply BIBGUARD_DISABLE_SOURCES (if set) by pre-tripping breakers.
|
| 143 |
+
http_layer.reset_breakers()
|
| 144 |
|
| 145 |
# Validate required fields
|
| 146 |
mode_dir = bool(config.files.input_dir)
|
src/fetchers/arxiv_fetcher.py
CHANGED
|
@@ -72,7 +72,7 @@ class ArxivFetcher:
|
|
| 72 |
params = {'id_list': arxiv_id, 'max_results': 1}
|
| 73 |
|
| 74 |
try:
|
| 75 |
-
response = get_session().get(self.API_BASE, params=params, timeout=
|
| 76 |
response.raise_for_status()
|
| 77 |
record_success(_SOURCE)
|
| 78 |
except requests.RequestException as e:
|
|
@@ -100,7 +100,7 @@ class ArxivFetcher:
|
|
| 100 |
}
|
| 101 |
|
| 102 |
try:
|
| 103 |
-
response = get_session().get(self.API_BASE, params=params, timeout=
|
| 104 |
response.raise_for_status()
|
| 105 |
record_success(_SOURCE)
|
| 106 |
except requests.RequestException as e:
|
|
|
|
| 72 |
params = {'id_list': arxiv_id, 'max_results': 1}
|
| 73 |
|
| 74 |
try:
|
| 75 |
+
response = get_session().get(self.API_BASE, params=params, timeout=(5, 8))
|
| 76 |
response.raise_for_status()
|
| 77 |
record_success(_SOURCE)
|
| 78 |
except requests.RequestException as e:
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
try:
|
| 103 |
+
response = get_session().get(self.API_BASE, params=params, timeout=(5, 8))
|
| 104 |
response.raise_for_status()
|
| 105 |
record_success(_SOURCE)
|
| 106 |
except requests.RequestException as e:
|
src/fetchers/crossref_fetcher.py
CHANGED
|
@@ -83,7 +83,7 @@ class CrossRefFetcher:
|
|
| 83 |
self.BASE_URL,
|
| 84 |
params=params,
|
| 85 |
headers=self._get_headers(),
|
| 86 |
-
timeout=
|
| 87 |
)
|
| 88 |
response.raise_for_status()
|
| 89 |
|
|
@@ -117,7 +117,7 @@ class CrossRefFetcher:
|
|
| 117 |
response = get_session().get(
|
| 118 |
f"{self.BASE_URL}/{doi}",
|
| 119 |
headers=self._get_headers(),
|
| 120 |
-
timeout=
|
| 121 |
)
|
| 122 |
response.raise_for_status()
|
| 123 |
|
|
|
|
| 83 |
self.BASE_URL,
|
| 84 |
params=params,
|
| 85 |
headers=self._get_headers(),
|
| 86 |
+
timeout=(5, 8),
|
| 87 |
)
|
| 88 |
response.raise_for_status()
|
| 89 |
|
|
|
|
| 117 |
response = get_session().get(
|
| 118 |
f"{self.BASE_URL}/{doi}",
|
| 119 |
headers=self._get_headers(),
|
| 120 |
+
timeout=(5, 8),
|
| 121 |
)
|
| 122 |
response.raise_for_status()
|
| 123 |
|
src/utils/http.py
CHANGED
|
@@ -5,10 +5,21 @@ All fetchers should go through `get_session()` instead of bare `requests.get`.
|
|
| 5 |
This gives them consistent retry/backoff on 429/5xx, polite-pool User-Agent,
|
| 6 |
and (when enabled) SQLite-backed response caching to skip re-querying the
|
| 7 |
same URL on re-runs.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
import logging
|
|
|
|
| 12 |
import threading
|
| 13 |
from pathlib import Path
|
| 14 |
from typing import Optional
|
|
@@ -93,13 +104,24 @@ def _build_session() -> requests.Session:
|
|
| 93 |
else:
|
| 94 |
session = requests.Session()
|
| 95 |
|
| 96 |
-
#
|
| 97 |
-
#
|
| 98 |
-
#
|
| 99 |
-
#
|
| 100 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
retry = Retry(
|
| 102 |
-
total=
|
|
|
|
|
|
|
|
|
|
| 103 |
backoff_factor=_settings["retry_backoff_factor"],
|
| 104 |
status_forcelist=(500, 502, 503, 504),
|
| 105 |
allowed_methods=("GET", "HEAD"),
|
|
@@ -145,9 +167,16 @@ def is_open(source: str) -> bool:
|
|
| 145 |
return bool(b and b.get("open"))
|
| 146 |
|
| 147 |
|
| 148 |
-
def record_failure(source: str, threshold: int =
|
| 149 |
"""Note a failure for `source`; trip the breaker after `threshold`.
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
Returns True if the breaker is now (or was already) open.
|
| 152 |
"""
|
| 153 |
with _breakers_lock:
|
|
@@ -174,6 +203,19 @@ def record_success(source: str) -> None:
|
|
| 174 |
|
| 175 |
|
| 176 |
def reset_breakers() -> None:
|
| 177 |
-
"""Clear all breaker state (called at the start of a fresh run).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
with _breakers_lock:
|
| 179 |
_breakers.clear()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
This gives them consistent retry/backoff on 5xx, polite-pool User-Agent,
|
| 6 |
and (when enabled) SQLite-backed response caching to skip re-querying the
|
| 7 |
same URL on re-runs.
|
| 8 |
+
|
| 9 |
+
The application-level circuit breaker (``is_open`` / ``record_failure``) is
|
| 10 |
+
the primary defense against bad networks: any source that fails twice gets
|
| 11 |
+
skipped for the rest of the run. urllib3's own retry is intentionally
|
| 12 |
+
narrow (5xx only, no connect/read retries) so the breaker can trip fast.
|
| 13 |
+
|
| 14 |
+
For deploys where you know certain sources won't work (e.g. HF Spaces
|
| 15 |
+
egress IPs are routinely blocked by DBLP and arxiv), set
|
| 16 |
+
``BIBGUARD_DISABLE_SOURCES=dblp,arxiv`` to permanently mark those breakers
|
| 17 |
+
as open at startup so we never even try them.
|
| 18 |
"""
|
| 19 |
from __future__ import annotations
|
| 20 |
|
| 21 |
import logging
|
| 22 |
+
import os
|
| 23 |
import threading
|
| 24 |
from pathlib import Path
|
| 25 |
from typing import Optional
|
|
|
|
| 104 |
else:
|
| 105 |
session = requests.Session()
|
| 106 |
|
| 107 |
+
# Retry policy is deliberately surgical:
|
| 108 |
+
# - 429 NOT in status_forcelist: rate-limit means "back off", not "retry";
|
| 109 |
+
# retrying just blocks the thread while a parallel source could answer.
|
| 110 |
+
# - connect=0, read=0: do NOT retry on ConnectionReset / ReadTimeout /
|
| 111 |
+
# ConnectError. On hostile-network deploys (e.g. HF Spaces' egress IPs
|
| 112 |
+
# are sometimes blocked by DBLP / arxiv export), these errors persist
|
| 113 |
+
# across retries — retries just multiply the wall-clock penalty
|
| 114 |
+
# before our application-level circuit breaker can trip the source.
|
| 115 |
+
# - status retries are capped at min(retry_total, 2) for genuine 5xx,
|
| 116 |
+
# which are usually transient.
|
| 117 |
+
# The application-level circuit breaker (below) is the source-of-truth
|
| 118 |
+
# for "stop hitting this host"; urllib3's job is just one fast attempt.
|
| 119 |
+
status_retries = min(int(_settings["retry_total"]), 2)
|
| 120 |
retry = Retry(
|
| 121 |
+
total=status_retries,
|
| 122 |
+
connect=0,
|
| 123 |
+
read=0,
|
| 124 |
+
status=status_retries,
|
| 125 |
backoff_factor=_settings["retry_backoff_factor"],
|
| 126 |
status_forcelist=(500, 502, 503, 504),
|
| 127 |
allowed_methods=("GET", "HEAD"),
|
|
|
|
| 167 |
return bool(b and b.get("open"))
|
| 168 |
|
| 169 |
|
| 170 |
+
def record_failure(source: str, threshold: int = 2) -> bool:
|
| 171 |
"""Note a failure for `source`; trip the breaker after `threshold`.
|
| 172 |
|
| 173 |
+
The default of 2 is intentionally aggressive: with urllib3 retries on
|
| 174 |
+
connection/read errors disabled (see ``_build_session``), each failure
|
| 175 |
+
completes in 1-3 seconds. Two quick fails ≈ 2-6 s wasted before the
|
| 176 |
+
source is shut off for the rest of the run, which is far cheaper than
|
| 177 |
+
the alternative of paying the timeout-per-entry on bad networks (HF
|
| 178 |
+
Spaces' egress IP being blocked by DBLP, e.g.).
|
| 179 |
+
|
| 180 |
Returns True if the breaker is now (or was already) open.
|
| 181 |
"""
|
| 182 |
with _breakers_lock:
|
|
|
|
| 203 |
|
| 204 |
|
| 205 |
def reset_breakers() -> None:
|
| 206 |
+
"""Clear all breaker state (called at the start of a fresh run).
|
| 207 |
+
|
| 208 |
+
After clearing, sources listed in ``BIBGUARD_DISABLE_SOURCES`` (comma- or
|
| 209 |
+
space-separated, case-insensitive) are immediately re-marked as open so
|
| 210 |
+
the run never even attempts them. Useful on hostile-network deploys.
|
| 211 |
+
"""
|
| 212 |
with _breakers_lock:
|
| 213 |
_breakers.clear()
|
| 214 |
+
disabled = os.environ.get("BIBGUARD_DISABLE_SOURCES", "")
|
| 215 |
+
for raw in disabled.replace(",", " ").split():
|
| 216 |
+
name = raw.strip().lower()
|
| 217 |
+
if not name:
|
| 218 |
+
continue
|
| 219 |
+
with _breakers_lock:
|
| 220 |
+
_breakers[name] = {"failures": 9999, "open": True, "disabled": True}
|
| 221 |
+
logger.info("Source %r pre-disabled via BIBGUARD_DISABLE_SOURCES", name)
|