thinkwee committed on
Commit
f58a6b2
·
1 Parent(s): fcffa22

fix retry storm

Browse files
README.md CHANGED
@@ -245,6 +245,22 @@ BibGuard is strict, but false positives happen:
245
  - **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers.
246
  - **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with cache cold.
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  ## 🤝 Contributing
249
 
250
  Contributions welcome. Open an issue or pull request.
 
245
  - **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers.
246
  - **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with cache cold.
247
 
248
+ ### Hostile networks (HF Spaces, restricted egress)
249
+
250
+ BibGuard's networking is tuned for "fail fast, then circuit-break":
251
+
252
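+ - urllib3 retries are restricted to genuine HTTP 5xx — connection resets and read timeouts are **not** retried, so a blocked source fails after a single attempt (typically 1-3 s for a reset or refusal, and never longer than the per-request timeout, e.g. 5 s connect / 8 s read for arXiv and CrossRef) instead of 20+ s.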
253
+ - The application-level circuit breaker trips after **2** consecutive failures and skips that source for the rest of the run.
254
+
255
+ If you know in advance that a source won't work from your deploy (e.g. HF Spaces' egress IPs are routinely blocked by DBLP and `export.arxiv.org`), pre-disable them so the run never even tries:
256
+
257
+ ```bash
258
+ export BIBGUARD_DISABLE_SOURCES="dblp,arxiv"
259
+ python app.py # or main.py
260
+ ```
261
+
262
+ Comma- or space-separated, case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
263
+
264
  ## 🤝 Contributing
265
 
266
  Contributions welcome. Open an issue or pull request.
main.py CHANGED
@@ -139,6 +139,8 @@ Usage Examples:
139
  retry_total=config.network.retry_total,
140
  retry_backoff_factor=config.network.retry_backoff_factor,
141
  )
 
 
142
 
143
  # Validate required fields
144
  mode_dir = bool(config.files.input_dir)
 
139
  retry_total=config.network.retry_total,
140
  retry_backoff_factor=config.network.retry_backoff_factor,
141
  )
142
+ # Apply BIBGUARD_DISABLE_SOURCES (if set) by pre-tripping breakers.
143
+ http_layer.reset_breakers()
144
 
145
  # Validate required fields
146
  mode_dir = bool(config.files.input_dir)
src/fetchers/arxiv_fetcher.py CHANGED
@@ -72,7 +72,7 @@ class ArxivFetcher:
72
  params = {'id_list': arxiv_id, 'max_results': 1}
73
 
74
  try:
75
- response = get_session().get(self.API_BASE, params=params, timeout=12)
76
  response.raise_for_status()
77
  record_success(_SOURCE)
78
  except requests.RequestException as e:
@@ -100,7 +100,7 @@ class ArxivFetcher:
100
  }
101
 
102
  try:
103
- response = get_session().get(self.API_BASE, params=params, timeout=12)
104
  response.raise_for_status()
105
  record_success(_SOURCE)
106
  except requests.RequestException as e:
 
72
  params = {'id_list': arxiv_id, 'max_results': 1}
73
 
74
  try:
75
+ response = get_session().get(self.API_BASE, params=params, timeout=(5, 8))
76
  response.raise_for_status()
77
  record_success(_SOURCE)
78
  except requests.RequestException as e:
 
100
  }
101
 
102
  try:
103
+ response = get_session().get(self.API_BASE, params=params, timeout=(5, 8))
104
  response.raise_for_status()
105
  record_success(_SOURCE)
106
  except requests.RequestException as e:
src/fetchers/crossref_fetcher.py CHANGED
@@ -83,7 +83,7 @@ class CrossRefFetcher:
83
  self.BASE_URL,
84
  params=params,
85
  headers=self._get_headers(),
86
- timeout=12,
87
  )
88
  response.raise_for_status()
89
 
@@ -117,7 +117,7 @@ class CrossRefFetcher:
117
  response = get_session().get(
118
  f"{self.BASE_URL}/{doi}",
119
  headers=self._get_headers(),
120
- timeout=12,
121
  )
122
  response.raise_for_status()
123
 
 
83
  self.BASE_URL,
84
  params=params,
85
  headers=self._get_headers(),
86
+ timeout=(5, 8),
87
  )
88
  response.raise_for_status()
89
 
 
117
  response = get_session().get(
118
  f"{self.BASE_URL}/{doi}",
119
  headers=self._get_headers(),
120
+ timeout=(5, 8),
121
  )
122
  response.raise_for_status()
123
 
src/utils/http.py CHANGED
@@ -5,10 +5,21 @@ All fetchers should go through `get_session()` instead of bare `requests.get`.
5
  This gives them consistent retry/backoff on 5xx (429 is deliberately left out of the retry list), polite-pool User-Agent,
6
  and (when enabled) SQLite-backed response caching to skip re-querying the
7
  same URL on re-runs.
 
 
 
 
 
 
 
 
 
 
8
  """
9
  from __future__ import annotations
10
 
11
  import logging
 
12
  import threading
13
  from pathlib import Path
14
  from typing import Optional
@@ -93,13 +104,24 @@ def _build_session() -> requests.Session:
93
  else:
94
  session = requests.Session()
95
 
96
- # Important: 429 is NOT in status_forcelist. A 429 means "you're being
97
- # rate-limited"; retrying just blocks the calling thread for tens of
98
- # seconds while another parallel source could already have answered.
99
- # We let the caller see the 429 immediately and move on; the circuit
100
- # breaker (below) will skip the offending source for the rest of the run.
 
 
 
 
 
 
 
 
101
  retry = Retry(
102
- total=_settings["retry_total"],
 
 
 
103
  backoff_factor=_settings["retry_backoff_factor"],
104
  status_forcelist=(500, 502, 503, 504),
105
  allowed_methods=("GET", "HEAD"),
@@ -145,9 +167,16 @@ def is_open(source: str) -> bool:
145
  return bool(b and b.get("open"))
146
 
147
 
148
- def record_failure(source: str, threshold: int = 3) -> bool:
149
  """Note a failure for `source`; trip the breaker after `threshold`.
150
 
 
 
 
 
 
 
 
151
  Returns True if the breaker is now (or was already) open.
152
  """
153
  with _breakers_lock:
@@ -174,6 +203,19 @@ def record_success(source: str) -> None:
174
 
175
 
176
  def reset_breakers() -> None:
177
- """Clear all breaker state (called at the start of a fresh run)."""
 
 
 
 
 
178
  with _breakers_lock:
179
  _breakers.clear()
 
 
 
 
 
 
 
 
 
5
  This gives them consistent retry/backoff on 5xx (429 is deliberately left out of the retry list), polite-pool User-Agent,
6
  and (when enabled) SQLite-backed response caching to skip re-querying the
7
  same URL on re-runs.
8
+
9
+ The application-level circuit breaker (``is_open`` / ``record_failure``) is
10
+ the primary defense against bad networks: any source that fails twice gets
11
+ skipped for the rest of the run. urllib3's own retry is intentionally
12
+ narrow (5xx only, no connect/read retries) so the breaker can trip fast.
13
+
14
+ For deploys where you know certain sources won't work (e.g. HF Spaces
15
+ egress IPs are routinely blocked by DBLP and arxiv), set
16
+ ``BIBGUARD_DISABLE_SOURCES=dblp,arxiv`` to permanently mark those breakers
17
+ as open at startup so we never even try them.
18
  """
19
  from __future__ import annotations
20
 
21
  import logging
22
+ import os
23
  import threading
24
  from pathlib import Path
25
  from typing import Optional
 
104
  else:
105
  session = requests.Session()
106
 
107
+ # Retry policy is deliberately surgical:
108
+ # - 429 NOT in status_forcelist: rate-limit means "back off", not "retry";
109
+ # retrying just blocks the thread while a parallel source could answer.
110
+ # - connect=0, read=0: do NOT retry on ConnectionReset / ReadTimeout /
111
+ # ConnectError. On hostile-network deploys (e.g. HF Spaces' egress IPs
112
+ # are sometimes blocked by DBLP / arxiv export), these errors persist
113
+ # across retries — retries just multiply the wall-clock penalty
114
+ # before our application-level circuit breaker can trip the source.
115
+ # - status retries are capped at min(retry_total, 2) for genuine 5xx,
116
+ # which are usually transient.
117
+ # The application-level circuit breaker (below) is the source-of-truth
118
+ # for "stop hitting this host"; urllib3's job is just one fast attempt.
119
+ status_retries = min(int(_settings["retry_total"]), 2)
120
  retry = Retry(
121
+ total=status_retries,
122
+ connect=0,
123
+ read=0,
124
+ status=status_retries,
125
  backoff_factor=_settings["retry_backoff_factor"],
126
  status_forcelist=(500, 502, 503, 504),
127
  allowed_methods=("GET", "HEAD"),
 
167
  return bool(b and b.get("open"))
168
 
169
 
170
+ def record_failure(source: str, threshold: int = 2) -> bool:
171
  """Note a failure for `source`; trip the breaker after `threshold`.
172
 
173
+ The default of 2 is intentionally aggressive: with urllib3 retries on
174
+ connection/read errors disabled (see ``_build_session``), each failure
175
+ completes in 1-3 seconds. Two quick fails ≈ 2-6 s wasted before the
176
+ source is shut off for the rest of the run, which is far cheaper than
177
+ the alternative of paying the timeout-per-entry on bad networks (HF
178
+ Spaces' egress IP being blocked by DBLP, e.g.).
179
+
180
  Returns True if the breaker is now (or was already) open.
181
  """
182
  with _breakers_lock:
 
203
 
204
 
205
  def reset_breakers() -> None:
206
+ """Clear all breaker state (called at the start of a fresh run).
207
+
208
+ After clearing, sources listed in ``BIBGUARD_DISABLE_SOURCES`` (comma- or
209
+ space-separated, case-insensitive) are immediately re-marked as open so
210
+ the run never even attempts them. Useful on hostile-network deploys.
211
+ """
212
  with _breakers_lock:
213
  _breakers.clear()
214
+ disabled = os.environ.get("BIBGUARD_DISABLE_SOURCES", "")
215
+ for raw in disabled.replace(",", " ").split():
216
+ name = raw.strip().lower()
217
+ if not name:
218
+ continue
219
+ with _breakers_lock:
220
+ _breakers[name] = {"failures": 9999, "open": True, "disabled": True}
221
+ logger.info("Source %r pre-disabled via BIBGUARD_DISABLE_SOURCES", name)