thinkwee committed on
Commit
dc4b1cd
·
1 Parent(s): f58a6b2

improve api stability

Browse files
README.md CHANGED
@@ -261,6 +261,22 @@ python app.py # or main.py
261
 
262
  Comma- or space-separated, case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  ## 🤝 Contributing
265
 
266
  Contributions welcome. Open an issue or pull request.
 
261
 
262
  Comma- or space-separated, case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
263
 
264
+ On a shared-IP deployment such as HF Spaces, setting two environment variables dramatically reduces false positives by lifting per-IP rate limits:
265
+
266
+ ```bash
267
+ # Polite-pool contact email (sent in the User-Agent) → CrossRef and OpenAlex switch us off the anonymous
268
+ # shared queue (which on HF Spaces is hammered by other tenants) and onto a
269
+ # separate fair queue.
270
+ export BIBGUARD_CONTACT_EMAIL="you@example.com"
271
+
272
+ # Semantic Scholar free API key. Without it, S2's limit is 100 req / 5 min
273
+ # shared across the entire HF egress IP — it 429s almost immediately and the
274
+ # circuit breaker disables S2 for the whole run, leaving only title-search
275
+ # fallbacks (which produce the mismatched-paper false positives).
276
+ # Request one at https://www.semanticscholar.org/product/api/
277
+ export SEMANTIC_SCHOLAR_API_KEY="..."
278
+ ```
279
+
280
  ## 🤝 Contributing
281
 
282
  Contributions welcome. Open an issue or pull request.
app.py CHANGED
@@ -766,7 +766,9 @@ def _run_check_impl(
766
 
767
  if bib_config.check_metadata:
768
  arxiv_fetcher = ArxivFetcher()
769
- ss_fetcher = SemanticScholarFetcher()
 
 
770
  oa_fetcher = OpenAlexFetcher()
771
  dblp_fetcher = DBLPFetcher()
772
  crossref_fetcher = CrossRefFetcher()
 
766
 
767
  if bib_config.check_metadata:
768
  arxiv_fetcher = ArxivFetcher()
769
+ ss_fetcher = SemanticScholarFetcher(
770
+ api_key=os.environ.get("SEMANTIC_SCHOLAR_API_KEY") or None
771
+ )
772
  oa_fetcher = OpenAlexFetcher()
773
  dblp_fetcher = DBLPFetcher()
774
  crossref_fetcher = CrossRefFetcher()
app_helper.py CHANGED
@@ -43,6 +43,13 @@ _YEAR_TOL = 1
43
  _TITLE_MATCH_TIGHT = 0.88
44
  # Title similarity required to count as "corroborating" another source.
45
  _TITLE_AGREE = 0.95
 
 
 
 
 
 
 
46
 
47
 
48
  def _title_sim(a: str, b: str) -> float:
@@ -71,12 +78,17 @@ def _year_close(y1: str, y2: str) -> bool:
71
 
72
 
73
  def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]:
74
- """Pick the candidate whose title most closely matches `bib_title`."""
 
 
 
75
  best, best_sim = None, 0.0
76
  for c in candidates:
77
  sim = _title_sim(bib_title, getattr(c, "title", "") or "")
78
  if sim > best_sim:
79
  best, best_sim = c, sim
 
 
80
  return best, best_sim
81
 
82
 
@@ -98,12 +110,18 @@ def fetch_and_compare_with_workflow(
98
  if not (has_doi or has_arxiv or has_title):
99
  return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up")
100
 
 
 
 
 
 
 
101
  # ------------------------------------------------------------------ stage 1
102
  # Tasks are tuples of (source_name, callable returning ComparisonResult or None).
103
  tasks: list[tuple[str, callable]] = []
104
 
105
  # Identifier-based lookups (high precision).
106
- if has_doi and crossref_fetcher:
107
  def _t_cr_doi(e=entry):
108
  r = crossref_fetcher.search_by_doi(e.doi)
109
  return comparator.compare_with_crossref(e, r) if r else None
@@ -115,7 +133,7 @@ def fetch_and_compare_with_workflow(
115
  return comparator.compare_with_semantic_scholar(e, r) if r else None
116
  tasks.append(("s2(doi)", _t_s2_doi))
117
 
118
- if has_doi and openalex_fetcher:
119
  def _t_oa_doi(e=entry):
120
  r = openalex_fetcher.fetch_by_doi(e.doi)
121
  return comparator.compare_with_openalex(e, r) if r else None
 
43
  _TITLE_MATCH_TIGHT = 0.88
44
  # Title similarity required to count as "corroborating" another source.
45
  _TITLE_AGREE = 0.95
46
+ # Floor for accepting a title-search candidate at all. Below this the
47
+ # "best candidate" is almost certainly an unrelated paper (e.g. OpenAlex's
48
+ # top hit for a 2025 arXiv preprint it doesn't yet index) and reporting it
49
+ # as a mismatch is a false positive — the bib entry is fine, the fetcher
50
+ # just returned junk. Tuned from observed false-positive data on HF Spaces
51
+ # runs where identifier lookups failed and only title-search survived.
52
+ _TITLE_CANDIDATE_FLOOR = 0.6
53
 
54
 
55
  def _title_sim(a: str, b: str) -> float:
 
78
 
79
 
80
  def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]:
81
+ """Pick the candidate whose title most closely matches `bib_title`.
82
+
83
+ Returns (None, 0.0) if no candidate clears `_TITLE_CANDIDATE_FLOOR`.
84
+ """
85
  best, best_sim = None, 0.0
86
  for c in candidates:
87
  sim = _title_sim(bib_title, getattr(c, "title", "") or "")
88
  if sim > best_sim:
89
  best, best_sim = c, sim
90
+ if best_sim < _TITLE_CANDIDATE_FLOOR:
91
+ return None, 0.0
92
  return best, best_sim
93
 
94
 
 
110
  if not (has_doi or has_arxiv or has_title):
111
  return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up")
112
 
113
+ # arXiv-shaped DOIs (10.48550/ARXIV.*) are NOT indexed by Crossref or
114
+ # OpenAlex's DOI endpoint — querying them just burns retries on
115
+ # guaranteed 404s, which then trips the circuit breaker for the rest
116
+ # of the run. Route those to the arXiv / S2 arxiv-id paths instead.
117
+ doi_is_arxiv = has_doi and "10.48550/arxiv" in (entry.doi or "").lower()
118
+
119
  # ------------------------------------------------------------------ stage 1
120
  # Tasks are tuples of (source_name, callable returning ComparisonResult or None).
121
  tasks: list[tuple[str, callable]] = []
122
 
123
  # Identifier-based lookups (high precision).
124
+ if has_doi and crossref_fetcher and not doi_is_arxiv:
125
  def _t_cr_doi(e=entry):
126
  r = crossref_fetcher.search_by_doi(e.doi)
127
  return comparator.compare_with_crossref(e, r) if r else None
 
133
  return comparator.compare_with_semantic_scholar(e, r) if r else None
134
  tasks.append(("s2(doi)", _t_s2_doi))
135
 
136
+ if has_doi and openalex_fetcher and not doi_is_arxiv:
137
  def _t_oa_doi(e=entry):
138
  r = openalex_fetcher.fetch_by_doi(e.doi)
139
  return comparator.compare_with_openalex(e, r) if r else None
main.py CHANGED
@@ -13,6 +13,7 @@ Usage:
13
  """
14
  import argparse
15
  import logging
 
16
  import sys
17
  from pathlib import Path
18
  from typing import Optional, List
@@ -276,7 +277,9 @@ def run_checker(config: BibGuardConfig, template=None):
276
  arxiv_fetcher = ArxivFetcher()
277
 
278
  if bib_config.check_metadata:
279
- semantic_scholar_fetcher = SemanticScholarFetcher()
 
 
280
  openalex_fetcher = OpenAlexFetcher()
281
  dblp_fetcher = DBLPFetcher()
282
  crossref_fetcher = CrossRefFetcher()
 
13
  """
14
  import argparse
15
  import logging
16
+ import os
17
  import sys
18
  from pathlib import Path
19
  from typing import Optional, List
 
277
  arxiv_fetcher = ArxivFetcher()
278
 
279
  if bib_config.check_metadata:
280
+ semantic_scholar_fetcher = SemanticScholarFetcher(
281
+ api_key=os.environ.get("SEMANTIC_SCHOLAR_API_KEY") or None
282
+ )
283
  openalex_fetcher = OpenAlexFetcher()
284
  dblp_fetcher = DBLPFetcher()
285
  crossref_fetcher = CrossRefFetcher()
src/checkers/url_checker.py CHANGED
@@ -12,6 +12,7 @@ from __future__ import annotations
12
 
13
  import concurrent.futures
14
  import logging
 
15
  from dataclasses import dataclass
16
  from typing import Iterable, List, Optional
17
 
@@ -22,6 +23,17 @@ from src.parsers.bib_parser import BibEntry
22
 
23
  logger = logging.getLogger(__name__)
24
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  @dataclass
27
  class URLFinding:
@@ -41,6 +53,35 @@ class URLChecker:
41
  self.max_workers = max_workers
42
  self.timeout = timeout
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def _check_one(self, entry: BibEntry) -> Optional[URLFinding]:
45
  url = (entry.url or "").strip()
46
  if not url:
@@ -48,6 +89,12 @@ class URLChecker:
48
  if any(url.lower().startswith(p) for p in self.SKIP_PREFIXES):
49
  return URLFinding(entry.key, url, "skipped", detail="non-http scheme")
50
 
 
 
 
 
 
 
51
  session = get_session()
52
  try:
53
  r = session.head(url, allow_redirects=True, timeout=self.timeout)
 
12
 
13
  import concurrent.futures
14
  import logging
15
+ import re
16
  from dataclasses import dataclass
17
  from typing import Iterable, List, Optional
18
 
 
23
 
24
  logger = logging.getLogger(__name__)
25
 
26
+ # arxiv.org HTML endpoints (`/abs/...`, `/pdf/...`) routinely reset
27
+ # connections from cloud egress IPs (HF Spaces, AWS, GCP). The arXiv
28
+ # *export API* — same paper IDs, official endpoint — is far more stable.
29
+ # When we see an arxiv URL, we verify it by querying export.arxiv.org
30
+ # instead of HEAD'ing arxiv.org directly.
31
+ _ARXIV_URL_RE = re.compile(
32
+ r"^https?://(?:www\.)?arxiv\.org/(?:abs|pdf|html)/([\w.\-/]+?)(?:\.pdf|\.html)?(?:[?#]|$)",
33
+ re.IGNORECASE,
34
+ )
35
+ _ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
36
+
37
 
38
  @dataclass
39
  class URLFinding:
 
53
  self.max_workers = max_workers
54
  self.timeout = timeout
55
 
56
+ def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
57
+ """Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
58
+
59
+ Returns "ok" if the export API returns an Atom entry for the ID,
60
+ "broken" if the feed is empty (ID doesn't exist), or "unreachable"
61
+ if the API itself fails.
62
+ """
63
+ session = get_session()
64
+ try:
65
+ r = session.get(
66
+ _ARXIV_EXPORT_API,
67
+ params={"id_list": arxiv_id, "max_results": 1},
68
+ timeout=self.timeout,
69
+ )
70
+ r.raise_for_status()
71
+ except requests.RequestException as e:
72
+ logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
73
+ return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
74
+ # The Atom feed contains `<entry>` only when the ID resolves. An
75
+ # empty feed (totalResults=0) means the ID is bogus.
76
+ body = r.text or ""
77
+ if "<entry>" in body or "<entry " in body:
78
+ return URLFinding(entry_key, url, "ok", status_code=200)
79
+ return URLFinding(
80
+ entry_key, url, "broken",
81
+ status_code=200,
82
+ detail=f"arxiv id {arxiv_id!r} not found in export API",
83
+ )
84
+
85
  def _check_one(self, entry: BibEntry) -> Optional[URLFinding]:
86
  url = (entry.url or "").strip()
87
  if not url:
 
89
  if any(url.lower().startswith(p) for p in self.SKIP_PREFIXES):
90
  return URLFinding(entry.key, url, "skipped", detail="non-http scheme")
91
 
92
+ # arxiv.org HEAD requests get connection-reset on shared egress IPs.
93
+ # Re-route to the export API, which is the official liveness signal.
94
+ m = _ARXIV_URL_RE.match(url)
95
+ if m:
96
+ return self._check_arxiv_via_api(entry.key, url, m.group(1))
97
+
98
  session = get_session()
99
  try:
100
  r = session.head(url, allow_redirects=True, timeout=self.timeout)
src/fetchers/openalex_fetcher.py CHANGED
@@ -9,7 +9,7 @@ from typing import Optional
9
 
10
  import requests
11
 
12
- from src.utils.http import get_session, is_open, record_failure, record_success
13
 
14
  logger = logging.getLogger(__name__)
15
  _SOURCE = "openalex"
@@ -66,6 +66,12 @@ class OpenAlexFetcher:
66
 
67
  url = f"{self.BASE_URL}/works"
68
  params = {'search': title, 'per-page': max_results}
 
 
 
 
 
 
69
 
70
  try:
71
  response = get_session().get(url, params=params, timeout=8)
@@ -91,9 +97,11 @@ class OpenAlexFetcher:
91
 
92
  doi_url = f"https://doi.org/{doi}"
93
  url = f"{self.BASE_URL}/works/{doi_url}"
 
 
94
 
95
  try:
96
- response = get_session().get(url, timeout=8)
97
  response.raise_for_status()
98
  data = response.json()
99
  record_success(_SOURCE)
 
9
 
10
  import requests
11
 
12
+ from src.utils.http import get_session, is_open, record_failure, record_success, contact_email
13
 
14
  logger = logging.getLogger(__name__)
15
  _SOURCE = "openalex"
 
66
 
67
  url = f"{self.BASE_URL}/works"
68
  params = {'search': title, 'per-page': max_results}
69
+ # OpenAlex polite pool: docs explicitly recommend mailto as a query
70
+ # param (the UA-embedded mailto is honored too, but the query form is
71
+ # the canonical signal). Costs nothing if no email is configured.
72
+ email = contact_email()
73
+ if email:
74
+ params['mailto'] = email
75
 
76
  try:
77
  response = get_session().get(url, params=params, timeout=8)
 
97
 
98
  doi_url = f"https://doi.org/{doi}"
99
  url = f"{self.BASE_URL}/works/{doi_url}"
100
+ email = contact_email()
101
+ params = {'mailto': email} if email else None
102
 
103
  try:
104
+ response = get_session().get(url, params=params, timeout=8)
105
  response.raise_for_status()
106
  data = response.json()
107
  record_success(_SOURCE)
src/fetchers/retraction_fetcher.py CHANGED
@@ -14,10 +14,16 @@ from typing import Optional
14
 
15
  import requests
16
 
17
- from src.utils.http import get_session
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
 
 
 
21
 
22
  @dataclass
23
  class RetractionResult:
@@ -48,6 +54,13 @@ class RetractionFetcher:
48
  doi = doi.replace("https://doi.org/", "").replace("http://doi.org/", "").strip()
49
  if not doi:
50
  return None
 
 
 
 
 
 
 
51
 
52
  try:
53
  response = get_session().get(
@@ -56,10 +69,14 @@ class RetractionFetcher:
56
  timeout=20,
57
  )
58
  if response.status_code == 404:
 
 
59
  return None
60
  response.raise_for_status()
 
61
  except requests.RequestException as e:
62
  logger.debug("Retraction lookup failed for %s: %s", doi, e, exc_info=True)
 
63
  return None
64
 
65
  try:
 
14
 
15
  import requests
16
 
17
+ from src.utils.http import get_session, is_open, record_failure, record_success
18
 
19
  logger = logging.getLogger(__name__)
20
 
21
+ # Share the metadata fetcher's circuit breaker. Both hit the same
22
+ # api.crossref.org host, so if metadata lookups already proved Crossref is
23
+ # limiting us, we shouldn't keep firing retraction lookups at the same
24
+ # host — that just deepens the 429 hole.
25
+ _SOURCE = "crossref"
26
+
27
 
28
  @dataclass
29
  class RetractionResult:
 
54
  doi = doi.replace("https://doi.org/", "").replace("http://doi.org/", "").strip()
55
  if not doi:
56
  return None
57
+ # arXiv-shaped DOIs (10.48550/arXiv.*) are not indexed by Crossref — skip them outright
58
+ # rather than burning a guaranteed 404 (and a circuit-breaker tick)
59
+ # on every preprint in the bibliography.
60
+ if doi.lower().startswith("10.48550/arxiv"):
61
+ return None
62
+ if is_open(_SOURCE):
63
+ return None
64
 
65
  try:
66
  response = get_session().get(
 
69
  timeout=20,
70
  )
71
  if response.status_code == 404:
72
+ # 404 means "no such DOI" — that's a real answer, not a failure
73
+ record_success(_SOURCE)
74
  return None
75
  response.raise_for_status()
76
+ record_success(_SOURCE)
77
  except requests.RequestException as e:
78
  logger.debug("Retraction lookup failed for %s: %s", doi, e, exc_info=True)
79
+ record_failure(_SOURCE)
80
  return None
81
 
82
  try:
src/fetchers/semantic_scholar_fetcher.py CHANGED
@@ -38,14 +38,23 @@ class SemanticScholarFetcher:
38
  """
39
 
40
  BASE_URL = "https://api.semanticscholar.org/graph/v1"
41
- RATE_LIMIT_DELAY = 0.5 # Conservative delay (120 req/min max)
42
-
 
 
 
 
 
 
43
  def __init__(self, api_key: Optional[str] = None):
44
  """
45
  Semantic Scholar fetcher. Uses shared session; api_key is added per-call
46
  as a header so the cache key includes it.
47
  """
48
  self.api_key = api_key
 
 
 
49
  self._last_request_time = 0.0
50
 
51
  def _headers(self) -> dict:
 
38
  """
39
 
40
  BASE_URL = "https://api.semanticscholar.org/graph/v1"
41
+ # S2 publicly states its "introductory" rate for keyed clients is 1 r/s
42
+ # on all endpoints. Sustained 2 r/s (0.5s delay) was eating burst budget
43
+ # and risking 429s that trip our circuit breaker for the whole run.
44
+ # Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
45
+ # we'll 429 within a few calls anyway — the breaker handles that case.
46
+ RATE_LIMIT_DELAY_KEYED = 1.0
47
+ RATE_LIMIT_DELAY_ANON = 0.5
48
+
49
  def __init__(self, api_key: Optional[str] = None):
50
  """
51
  Semantic Scholar fetcher. Uses shared session; api_key is added per-call
52
  as a header so the cache key includes it.
53
  """
54
  self.api_key = api_key
55
+ self.RATE_LIMIT_DELAY = (
56
+ self.RATE_LIMIT_DELAY_KEYED if api_key else self.RATE_LIMIT_DELAY_ANON
57
+ )
58
  self._last_request_time = 0.0
59
 
60
  def _headers(self) -> dict:
src/utils/http.py CHANGED
@@ -74,6 +74,11 @@ def user_agent() -> str:
74
  return "BibGuard/1.0 (+https://github.com/thinkwee/BibGuard)"
75
 
76
 
 
 
 
 
 
77
  def _build_session() -> requests.Session:
78
  """Construct a Session with retry and (optionally) caching."""
79
  cache_enabled = _settings["cache_enabled"]
 
74
  return "BibGuard/1.0 (+https://github.com/thinkwee/BibGuard)"
75
 
76
 
77
+ def contact_email() -> str:
78
+ """Return the configured polite-pool contact email, or an empty string."""
79
+ return _settings.get("contact_email") or ""
80
+
81
+
82
  def _build_session() -> requests.Session:
83
  """Construct a Session with retry and (optionally) caching."""
84
  cache_enabled = _settings["cache_enabled"]