Spaces:

thinkwee
/

BibGuard

Running

App Files Files Community

thinkwee committed 15 days ago

Commit

dc4b1cd

1 Parent(s): f58a6b2

improve api stability

Browse files

Files changed (9) hide show

README.md +16 -0
app.py +3 -1
app_helper.py +21 -3
main.py +4 -1
src/checkers/url_checker.py +47 -0
src/fetchers/openalex_fetcher.py +10 -2
src/fetchers/retraction_fetcher.py +18 -1
src/fetchers/semantic_scholar_fetcher.py +11 -2
src/utils/http.py +5 -0

README.md CHANGED Viewed

@@ -261,6 +261,22 @@ python app.py     # or main.py
 Comma- or space-separated, case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
 ## 🤝 Contributing
 Contributions welcome. Open an issue or pull request.

 Comma- or space-separated, case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
+On a shared-IP deployment like HF Spaces, two environment variables dramatically reduce false positives by lifting per-IP rate limits:
+```bash
+# Contact email for the polite-pool User-Agent → CrossRef and OpenAlex move us
+# off the anonymous shared queue (which on HF Spaces is hammered by other
+# tenants) and onto a separate fair queue.
+export BIBGUARD_CONTACT_EMAIL="you@example.com"
+# Semantic Scholar free API key. Without it, S2's limit is 100 req / 5 min
+# shared across the entire HF egress IP — it 429s almost immediately and the
+# circuit breaker disables S2 for the whole run, leaving only title-search
+# fallbacks (which produce the mismatched-paper false positives).
+# Request one at https://www.semanticscholar.org/product/api/
+export SEMANTIC_SCHOLAR_API_KEY="..."
+```
 ## 🤝 Contributing
 Contributions welcome. Open an issue or pull request.

app.py CHANGED Viewed

@@ -766,7 +766,9 @@ def _run_check_impl(
     if bib_config.check_metadata:
         arxiv_fetcher = ArxivFetcher()
-        ss_fetcher = SemanticScholarFetcher()
         oa_fetcher = OpenAlexFetcher()
         dblp_fetcher = DBLPFetcher()
         crossref_fetcher = CrossRefFetcher()

     if bib_config.check_metadata:
         arxiv_fetcher = ArxivFetcher()
+        ss_fetcher = SemanticScholarFetcher(
+            api_key=os.environ.get("SEMANTIC_SCHOLAR_API_KEY") or None
+        )
         oa_fetcher = OpenAlexFetcher()
         dblp_fetcher = DBLPFetcher()
         crossref_fetcher = CrossRefFetcher()

app_helper.py CHANGED Viewed

@@ -43,6 +43,13 @@ _YEAR_TOL = 1
 _TITLE_MATCH_TIGHT = 0.88
 # Title similarity required to count as "corroborating" another source.
 _TITLE_AGREE = 0.95
 def _title_sim(a: str, b: str) -> float:
@@ -71,12 +78,17 @@ def _year_close(y1: str, y2: str) -> bool:
 def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]:
-    """Pick the candidate whose title most closely matches `bib_title`."""
     best, best_sim = None, 0.0
     for c in candidates:
         sim = _title_sim(bib_title, getattr(c, "title", "") or "")
         if sim > best_sim:
             best, best_sim = c, sim
     return best, best_sim
@@ -98,12 +110,18 @@ def fetch_and_compare_with_workflow(
     if not (has_doi or has_arxiv or has_title):
         return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up")
     # ------------------------------------------------------------------ stage 1
     # Tasks are tuples of (source_name, callable returning ComparisonResult or None).
     tasks: list[tuple[str, callable]] = []
     # Identifier-based lookups (high precision).
-    if has_doi and crossref_fetcher:
         def _t_cr_doi(e=entry):
             r = crossref_fetcher.search_by_doi(e.doi)
             return comparator.compare_with_crossref(e, r) if r else None
@@ -115,7 +133,7 @@ def fetch_and_compare_with_workflow(
             return comparator.compare_with_semantic_scholar(e, r) if r else None
         tasks.append(("s2(doi)", _t_s2_doi))
-    if has_doi and openalex_fetcher:
         def _t_oa_doi(e=entry):
             r = openalex_fetcher.fetch_by_doi(e.doi)
             return comparator.compare_with_openalex(e, r) if r else None

 _TITLE_MATCH_TIGHT = 0.88
 # Title similarity required to count as "corroborating" another source.
 _TITLE_AGREE = 0.95
+# Floor for accepting a title-search candidate at all. Below this the
+# "best candidate" is almost certainly an unrelated paper (e.g. OpenAlex's
+# top hit for a 2025 arXiv preprint it doesn't yet index) and reporting it
+# as a mismatch is a false positive — the bib entry is fine, the fetcher
+# just returned junk. Tuned from observed false-positive data on HF Spaces
+# runs where identifier lookups failed and only title-search survived.
+_TITLE_CANDIDATE_FLOOR = 0.6
 def _title_sim(a: str, b: str) -> float:
 def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]:
+    """Pick the candidate whose title most closely matches `bib_title`.
+    Returns (None, 0.0) if no candidate clears `_TITLE_CANDIDATE_FLOOR`.
+    """
     best, best_sim = None, 0.0
     for c in candidates:
         sim = _title_sim(bib_title, getattr(c, "title", "") or "")
         if sim > best_sim:
             best, best_sim = c, sim
+    if best_sim < _TITLE_CANDIDATE_FLOOR:
+        return None, 0.0
     return best, best_sim
     if not (has_doi or has_arxiv or has_title):
         return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up")
+    # arXiv-shaped DOIs (10.48550/ARXIV.*) are NOT indexed by Crossref or
+    # OpenAlex's DOI endpoint — querying them just burns retries on
+    # guaranteed 404s, which then trips the circuit breaker for the rest
+    # of the run. Route those to the arXiv / S2 arxiv-id paths instead.
+    doi_is_arxiv = has_doi and "10.48550/arxiv" in (entry.doi or "").lower()
     # ------------------------------------------------------------------ stage 1
     # Tasks are tuples of (source_name, callable returning ComparisonResult or None).
     tasks: list[tuple[str, callable]] = []
     # Identifier-based lookups (high precision).
+    if has_doi and crossref_fetcher and not doi_is_arxiv:
         def _t_cr_doi(e=entry):
             r = crossref_fetcher.search_by_doi(e.doi)
             return comparator.compare_with_crossref(e, r) if r else None
             return comparator.compare_with_semantic_scholar(e, r) if r else None
         tasks.append(("s2(doi)", _t_s2_doi))
+    if has_doi and openalex_fetcher and not doi_is_arxiv:
         def _t_oa_doi(e=entry):
             r = openalex_fetcher.fetch_by_doi(e.doi)
             return comparator.compare_with_openalex(e, r) if r else None

main.py CHANGED Viewed

@@ -13,6 +13,7 @@ Usage:
 """
 import argparse
 import logging
 import sys
 from pathlib import Path
 from typing import Optional, List
@@ -276,7 +277,9 @@ def run_checker(config: BibGuardConfig, template=None):
         arxiv_fetcher = ArxivFetcher()
     if bib_config.check_metadata:
-        semantic_scholar_fetcher = SemanticScholarFetcher()
         openalex_fetcher = OpenAlexFetcher()
         dblp_fetcher = DBLPFetcher()
         crossref_fetcher = CrossRefFetcher()

 """
 import argparse
 import logging
+import os
 import sys
 from pathlib import Path
 from typing import Optional, List
         arxiv_fetcher = ArxivFetcher()
     if bib_config.check_metadata:
+        semantic_scholar_fetcher = SemanticScholarFetcher(
+            api_key=os.environ.get("SEMANTIC_SCHOLAR_API_KEY") or None
+        )
         openalex_fetcher = OpenAlexFetcher()
         dblp_fetcher = DBLPFetcher()
         crossref_fetcher = CrossRefFetcher()

src/checkers/url_checker.py CHANGED Viewed

@@ -12,6 +12,7 @@ from __future__ import annotations
 import concurrent.futures
 import logging
 from dataclasses import dataclass
 from typing import Iterable, List, Optional
@@ -22,6 +23,17 @@ from src.parsers.bib_parser import BibEntry
 logger = logging.getLogger(__name__)
 @dataclass
 class URLFinding:
@@ -41,6 +53,35 @@ class URLChecker:
         self.max_workers = max_workers
         self.timeout = timeout
     def _check_one(self, entry: BibEntry) -> Optional[URLFinding]:
         url = (entry.url or "").strip()
         if not url:
@@ -48,6 +89,12 @@ class URLChecker:
         if any(url.lower().startswith(p) for p in self.SKIP_PREFIXES):
             return URLFinding(entry.key, url, "skipped", detail="non-http scheme")
         session = get_session()
         try:
             r = session.head(url, allow_redirects=True, timeout=self.timeout)

 import concurrent.futures
 import logging
+import re
 from dataclasses import dataclass
 from typing import Iterable, List, Optional
 logger = logging.getLogger(__name__)
+# arxiv.org web endpoints (`/abs/...`, `/pdf/...`, `/html/...`) routinely reset
+# connections from cloud egress IPs (HF Spaces, AWS, GCP). The arXiv
+# *export API* — same paper IDs, official endpoint — is far more stable.
+# When we see an arxiv URL, we verify it by querying export.arxiv.org
+# instead of HEAD'ing arxiv.org directly.
+_ARXIV_URL_RE = re.compile(
+    r"^https?://(?:www\.)?arxiv\.org/(?:abs|pdf|html)/([\w.\-/]+?)(?:\.pdf|\.html)?(?:[?#]|$)",
+    re.IGNORECASE,
+)
+_ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
 @dataclass
 class URLFinding:
         self.max_workers = max_workers
         self.timeout = timeout
+    def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
+        """Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
+        Returns "ok" if the export API returns an Atom entry for the ID,
+        "broken" if the feed is empty (ID doesn't exist), or "unreachable"
+        if the API itself fails.
+        """
+        session = get_session()
+        try:
+            r = session.get(
+                _ARXIV_EXPORT_API,
+                params={"id_list": arxiv_id, "max_results": 1},
+                timeout=self.timeout,
+            )
+            r.raise_for_status()
+        except requests.RequestException as e:
+            logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
+            return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
+        # The Atom feed contains `<entry>` only when the ID resolves. An
+        # empty feed (totalResults=0) means the ID is bogus.
+        body = r.text or ""
+        if "<entry>" in body or "<entry " in body:
+            return URLFinding(entry_key, url, "ok", status_code=200)
+        return URLFinding(
+            entry_key, url, "broken",
+            status_code=200,
+            detail=f"arxiv id {arxiv_id!r} not found in export API",
+        )
     def _check_one(self, entry: BibEntry) -> Optional[URLFinding]:
         url = (entry.url or "").strip()
         if not url:
         if any(url.lower().startswith(p) for p in self.SKIP_PREFIXES):
             return URLFinding(entry.key, url, "skipped", detail="non-http scheme")
+        # arxiv.org HEAD requests get connection-reset on shared egress IPs.
+        # Re-route to the export API, which is the official liveness signal.
+        m = _ARXIV_URL_RE.match(url)
+        if m:
+            return self._check_arxiv_via_api(entry.key, url, m.group(1))
         session = get_session()
         try:
             r = session.head(url, allow_redirects=True, timeout=self.timeout)

src/fetchers/openalex_fetcher.py CHANGED Viewed

@@ -9,7 +9,7 @@ from typing import Optional
 import requests
-from src.utils.http import get_session, is_open, record_failure, record_success
 logger = logging.getLogger(__name__)
 _SOURCE = "openalex"
@@ -66,6 +66,12 @@ class OpenAlexFetcher:
         url = f"{self.BASE_URL}/works"
         params = {'search': title, 'per-page': max_results}
         try:
             response = get_session().get(url, params=params, timeout=8)
@@ -91,9 +97,11 @@ class OpenAlexFetcher:
         doi_url = f"https://doi.org/{doi}"
         url = f"{self.BASE_URL}/works/{doi_url}"
         try:
-            response = get_session().get(url, timeout=8)
             response.raise_for_status()
             data = response.json()
             record_success(_SOURCE)

 import requests
+from src.utils.http import get_session, is_open, record_failure, record_success, contact_email
 logger = logging.getLogger(__name__)
 _SOURCE = "openalex"
         url = f"{self.BASE_URL}/works"
         params = {'search': title, 'per-page': max_results}
+        # OpenAlex polite pool: docs explicitly recommend mailto as a query
+        # param (the UA-embedded mailto is honored too, but the query form is
+        # the canonical signal). Costs nothing if no email is configured.
+        email = contact_email()
+        if email:
+            params['mailto'] = email
         try:
             response = get_session().get(url, params=params, timeout=8)
         doi_url = f"https://doi.org/{doi}"
         url = f"{self.BASE_URL}/works/{doi_url}"
+        email = contact_email()
+        params = {'mailto': email} if email else None
         try:
+            response = get_session().get(url, params=params, timeout=8)
             response.raise_for_status()
             data = response.json()
             record_success(_SOURCE)

src/fetchers/retraction_fetcher.py CHANGED Viewed

@@ -14,10 +14,16 @@ from typing import Optional
 import requests
-from src.utils.http import get_session
 logger = logging.getLogger(__name__)
 @dataclass
 class RetractionResult:
@@ -48,6 +54,13 @@ class RetractionFetcher:
         doi = doi.replace("https://doi.org/", "").replace("http://doi.org/", "").strip()
         if not doi:
             return None
         try:
             response = get_session().get(
@@ -56,10 +69,14 @@ class RetractionFetcher:
                 timeout=20,
             )
             if response.status_code == 404:
                 return None
             response.raise_for_status()
         except requests.RequestException as e:
             logger.debug("Retraction lookup failed for %s: %s", doi, e, exc_info=True)
             return None
         try:

 import requests
+from src.utils.http import get_session, is_open, record_failure, record_success
 logger = logging.getLogger(__name__)
+# Share the metadata fetcher's circuit breaker. Both hit the same
+# api.crossref.org host, so if metadata lookups already proved Crossref is
+# limiting us, we shouldn't keep firing retraction lookups at the same
+# host — that just deepens the 429 hole.
+_SOURCE = "crossref"
 @dataclass
 class RetractionResult:
         doi = doi.replace("https://doi.org/", "").replace("http://doi.org/", "").strip()
         if not doi:
             return None
+        # arXiv-shaped DOIs are not indexed by Crossref — skip them outright
+        # rather than burning a guaranteed 404 (and a circuit-breaker tick)
+        # on every preprint in the bibliography.
+        if doi.lower().startswith("10.48550/arxiv"):
+            return None
+        if is_open(_SOURCE):
+            return None
         try:
             response = get_session().get(
                 timeout=20,
             )
             if response.status_code == 404:
+                # 404 means "no such DOI" — that's a real answer, not a failure
+                record_success(_SOURCE)
                 return None
             response.raise_for_status()
+            record_success(_SOURCE)
         except requests.RequestException as e:
             logger.debug("Retraction lookup failed for %s: %s", doi, e, exc_info=True)
+            record_failure(_SOURCE)
             return None
         try:

src/fetchers/semantic_scholar_fetcher.py CHANGED Viewed

@@ -38,14 +38,23 @@ class SemanticScholarFetcher:
     """
     BASE_URL = "https://api.semanticscholar.org/graph/v1"
-    RATE_LIMIT_DELAY = 0.5  # Conservative delay (120 req/min max)
     def __init__(self, api_key: Optional[str] = None):
         """
         Semantic Scholar fetcher. Uses shared session; api_key is added per-call
         as a header so the cache key includes it.
         """
         self.api_key = api_key
         self._last_request_time = 0.0
     def _headers(self) -> dict:

     """
     BASE_URL = "https://api.semanticscholar.org/graph/v1"
+    # S2 publicly states its "introductory" rate for keyed clients is 1 r/s
+    # on all endpoints. Sustained 2 r/s (0.5s delay) was eating burst budget
+    # and risking 429s that trip our circuit breaker for the whole run.
+    # Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
+    # we'll 429 within a few calls anyway — the breaker handles that case.
+    RATE_LIMIT_DELAY_KEYED = 1.0
+    RATE_LIMIT_DELAY_ANON = 0.5
     def __init__(self, api_key: Optional[str] = None):
         """
         Semantic Scholar fetcher. Uses shared session; api_key is added per-call
         as a header so the cache key includes it.
         """
         self.api_key = api_key
+        self.RATE_LIMIT_DELAY = (
+            self.RATE_LIMIT_DELAY_KEYED if api_key else self.RATE_LIMIT_DELAY_ANON
+        )
         self._last_request_time = 0.0
     def _headers(self) -> dict:

src/utils/http.py CHANGED Viewed

@@ -74,6 +74,11 @@ def user_agent() -> str:
     return "BibGuard/1.0 (+https://github.com/thinkwee/BibGuard)"
 def _build_session() -> requests.Session:
     """Construct a Session with retry and (optionally) caching."""
     cache_enabled = _settings["cache_enabled"]

     return "BibGuard/1.0 (+https://github.com/thinkwee/BibGuard)"
+def contact_email() -> str:
+    """Return the configured polite-pool contact email, or empty string."""
+    return _settings.get("contact_email") or ""
 def _build_session() -> requests.Session:
     """Construct a Session with retry and (optionally) caching."""
     cache_enabled = _settings["cache_enabled"]