thinkwee committed on
Commit · dc4b1cd
1 Parent(s): f58a6b2
improve api stability
Browse files
- README.md +16 -0
- app.py +3 -1
- app_helper.py +21 -3
- main.py +4 -1
- src/checkers/url_checker.py +47 -0
- src/fetchers/openalex_fetcher.py +10 -2
- src/fetchers/retraction_fetcher.py +18 -1
- src/fetchers/semantic_scholar_fetcher.py +11 -2
- src/utils/http.py +5 -0
README.md
CHANGED
|
@@ -261,6 +261,22 @@ python app.py # or main.py
|
|
| 261 |
|
| 262 |
Comma- or space-separated, case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
## 🤝 Contributing
|
| 265 |
|
| 266 |
Contributions welcome. Open an issue or pull request.
|
|
|
|
| 261 |
|
| 262 |
Comma- or space-separated, case-insensitive. Other sources (CrossRef, Semantic Scholar, OpenAlex) keep working.
|
| 263 |
|
| 264 |
+
On a shared-IP deploy like HF Spaces, two env vars dramatically reduce false positives by lifting per-IP rate limits:
|
| 265 |
+
|
| 266 |
+
```bash
|
| 267 |
+
# Polite-pool User-Agent → CrossRef and OpenAlex switch us off the anonymous
|
| 268 |
+
# shared queue (which on HF Spaces is hammered by other tenants) and onto a
|
| 269 |
+
# separate fair queue.
|
| 270 |
+
export BIBGUARD_CONTACT_EMAIL="you@example.com"
|
| 271 |
+
|
| 272 |
+
# Semantic Scholar free API key. Without it, S2's limit is 100 req / 5 min
|
| 273 |
+
# shared across the entire HF egress IP — it 429s almost immediately and the
|
| 274 |
+
# circuit breaker disables S2 for the whole run, leaving only title-search
|
| 275 |
+
# fallbacks (which produce the mismatched-paper false positives).
|
| 276 |
+
# Request one at https://www.semanticscholar.org/product/api/
|
| 277 |
+
export SEMANTIC_SCHOLAR_API_KEY="..."
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
## 🤝 Contributing
|
| 281 |
|
| 282 |
Contributions welcome. Open an issue or pull request.
|
app.py
CHANGED
|
@@ -766,7 +766,9 @@ def _run_check_impl(
|
|
| 766 |
|
| 767 |
if bib_config.check_metadata:
|
| 768 |
arxiv_fetcher = ArxivFetcher()
|
| 769 |
-
ss_fetcher = SemanticScholarFetcher(
|
|
|
|
|
|
|
| 770 |
oa_fetcher = OpenAlexFetcher()
|
| 771 |
dblp_fetcher = DBLPFetcher()
|
| 772 |
crossref_fetcher = CrossRefFetcher()
|
|
|
|
| 766 |
|
| 767 |
if bib_config.check_metadata:
|
| 768 |
arxiv_fetcher = ArxivFetcher()
|
| 769 |
+
ss_fetcher = SemanticScholarFetcher(
|
| 770 |
+
api_key=os.environ.get("SEMANTIC_SCHOLAR_API_KEY") or None
|
| 771 |
+
)
|
| 772 |
oa_fetcher = OpenAlexFetcher()
|
| 773 |
dblp_fetcher = DBLPFetcher()
|
| 774 |
crossref_fetcher = CrossRefFetcher()
|
app_helper.py
CHANGED
|
@@ -43,6 +43,13 @@ _YEAR_TOL = 1
|
|
| 43 |
_TITLE_MATCH_TIGHT = 0.88
|
| 44 |
# Title similarity required to count as "corroborating" another source.
|
| 45 |
_TITLE_AGREE = 0.95
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
def _title_sim(a: str, b: str) -> float:
|
|
@@ -71,12 +78,17 @@ def _year_close(y1: str, y2: str) -> bool:
|
|
| 71 |
|
| 72 |
|
| 73 |
def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]:
|
| 74 |
-
"""Pick the candidate whose title most closely matches `bib_title`.
|
|
|
|
|
|
|
|
|
|
| 75 |
best, best_sim = None, 0.0
|
| 76 |
for c in candidates:
|
| 77 |
sim = _title_sim(bib_title, getattr(c, "title", "") or "")
|
| 78 |
if sim > best_sim:
|
| 79 |
best, best_sim = c, sim
|
|
|
|
|
|
|
| 80 |
return best, best_sim
|
| 81 |
|
| 82 |
|
|
@@ -98,12 +110,18 @@ def fetch_and_compare_with_workflow(
|
|
| 98 |
if not (has_doi or has_arxiv or has_title):
|
| 99 |
return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up")
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
# ------------------------------------------------------------------ stage 1
|
| 102 |
# Tasks are tuples of (source_name, callable returning ComparisonResult or None).
|
| 103 |
tasks: list[tuple[str, callable]] = []
|
| 104 |
|
| 105 |
# Identifier-based lookups (high precision).
|
| 106 |
-
if has_doi and crossref_fetcher:
|
| 107 |
def _t_cr_doi(e=entry):
|
| 108 |
r = crossref_fetcher.search_by_doi(e.doi)
|
| 109 |
return comparator.compare_with_crossref(e, r) if r else None
|
|
@@ -115,7 +133,7 @@ def fetch_and_compare_with_workflow(
|
|
| 115 |
return comparator.compare_with_semantic_scholar(e, r) if r else None
|
| 116 |
tasks.append(("s2(doi)", _t_s2_doi))
|
| 117 |
|
| 118 |
-
if has_doi and openalex_fetcher:
|
| 119 |
def _t_oa_doi(e=entry):
|
| 120 |
r = openalex_fetcher.fetch_by_doi(e.doi)
|
| 121 |
return comparator.compare_with_openalex(e, r) if r else None
|
|
|
|
| 43 |
_TITLE_MATCH_TIGHT = 0.88
|
| 44 |
# Title similarity required to count as "corroborating" another source.
|
| 45 |
_TITLE_AGREE = 0.95
|
| 46 |
+
# Floor for accepting a title-search candidate at all. Below this the
|
| 47 |
+
# "best candidate" is almost certainly an unrelated paper (e.g. OpenAlex's
|
| 48 |
+
# top hit for a 2025 arXiv preprint it doesn't yet index) and reporting it
|
| 49 |
+
# as a mismatch is a false positive — the bib entry is fine, the fetcher
|
| 50 |
+
# just returned junk. Tuned from observed false-positive data on HF Spaces
|
| 51 |
+
# runs where identifier lookups failed and only title-search survived.
|
| 52 |
+
_TITLE_CANDIDATE_FLOOR = 0.6
|
| 53 |
|
| 54 |
|
| 55 |
def _title_sim(a: str, b: str) -> float:
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]:
|
| 81 |
+
"""Pick the candidate whose title most closely matches `bib_title`.
|
| 82 |
+
|
| 83 |
+
Returns (None, 0.0) if no candidate clears `_TITLE_CANDIDATE_FLOOR`.
|
| 84 |
+
"""
|
| 85 |
best, best_sim = None, 0.0
|
| 86 |
for c in candidates:
|
| 87 |
sim = _title_sim(bib_title, getattr(c, "title", "") or "")
|
| 88 |
if sim > best_sim:
|
| 89 |
best, best_sim = c, sim
|
| 90 |
+
if best_sim < _TITLE_CANDIDATE_FLOOR:
|
| 91 |
+
return None, 0.0
|
| 92 |
return best, best_sim
|
| 93 |
|
| 94 |
|
|
|
|
| 110 |
if not (has_doi or has_arxiv or has_title):
|
| 111 |
return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up")
|
| 112 |
|
| 113 |
+
# arXiv-shaped DOIs (10.48550/ARXIV.*) are NOT indexed by Crossref or
|
| 114 |
+
# OpenAlex's DOI endpoint — querying them just burns retries on
|
| 115 |
+
# guaranteed 404s, which then trips the circuit breaker for the rest
|
| 116 |
+
# of the run. Route those to the arXiv / S2 arxiv-id paths instead.
|
| 117 |
+
doi_is_arxiv = has_doi and "10.48550/arxiv" in (entry.doi or "").lower()
|
| 118 |
+
|
| 119 |
# ------------------------------------------------------------------ stage 1
|
| 120 |
# Tasks are tuples of (source_name, callable returning ComparisonResult or None).
|
| 121 |
tasks: list[tuple[str, callable]] = []
|
| 122 |
|
| 123 |
# Identifier-based lookups (high precision).
|
| 124 |
+
if has_doi and crossref_fetcher and not doi_is_arxiv:
|
| 125 |
def _t_cr_doi(e=entry):
|
| 126 |
r = crossref_fetcher.search_by_doi(e.doi)
|
| 127 |
return comparator.compare_with_crossref(e, r) if r else None
|
|
|
|
| 133 |
return comparator.compare_with_semantic_scholar(e, r) if r else None
|
| 134 |
tasks.append(("s2(doi)", _t_s2_doi))
|
| 135 |
|
| 136 |
+
if has_doi and openalex_fetcher and not doi_is_arxiv:
|
| 137 |
def _t_oa_doi(e=entry):
|
| 138 |
r = openalex_fetcher.fetch_by_doi(e.doi)
|
| 139 |
return comparator.compare_with_openalex(e, r) if r else None
|
main.py
CHANGED
|
@@ -13,6 +13,7 @@ Usage:
|
|
| 13 |
"""
|
| 14 |
import argparse
|
| 15 |
import logging
|
|
|
|
| 16 |
import sys
|
| 17 |
from pathlib import Path
|
| 18 |
from typing import Optional, List
|
|
@@ -276,7 +277,9 @@ def run_checker(config: BibGuardConfig, template=None):
|
|
| 276 |
arxiv_fetcher = ArxivFetcher()
|
| 277 |
|
| 278 |
if bib_config.check_metadata:
|
| 279 |
-
semantic_scholar_fetcher = SemanticScholarFetcher(
|
|
|
|
|
|
|
| 280 |
openalex_fetcher = OpenAlexFetcher()
|
| 281 |
dblp_fetcher = DBLPFetcher()
|
| 282 |
crossref_fetcher = CrossRefFetcher()
|
|
|
|
| 13 |
"""
|
| 14 |
import argparse
|
| 15 |
import logging
|
| 16 |
+
import os
|
| 17 |
import sys
|
| 18 |
from pathlib import Path
|
| 19 |
from typing import Optional, List
|
|
|
|
| 277 |
arxiv_fetcher = ArxivFetcher()
|
| 278 |
|
| 279 |
if bib_config.check_metadata:
|
| 280 |
+
semantic_scholar_fetcher = SemanticScholarFetcher(
|
| 281 |
+
api_key=os.environ.get("SEMANTIC_SCHOLAR_API_KEY") or None
|
| 282 |
+
)
|
| 283 |
openalex_fetcher = OpenAlexFetcher()
|
| 284 |
dblp_fetcher = DBLPFetcher()
|
| 285 |
crossref_fetcher = CrossRefFetcher()
|
src/checkers/url_checker.py
CHANGED
|
@@ -12,6 +12,7 @@ from __future__ import annotations
|
|
| 12 |
|
| 13 |
import concurrent.futures
|
| 14 |
import logging
|
|
|
|
| 15 |
from dataclasses import dataclass
|
| 16 |
from typing import Iterable, List, Optional
|
| 17 |
|
|
@@ -22,6 +23,17 @@ from src.parsers.bib_parser import BibEntry
|
|
| 22 |
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
@dataclass
|
| 27 |
class URLFinding:
|
|
@@ -41,6 +53,35 @@ class URLChecker:
|
|
| 41 |
self.max_workers = max_workers
|
| 42 |
self.timeout = timeout
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
def _check_one(self, entry: BibEntry) -> Optional[URLFinding]:
|
| 45 |
url = (entry.url or "").strip()
|
| 46 |
if not url:
|
|
@@ -48,6 +89,12 @@ class URLChecker:
|
|
| 48 |
if any(url.lower().startswith(p) for p in self.SKIP_PREFIXES):
|
| 49 |
return URLFinding(entry.key, url, "skipped", detail="non-http scheme")
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
session = get_session()
|
| 52 |
try:
|
| 53 |
r = session.head(url, allow_redirects=True, timeout=self.timeout)
|
|
|
|
| 12 |
|
| 13 |
import concurrent.futures
|
| 14 |
import logging
|
| 15 |
+
import re
|
| 16 |
from dataclasses import dataclass
|
| 17 |
from typing import Iterable, List, Optional
|
| 18 |
|
|
|
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
| 26 |
+
# arxiv.org HTML endpoints (`/abs/...`, `/pdf/...`) routinely reset
|
| 27 |
+
# connections from cloud egress IPs (HF Spaces, AWS, GCP). The arXiv
|
| 28 |
+
# *export API* — same paper IDs, official endpoint — is far more stable.
|
| 29 |
+
# When we see an arxiv URL, we verify it by querying export.arxiv.org
|
| 30 |
+
# instead of HEAD'ing arxiv.org directly.
|
| 31 |
+
_ARXIV_URL_RE = re.compile(
|
| 32 |
+
r"^https?://(?:www\.)?arxiv\.org/(?:abs|pdf|html)/([\w.\-/]+?)(?:\.pdf|\.html)?(?:[?#]|$)",
|
| 33 |
+
re.IGNORECASE,
|
| 34 |
+
)
|
| 35 |
+
_ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
|
| 36 |
+
|
| 37 |
|
| 38 |
@dataclass
|
| 39 |
class URLFinding:
|
|
|
|
| 53 |
self.max_workers = max_workers
|
| 54 |
self.timeout = timeout
|
| 55 |
|
| 56 |
+
def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
|
| 57 |
+
"""Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
|
| 58 |
+
|
| 59 |
+
Returns "ok" if the export API returns an Atom entry for the ID,
|
| 60 |
+
"broken" if the feed is empty (ID doesn't exist), or "unreachable"
|
| 61 |
+
if the API itself fails.
|
| 62 |
+
"""
|
| 63 |
+
session = get_session()
|
| 64 |
+
try:
|
| 65 |
+
r = session.get(
|
| 66 |
+
_ARXIV_EXPORT_API,
|
| 67 |
+
params={"id_list": arxiv_id, "max_results": 1},
|
| 68 |
+
timeout=self.timeout,
|
| 69 |
+
)
|
| 70 |
+
r.raise_for_status()
|
| 71 |
+
except requests.RequestException as e:
|
| 72 |
+
logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
|
| 73 |
+
return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
|
| 74 |
+
# The Atom feed contains `<entry>` only when the ID resolves. An
|
| 75 |
+
# empty feed (totalResults=0) means the ID is bogus.
|
| 76 |
+
body = r.text or ""
|
| 77 |
+
if "<entry>" in body or "<entry " in body:
|
| 78 |
+
return URLFinding(entry_key, url, "ok", status_code=200)
|
| 79 |
+
return URLFinding(
|
| 80 |
+
entry_key, url, "broken",
|
| 81 |
+
status_code=200,
|
| 82 |
+
detail=f"arxiv id {arxiv_id!r} not found in export API",
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
def _check_one(self, entry: BibEntry) -> Optional[URLFinding]:
|
| 86 |
url = (entry.url or "").strip()
|
| 87 |
if not url:
|
|
|
|
| 89 |
if any(url.lower().startswith(p) for p in self.SKIP_PREFIXES):
|
| 90 |
return URLFinding(entry.key, url, "skipped", detail="non-http scheme")
|
| 91 |
|
| 92 |
+
# arxiv.org HEAD requests get connection-reset on shared egress IPs.
|
| 93 |
+
# Re-route to the export API, which is the official liveness signal.
|
| 94 |
+
m = _ARXIV_URL_RE.match(url)
|
| 95 |
+
if m:
|
| 96 |
+
return self._check_arxiv_via_api(entry.key, url, m.group(1))
|
| 97 |
+
|
| 98 |
session = get_session()
|
| 99 |
try:
|
| 100 |
r = session.head(url, allow_redirects=True, timeout=self.timeout)
|
src/fetchers/openalex_fetcher.py
CHANGED
|
@@ -9,7 +9,7 @@ from typing import Optional
|
|
| 9 |
|
| 10 |
import requests
|
| 11 |
|
| 12 |
-
from src.utils.http import get_session, is_open, record_failure, record_success
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
_SOURCE = "openalex"
|
|
@@ -66,6 +66,12 @@ class OpenAlexFetcher:
|
|
| 66 |
|
| 67 |
url = f"{self.BASE_URL}/works"
|
| 68 |
params = {'search': title, 'per-page': max_results}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
try:
|
| 71 |
response = get_session().get(url, params=params, timeout=8)
|
|
@@ -91,9 +97,11 @@ class OpenAlexFetcher:
|
|
| 91 |
|
| 92 |
doi_url = f"https://doi.org/{doi}"
|
| 93 |
url = f"{self.BASE_URL}/works/{doi_url}"
|
|
|
|
|
|
|
| 94 |
|
| 95 |
try:
|
| 96 |
-
response = get_session().get(url, timeout=8)
|
| 97 |
response.raise_for_status()
|
| 98 |
data = response.json()
|
| 99 |
record_success(_SOURCE)
|
|
|
|
| 9 |
|
| 10 |
import requests
|
| 11 |
|
| 12 |
+
from src.utils.http import get_session, is_open, record_failure, record_success, contact_email
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
_SOURCE = "openalex"
|
|
|
|
| 66 |
|
| 67 |
url = f"{self.BASE_URL}/works"
|
| 68 |
params = {'search': title, 'per-page': max_results}
|
| 69 |
+
# OpenAlex polite pool: docs explicitly recommend mailto as a query
|
| 70 |
+
# param (the UA-embedded mailto is honored too, but the query form is
|
| 71 |
+
# the canonical signal). Costs nothing if no email is configured.
|
| 72 |
+
email = contact_email()
|
| 73 |
+
if email:
|
| 74 |
+
params['mailto'] = email
|
| 75 |
|
| 76 |
try:
|
| 77 |
response = get_session().get(url, params=params, timeout=8)
|
|
|
|
| 97 |
|
| 98 |
doi_url = f"https://doi.org/{doi}"
|
| 99 |
url = f"{self.BASE_URL}/works/{doi_url}"
|
| 100 |
+
email = contact_email()
|
| 101 |
+
params = {'mailto': email} if email else None
|
| 102 |
|
| 103 |
try:
|
| 104 |
+
response = get_session().get(url, params=params, timeout=8)
|
| 105 |
response.raise_for_status()
|
| 106 |
data = response.json()
|
| 107 |
record_success(_SOURCE)
|
src/fetchers/retraction_fetcher.py
CHANGED
|
@@ -14,10 +14,16 @@ from typing import Optional
|
|
| 14 |
|
| 15 |
import requests
|
| 16 |
|
| 17 |
-
from src.utils.http import get_session
|
| 18 |
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
@dataclass
|
| 23 |
class RetractionResult:
|
|
@@ -48,6 +54,13 @@ class RetractionFetcher:
|
|
| 48 |
doi = doi.replace("https://doi.org/", "").replace("http://doi.org/", "").strip()
|
| 49 |
if not doi:
|
| 50 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
try:
|
| 53 |
response = get_session().get(
|
|
@@ -56,10 +69,14 @@ class RetractionFetcher:
|
|
| 56 |
timeout=20,
|
| 57 |
)
|
| 58 |
if response.status_code == 404:
|
|
|
|
|
|
|
| 59 |
return None
|
| 60 |
response.raise_for_status()
|
|
|
|
| 61 |
except requests.RequestException as e:
|
| 62 |
logger.debug("Retraction lookup failed for %s: %s", doi, e, exc_info=True)
|
|
|
|
| 63 |
return None
|
| 64 |
|
| 65 |
try:
|
|
|
|
| 14 |
|
| 15 |
import requests
|
| 16 |
|
| 17 |
+
from src.utils.http import get_session, is_open, record_failure, record_success
|
| 18 |
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
+
# Share the metadata fetcher's circuit breaker. Both hit the same
|
| 22 |
+
# api.crossref.org host, so if metadata lookups already proved Crossref is
|
| 23 |
+
# limiting us, we shouldn't keep firing retraction lookups at the same
|
| 24 |
+
# host — that just deepens the 429 hole.
|
| 25 |
+
_SOURCE = "crossref"
|
| 26 |
+
|
| 27 |
|
| 28 |
@dataclass
|
| 29 |
class RetractionResult:
|
|
|
|
| 54 |
doi = doi.replace("https://doi.org/", "").replace("http://doi.org/", "").strip()
|
| 55 |
if not doi:
|
| 56 |
return None
|
| 57 |
+
# arxiv-shaped DOIs are not indexed by Crossref — skip them outright
|
| 58 |
+
# rather than burning a guaranteed 404 (and a circuit-breaker tick)
|
| 59 |
+
# on every preprint in the bibliography.
|
| 60 |
+
if doi.lower().startswith("10.48550/arxiv"):
|
| 61 |
+
return None
|
| 62 |
+
if is_open(_SOURCE):
|
| 63 |
+
return None
|
| 64 |
|
| 65 |
try:
|
| 66 |
response = get_session().get(
|
|
|
|
| 69 |
timeout=20,
|
| 70 |
)
|
| 71 |
if response.status_code == 404:
|
| 72 |
+
# 404 means "no such DOI" — that's a real answer, not a failure
|
| 73 |
+
record_success(_SOURCE)
|
| 74 |
return None
|
| 75 |
response.raise_for_status()
|
| 76 |
+
record_success(_SOURCE)
|
| 77 |
except requests.RequestException as e:
|
| 78 |
logger.debug("Retraction lookup failed for %s: %s", doi, e, exc_info=True)
|
| 79 |
+
record_failure(_SOURCE)
|
| 80 |
return None
|
| 81 |
|
| 82 |
try:
|
src/fetchers/semantic_scholar_fetcher.py
CHANGED
|
@@ -38,14 +38,23 @@ class SemanticScholarFetcher:
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
BASE_URL = "https://api.semanticscholar.org/graph/v1"
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
def __init__(self, api_key: Optional[str] = None):
|
| 44 |
"""
|
| 45 |
Semantic Scholar fetcher. Uses shared session; api_key is added per-call
|
| 46 |
as a header so the cache key includes it.
|
| 47 |
"""
|
| 48 |
self.api_key = api_key
|
|
|
|
|
|
|
|
|
|
| 49 |
self._last_request_time = 0.0
|
| 50 |
|
| 51 |
def _headers(self) -> dict:
|
|
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
BASE_URL = "https://api.semanticscholar.org/graph/v1"
|
| 41 |
+
# S2 publicly states its "introductory" rate for keyed clients is 1 r/s
|
| 42 |
+
# on all endpoints. Sustained 2 r/s (0.5s delay) was eating burst budget
|
| 43 |
+
# and risking 429s that trip our circuit breaker for the whole run.
|
| 44 |
+
# Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
|
| 45 |
+
# we'll 429 within a few calls anyway — the breaker handles that case.
|
| 46 |
+
RATE_LIMIT_DELAY_KEYED = 1.0
|
| 47 |
+
RATE_LIMIT_DELAY_ANON = 0.5
|
| 48 |
+
|
| 49 |
def __init__(self, api_key: Optional[str] = None):
|
| 50 |
"""
|
| 51 |
Semantic Scholar fetcher. Uses shared session; api_key is added per-call
|
| 52 |
as a header so the cache key includes it.
|
| 53 |
"""
|
| 54 |
self.api_key = api_key
|
| 55 |
+
self.RATE_LIMIT_DELAY = (
|
| 56 |
+
self.RATE_LIMIT_DELAY_KEYED if api_key else self.RATE_LIMIT_DELAY_ANON
|
| 57 |
+
)
|
| 58 |
self._last_request_time = 0.0
|
| 59 |
|
| 60 |
def _headers(self) -> dict:
|
src/utils/http.py
CHANGED
|
@@ -74,6 +74,11 @@ def user_agent() -> str:
|
|
| 74 |
return "BibGuard/1.0 (+https://github.com/thinkwee/BibGuard)"
|
| 75 |
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
def _build_session() -> requests.Session:
|
| 78 |
"""Construct a Session with retry and (optionally) caching."""
|
| 79 |
cache_enabled = _settings["cache_enabled"]
|
|
|
|
| 74 |
return "BibGuard/1.0 (+https://github.com/thinkwee/BibGuard)"
|
| 75 |
|
| 76 |
|
| 77 |
+
def contact_email() -> str:
|
| 78 |
+
"""Return the configured polite-pool contact email, or empty string."""
|
| 79 |
+
return _settings.get("contact_email") or ""
|
| 80 |
+
|
| 81 |
+
|
| 82 |
def _build_session() -> requests.Session:
|
| 83 |
"""Construct a Session with retry and (optionally) caching."""
|
| 84 |
cache_enabled = _settings["cache_enabled"]
|