Spaces:

thinkwee
/

BibGuard

Running

App Files Community

thinkwee committed 8 days ago

Commit

58f0f1d

1 Parent(s): dc4b1cd

improve api stability

Browse files

Files changed (2) hide show

src/checkers/url_checker.py +22 -3
src/fetchers/semantic_scholar_fetcher.py +5 -4

src/checkers/url_checker.py CHANGED Viewed

@@ -18,7 +18,7 @@ from typing import Iterable, List, Optional
 import requests
-from src.utils.http import get_session
 from src.parsers.bib_parser import BibEntry
 logger = logging.getLogger(__name__)
@@ -33,6 +33,14 @@ _ARXIV_URL_RE = re.compile(
     re.IGNORECASE,
 )
 _ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
 @dataclass
@@ -56,10 +64,19 @@ class URLChecker:
     def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
         """Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
         Returns "ok" if the export API returns an Atom entry for the ID,
-        "broken" if the feed is empty (ID doesn't exist), or "unreachable"
-        if the API itself fails.
         """
         session = get_session()
         try:
             r = session.get(
@@ -68,8 +85,10 @@ class URLChecker:
                 timeout=self.timeout,
             )
             r.raise_for_status()
         except requests.RequestException as e:
             logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
             return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
         # The Atom feed contains `<entry>` only when the ID resolves. An
         # empty feed (totalResults=0) means the ID is bogus.

 import requests
+from src.utils.http import get_session, is_open, record_failure, record_success
 from src.parsers.bib_parser import BibEntry
 logger = logging.getLogger(__name__)
     re.IGNORECASE,
 )
 _ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
+# Share the arxiv metadata fetcher's circuit breaker. Both hit
+# export.arxiv.org; once the breaker is tripped (typically after 2 quick
+# 429s from the metadata fetcher), it makes no sense for the URL checker
+# to keep firing requests at the same dead host — that was producing
+# 18+ false "unreachable" findings for arxiv URLs that are actually fine.
+# When the breaker is open, mark the URL as "skipped" so the report
+# doesn't falsely claim it's broken.
+_ARXIV_SOURCE = "arxiv"
 @dataclass
     def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
         """Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
+        Honors the shared `arxiv` circuit breaker: if the metadata fetcher
+        already proved the host is rate-limiting us, we report "skipped"
+        rather than spamming the host and reporting bogus "unreachable".
         Returns "ok" if the export API returns an Atom entry for the ID,
+        "broken" if the feed is empty (ID doesn't exist), "skipped" if the
+        breaker is open, or "unreachable" if the API itself fails.
         """
+        if is_open(_ARXIV_SOURCE):
+            return URLFinding(
+                entry_key, url, "skipped",
+                detail="arxiv source rate-limited (circuit breaker open)",
+            )
         session = get_session()
         try:
             r = session.get(
                 timeout=self.timeout,
             )
             r.raise_for_status()
+            record_success(_ARXIV_SOURCE)
         except requests.RequestException as e:
             logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
+            record_failure(_ARXIV_SOURCE)
             return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
         # The Atom feed contains `<entry>` only when the ID resolves. An
         # empty feed (totalResults=0) means the ID is bogus.

src/fetchers/semantic_scholar_fetcher.py CHANGED Viewed

@@ -38,12 +38,13 @@ class SemanticScholarFetcher:
     """
     BASE_URL = "https://api.semanticscholar.org/graph/v1"
-    # S2 publicly states its "introductory" rate for keyed clients is 1 r/s
-    # on all endpoints. Sustained 2 r/s (0.5s delay) was eating burst budget
-    # and risking 429s that trip our circuit breaker for the whole run.
     # Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
     # we'll 429 within a few calls anyway — the breaker handles that case.
-    RATE_LIMIT_DELAY_KEYED = 1.0
     RATE_LIMIT_DELAY_ANON = 0.5
     def __init__(self, api_key: Optional[str] = None):

     """
     BASE_URL = "https://api.semanticscholar.org/graph/v1"
+    # S2's introductory keyed rate is 1 r/s cumulative across endpoints, and
+    # their welcome email explicitly says "Please set your rate limit to
+    # *below* this threshold". 1.1 s gives ~0.91 r/s sustained — under the
+    # ceiling, with a small margin for clock drift / thread scheduling jitter.
     # Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
     # we'll 429 within a few calls anyway — the breaker handles that case.
+    RATE_LIMIT_DELAY_KEYED = 1.1
     RATE_LIMIT_DELAY_ANON = 0.5
     def __init__(self, api_key: Optional[str] = None):