thinkwee committed on
Commit ·
58f0f1d
1
Parent(s): dc4b1cd
improve api stability
Browse files
src/checkers/url_checker.py
CHANGED
|
@@ -18,7 +18,7 @@ from typing import Iterable, List, Optional
|
|
| 18 |
|
| 19 |
import requests
|
| 20 |
|
| 21 |
-
from src.utils.http import get_session
|
| 22 |
from src.parsers.bib_parser import BibEntry
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
|
@@ -33,6 +33,14 @@ _ARXIV_URL_RE = re.compile(
|
|
| 33 |
re.IGNORECASE,
|
| 34 |
)
|
| 35 |
_ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
|
| 38 |
@dataclass
|
|
@@ -56,10 +64,19 @@ class URLChecker:
|
|
| 56 |
def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
|
| 57 |
"""Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
Returns "ok" if the export API returns an Atom entry for the ID,
|
| 60 |
-
"broken" if the feed is empty (ID doesn't exist),
|
| 61 |
-
or "unreachable" if the API itself fails.
|
| 62 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
session = get_session()
|
| 64 |
try:
|
| 65 |
r = session.get(
|
|
@@ -68,8 +85,10 @@ class URLChecker:
|
|
| 68 |
timeout=self.timeout,
|
| 69 |
)
|
| 70 |
r.raise_for_status()
|
|
|
|
| 71 |
except requests.RequestException as e:
|
| 72 |
logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
|
|
|
|
| 73 |
return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
|
| 74 |
# The Atom feed contains `<entry>` only when the ID resolves. An
|
| 75 |
# empty feed (totalResults=0) means the ID is bogus.
|
|
|
|
| 18 |
|
| 19 |
import requests
|
| 20 |
|
| 21 |
+
from src.utils.http import get_session, is_open, record_failure, record_success
|
| 22 |
from src.parsers.bib_parser import BibEntry
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
|
|
|
| 33 |
re.IGNORECASE,
|
| 34 |
)
|
| 35 |
_ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
|
| 36 |
+
# Share the arxiv metadata fetcher's circuit breaker. Both hit
|
| 37 |
+
# export.arxiv.org; once the breaker is tripped (typically after 2 quick
|
| 38 |
+
# 429s from the metadata fetcher), it makes no sense for the URL checker
|
| 39 |
+
# to keep firing requests at the same dead host — that was producing
|
| 40 |
+
# 18+ false "unreachable" findings for arxiv URLs that are actually fine.
|
| 41 |
+
# When the breaker is open, mark the URL as "skipped" so the report
|
| 42 |
+
# doesn't falsely claim it's broken.
|
| 43 |
+
_ARXIV_SOURCE = "arxiv"
|
| 44 |
|
| 45 |
|
| 46 |
@dataclass
|
|
|
|
| 64 |
def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
|
| 65 |
"""Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
|
| 66 |
|
| 67 |
+
Honors the shared `arxiv` circuit breaker: if the metadata fetcher
|
| 68 |
+
already proved the host is rate-limiting us, we report "skipped"
|
| 69 |
+
rather than spamming the host and reporting bogus "unreachable".
|
| 70 |
+
|
| 71 |
Returns "ok" if the export API returns an Atom entry for the ID,
|
| 72 |
+
"broken" if the feed is empty (ID doesn't exist), "skipped" if the
|
| 73 |
+
breaker is open, or "unreachable" if the API itself fails.
|
| 74 |
"""
|
| 75 |
+
if is_open(_ARXIV_SOURCE):
|
| 76 |
+
return URLFinding(
|
| 77 |
+
entry_key, url, "skipped",
|
| 78 |
+
detail="arxiv source rate-limited (circuit breaker open)",
|
| 79 |
+
)
|
| 80 |
session = get_session()
|
| 81 |
try:
|
| 82 |
r = session.get(
|
|
|
|
| 85 |
timeout=self.timeout,
|
| 86 |
)
|
| 87 |
r.raise_for_status()
|
| 88 |
+
record_success(_ARXIV_SOURCE)
|
| 89 |
except requests.RequestException as e:
|
| 90 |
logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
|
| 91 |
+
record_failure(_ARXIV_SOURCE)
|
| 92 |
return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
|
| 93 |
# The Atom feed contains `<entry>` only when the ID resolves. An
|
| 94 |
# empty feed (totalResults=0) means the ID is bogus.
|
src/fetchers/semantic_scholar_fetcher.py
CHANGED
|
@@ -38,12 +38,13 @@ class SemanticScholarFetcher:
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
BASE_URL = "https://api.semanticscholar.org/graph/v1"
|
| 41 |
-
# S2
|
| 42 |
-
#
|
| 43 |
-
#
|
|
|
|
| 44 |
# Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
|
| 45 |
# we'll 429 within a few calls anyway — the breaker handles that case.
|
| 46 |
-
RATE_LIMIT_DELAY_KEYED = 1.
|
| 47 |
RATE_LIMIT_DELAY_ANON = 0.5
|
| 48 |
|
| 49 |
def __init__(self, api_key: Optional[str] = None):
|
|
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
BASE_URL = "https://api.semanticscholar.org/graph/v1"
|
| 41 |
+
# S2's introductory keyed rate is 1 r/s cumulative across endpoints, and
|
| 42 |
+
# their welcome email explicitly says "Please set your rate limit to
|
| 43 |
+
# *below* this threshold". 1.1 s gives ~0.91 r/s sustained — under the
|
| 44 |
+
# ceiling, with a small margin for clock drift / thread scheduling jitter.
|
| 45 |
# Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
|
| 46 |
# we'll 429 within a few calls anyway — the breaker handles that case.
|
| 47 |
+
RATE_LIMIT_DELAY_KEYED = 1.1
|
| 48 |
RATE_LIMIT_DELAY_ANON = 0.5
|
| 49 |
|
| 50 |
def __init__(self, api_key: Optional[str] = None):
|