thinkwee committed on
Commit
58f0f1d
·
1 Parent(s): dc4b1cd

improve api stability

Browse files
src/checkers/url_checker.py CHANGED
@@ -18,7 +18,7 @@ from typing import Iterable, List, Optional
18
 
19
  import requests
20
 
21
- from src.utils.http import get_session
22
  from src.parsers.bib_parser import BibEntry
23
 
24
  logger = logging.getLogger(__name__)
@@ -33,6 +33,14 @@ _ARXIV_URL_RE = re.compile(
33
  re.IGNORECASE,
34
  )
35
  _ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
 
 
 
 
 
 
 
 
36
 
37
 
38
  @dataclass
@@ -56,10 +64,19 @@ class URLChecker:
56
  def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
57
  """Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
58
 
 
 
 
 
59
  Returns "ok" if the export API returns an Atom entry for the ID,
60
- "broken" if the feed is empty (ID doesn't exist), or "unreachable"
61
- if the API itself fails.
62
  """
 
 
 
 
 
63
  session = get_session()
64
  try:
65
  r = session.get(
@@ -68,8 +85,10 @@ class URLChecker:
68
  timeout=self.timeout,
69
  )
70
  r.raise_for_status()
 
71
  except requests.RequestException as e:
72
  logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
 
73
  return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
74
  # The Atom feed contains `<entry>` only when the ID resolves. An
75
  # empty feed (totalResults=0) means the ID is bogus.
 
18
 
19
  import requests
20
 
21
+ from src.utils.http import get_session, is_open, record_failure, record_success
22
  from src.parsers.bib_parser import BibEntry
23
 
24
  logger = logging.getLogger(__name__)
 
33
  re.IGNORECASE,
34
  )
35
  _ARXIV_EXPORT_API = "http://export.arxiv.org/api/query"
36
+ # Share the arxiv metadata fetcher's circuit breaker. Both hit
37
+ # export.arxiv.org; once the breaker is tripped (typically after 2 quick
38
+ # 429s from the metadata fetcher), it makes no sense for the URL checker
39
+ # to keep firing requests at the same dead host — that was producing
40
+ # 18+ false "unreachable" findings for arxiv URLs that are actually fine.
41
+ # When the breaker is open, mark the URL as "skipped" so the report
42
+ # doesn't falsely claim it's broken.
43
+ _ARXIV_SOURCE = "arxiv"
44
 
45
 
46
  @dataclass
 
64
  def _check_arxiv_via_api(self, entry_key: str, url: str, arxiv_id: str) -> URLFinding:
65
  """Verify an arxiv URL by hitting export.arxiv.org instead of arxiv.org.
66
 
67
+ Honors the shared `arxiv` circuit breaker: if the metadata fetcher
68
+ already proved the host is rate-limiting us, we report "skipped"
69
+ rather than spamming the host and reporting bogus "unreachable".
70
+
71
  Returns "ok" if the export API returns an Atom entry for the ID,
72
+ "broken" if the feed is empty (ID doesn't exist), "skipped" if the
73
+ breaker is open, or "unreachable" if the API itself fails.
74
  """
75
+ if is_open(_ARXIV_SOURCE):
76
+ return URLFinding(
77
+ entry_key, url, "skipped",
78
+ detail="arxiv source rate-limited (circuit breaker open)",
79
+ )
80
  session = get_session()
81
  try:
82
  r = session.get(
 
85
  timeout=self.timeout,
86
  )
87
  r.raise_for_status()
88
+ record_success(_ARXIV_SOURCE)
89
  except requests.RequestException as e:
90
  logger.debug("arXiv API check failed for %s: %s", url, e, exc_info=True)
91
+ record_failure(_ARXIV_SOURCE)
92
  return URLFinding(entry_key, url, "unreachable", detail=f"arxiv-api: {str(e)[:100]}")
93
  # The Atom feed contains `<entry>` only when the ID resolves. An
94
  # empty feed (totalResults=0) means the ID is bogus.
src/fetchers/semantic_scholar_fetcher.py CHANGED
@@ -38,12 +38,13 @@ class SemanticScholarFetcher:
38
  """
39
 
40
  BASE_URL = "https://api.semanticscholar.org/graph/v1"
41
- # S2 publicly states its "introductory" rate for keyed clients is 1 r/s
42
- # on all endpoints. Sustained 2 r/s (0.5s delay) was eating burst budget
43
- # and risking 429s that trip our circuit breaker for the whole run.
 
44
  # Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
45
  # we'll 429 within a few calls anyway — the breaker handles that case.
46
- RATE_LIMIT_DELAY_KEYED = 1.0
47
  RATE_LIMIT_DELAY_ANON = 0.5
48
 
49
  def __init__(self, api_key: Optional[str] = None):
 
38
  """
39
 
40
  BASE_URL = "https://api.semanticscholar.org/graph/v1"
41
+ # S2's introductory keyed rate is 1 r/s cumulative across endpoints, and
42
+ # their welcome email explicitly says "Please set your rate limit to
43
+ # *below* this threshold". 1.1 s gives ~0.91 r/s sustained under the
44
+ # ceiling, with a small margin for clock drift / thread scheduling jitter.
45
  # Without a key, the 100 req / 5 min ≈ 0.33 r/s shared limit applies and
46
  # we'll 429 within a few calls anyway — the breaker handles that case.
47
+ RATE_LIMIT_DELAY_KEYED = 1.1
48
  RATE_LIMIT_DELAY_ANON = 0.5
49
 
50
  def __init__(self, api_key: Optional[str] = None):