"""Fetch arxiv abstracts on demand, verify against the frozen manifest.
The repo deliberately does not commit abstract text — see project memory
(no_arxiv_storage). Only IDs + SHA-256 hashes are versioned. Runners
fetch text at eval time and abort on hash mismatch so retrieval numbers
remain reproducible against the exact snapshot.
Cache layout: $PHOTON_EVAL_CACHE (default ~/.cache/photon-route/eval/),
one .txt file per arxiv ID. The cache is content-addressed implicitly:
manifest.hashes[id] is the authoritative SHA-256.
Network: scrapes the og:description meta tag from arxiv.org/abs/<id>
HTML pages (CDN-cached via Google Frontend, no per-IP rate limit in
practice). The official export.arxiv.org/api endpoint is rate-limited
to ~1 req / 3s and easily 429s during eval runs, so it isn't used.
A browser-like User-Agent is required: arxiv.org returns HTTP 406 to
non-browser UAs from datacenter IPs (caught HF Space build failure
2026-05-05).
"""
from __future__ import annotations

import hashlib
import html
import json
import os
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Iterable
# Base URL of the abstract pages; an arxiv ID is appended directly to it.
ARXIV_ABS = "https://arxiv.org/abs/"
# Cache directory, resolved once at import time: $PHOTON_EVAL_CACHE if set,
# otherwise ~/.cache/photon-route/eval.
DEFAULT_CACHE = Path(os.environ.get("PHOTON_EVAL_CACHE", str(Path.home() / ".cache/photon-route/eval")))
# Captures the double-quoted content of the og:description <meta> tag.
# The pattern only matches when the property/name attribute comes before
# content and both values use double quotes; matching is case-insensitive.
_OG_DESC = re.compile(
    r'<meta\s+(?:property|name)="og:description"\s+content="([^"]*)"',
    re.IGNORECASE,
)
def _normalize(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
# Request headers sent with every abs-page fetch. The User-Agent imitates a
# desktop Chrome build; per the module docstring, arxiv.org answers HTTP 406
# to non-browser User-Agents coming from datacenter IPs.
_BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
def _fetch_one(arxiv_id: str, timeout: float = 30.0, max_retries: int = 4) -> str:
    """Fetch one abstract by scraping its arxiv.org abs page.

    Args:
        arxiv_id: Identifier appended verbatim to ``ARXIV_ABS``.
        timeout: Per-attempt timeout in seconds, handed to ``urlopen``.
        max_retries: Total number of attempts (not extra retries); must be
            at least 1.

    Returns:
        The og:description content, HTML-unescaped and whitespace-normalized.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        urllib.error.URLError: The error (``HTTPError`` included) raised by
            the final attempt once every attempt has failed.
        TimeoutError: Same, when the final attempt timed out.
        RuntimeError: If the page contains no og:description meta tag.
    """
    # Checked first: with zero attempts the loop below would never run and
    # ``body`` would be unbound when the regex search is reached.
    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    req = urllib.request.Request(ARXIV_ABS + arxiv_id, headers=_BROWSER_HEADERS)
    delay = 2.0
    for attempt in range(1, max_retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                body = resp.read().decode("utf-8", errors="replace")
        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError):
            if attempt == max_retries:
                # Out of attempts: surface the last failure unchanged.
                raise
            # Exponential backoff between attempts, capped at 32 seconds.
            time.sleep(delay)
            delay = min(delay * 2, 32.0)
        else:
            break

    m = _OG_DESC.search(body)
    if not m:
        raise RuntimeError(f"og:description not found for {arxiv_id}")
    # The attribute value is HTML-escaped in the page source.
    raw = html.unescape(m.group(1))
    return _normalize(raw)
def fetch_all(
    ids: Iterable[str],
    cache_dir: Path | None = None,
    sleep_between: float = 0.5,
) -> dict[str, str]:
    """Return ``{id: abstract}`` for every ID, using the on-disk cache.

    Cached entries are read from ``<cache_dir>/<id>.txt``. Missing ones are
    scraped one at a time via ``_fetch_one`` and written back to the cache,
    pausing between requests so the CDN is not hammered.

    Args:
        ids: Arxiv IDs to resolve. Repeated IDs are processed once.
        cache_dir: Cache directory; falls back to ``DEFAULT_CACHE``. Created
            if it does not exist.
        sleep_between: Seconds to sleep between consecutive network fetches
            (no sleep after the last one).

    Returns:
        Mapping of ID to abstract text: cached entries first, then freshly
        fetched ones, each group in input order.

    Raises:
        Whatever ``_fetch_one`` raises for an ID that cannot be fetched.
    """
    cache_dir = cache_dir or DEFAULT_CACHE
    cache_dir.mkdir(parents=True, exist_ok=True)
    # dict.fromkeys drops repeats while keeping first-seen order, so a
    # duplicated uncached ID is not scraped more than once.
    unique_ids = list(dict.fromkeys(ids))
    out: dict[str, str] = {}
    missing: list[str] = []
    for arxiv_id in unique_ids:
        cached = cache_dir / f"{arxiv_id}.txt"
        if cached.exists():
            out[arxiv_id] = cached.read_text("utf-8")
        else:
            missing.append(arxiv_id)
    for j, arxiv_id in enumerate(missing):
        text = _fetch_one(arxiv_id)
        target = cache_dir / f"{arxiv_id}.txt"
        # Old-style IDs such as "hep-th/9901001" contain a slash, which puts
        # the cache file in a subdirectory that must exist before writing.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(text, encoding="utf-8")
        out[arxiv_id] = text
        if j + 1 < len(missing):
            time.sleep(sleep_between)
    return out
def verify_against_manifest(
    abstracts: dict[str, str], manifest_path: Path
) -> dict[str, str]:
    """Check abstract hashes against the ones recorded in the manifest.

    Returns an empty dict when every abstract matches, otherwise
    ``{id: actual_hash}`` for each abstract whose SHA-256 differs from (or is
    absent from) the manifest. A manifest with no hashes yields ``{}`` too.
    """
    recorded = json.loads(manifest_path.read_text("utf-8")).get("hashes", {})
    if not recorded:
        return {}
    digests = {key: sha256_text(body) for key, body in abstracts.items()}
    return {
        key: digest
        for key, digest in digests.items()
        if recorded.get(key) != digest
    }
def freeze_manifest(
    abstracts: dict[str, str],
    manifest_path: Path,
    source_url: str = ARXIV_ABS,
) -> None:
    """Overwrite the manifest with hashes of *abstracts* to lock a snapshot.

    The "description" field of an existing manifest is carried over; every
    other field is regenerated. Hashes are written in sorted ID order.
    """
    if manifest_path.exists():
        previous = json.loads(manifest_path.read_text("utf-8"))
        description = previous.get("description", "")
    else:
        description = ""

    taken_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    hashes: dict[str, str] = {}
    for key, body in sorted(abstracts.items()):
        hashes[key] = sha256_text(body)

    payload = {
        "schema_version": 1,
        "description": description,
        "snapshot_taken_at": taken_at,
        "snapshot_source": source_url,
        "hash_algo": "sha256",
        "hashes": hashes,
    }
    serialized = json.dumps(payload, indent=2) + "\n"
    manifest_path.write_text(serialized, encoding="utf-8")
if __name__ == "__main__":
    # CLI: fetch every corpus abstract, then either freeze a new manifest or
    # verify the fetched text against the existing one (exit code 2 on
    # mismatch).
    import argparse

    here = Path(__file__).parent
    parser = argparse.ArgumentParser()
    parser.add_argument("--corpus", type=Path, default=here / "corpus_ids.json")
    parser.add_argument("--manifest", type=Path, default=here / "manifest.json")
    parser.add_argument(
        "--freeze",
        action="store_true",
        help="Overwrite manifest with hashes of currently fetched abstracts",
    )
    args = parser.parse_args()

    corpus = json.loads(args.corpus.read_text("utf-8"))
    ids = corpus["ids"]
    abstracts = fetch_all(ids)
    print(f"fetched {len(abstracts)} / {len(ids)} abstracts")

    if args.freeze:
        freeze_manifest(abstracts, args.manifest)
        print(f"wrote manifest with {len(abstracts)} hashes -> {args.manifest}")
    else:
        bad = verify_against_manifest(abstracts, args.manifest)
        if bad:
            print(f"HASH MISMATCH on {len(bad)} ids: {bad}")
            raise SystemExit(2)
        # An empty manifest verifies trivially; say so instead of claiming
        # success.
        has_hashes = json.loads(args.manifest.read_text("utf-8")).get("hashes")
        if has_hashes:
            print("manifest verified")
        else:
            print("manifest empty (run with --freeze)")
|