Spaces:
Running
Running
| """Fetch arxiv abstracts on demand, verify against the frozen manifest. | |
| The repo deliberately does not commit abstract text — see project memory | |
| (no_arxiv_storage). Only IDs + SHA-256 hashes are versioned. Runners | |
| fetch text at eval time and abort on hash mismatch so retrieval numbers | |
| remain reproducible against the exact snapshot. | |
| Cache layout: $PHOTON_EVAL_CACHE (default ~/.cache/photon-route/eval/), | |
| one .txt file per arxiv ID. The cache is content-addressed implicitly: | |
| manifest.hashes[id] is the authoritative SHA-256. | |
| Network: scrapes the og:description meta tag from arxiv.org/abs/<id> | |
| HTML pages (CDN-cached via Google Frontend, no per-IP rate limit in | |
| practice). The official export.arxiv.org/api endpoint is rate-limited | |
| to ~1 req / 3s and easily 429s during eval runs, so it isn't used. | |
| A browser-like User-Agent is required: arxiv.org returns HTTP 406 to | |
| non-browser UAs from datacenter IPs (caught HF Space build failure | |
| 2026-05-05). | |
| """ | |
from __future__ import annotations

import hashlib
import html
import http.client
import json
import os
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Iterable
# Base URL of arXiv abstract pages; the arXiv ID is appended verbatim.
ARXIV_ABS = "https://arxiv.org/abs/"

# On-disk cache root. An unset *or empty* $PHOTON_EVAL_CACHE falls back to the
# default (an empty value would otherwise resolve to the current directory),
# and a leading "~" is expanded so values coming from quoted shell strings or
# .env files still point into the home directory.
DEFAULT_CACHE = Path(
    os.environ.get("PHOTON_EVAL_CACHE") or Path.home() / ".cache/photon-route/eval"
).expanduser()

# Captures the raw (still HTML-escaped) content of the og:description meta tag.
# Only matches when the property/name attribute directly precedes content and
# both values are double-quoted.
_OG_DESC = re.compile(
    r'<meta\s+(?:property|name)="og:description"\s+content="([^"]*)"',
    re.IGNORECASE,
)
def _normalize(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends.

    The result is what gets hashed, so it must stay stable: ``str.split()``
    with no separator splits on the same Unicode whitespace set that ``\\s``
    matches for ``str`` patterns, and drops leading/trailing runs.
    """
    words = text.split()
    return " ".join(words)
def sha256_text(text: str) -> str:
    """Return the lowercase hex SHA-256 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
# Request headers sent with every abs-page fetch. The User-Agent imitates a
# desktop Chrome browser because, per the module notes, arxiv.org answers
# HTTP 406 to non-browser User-Agents coming from datacenter IPs.
_BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
def _fetch_one(arxiv_id: str, timeout: float = 30.0, max_retries: int = 4) -> str:
    """Fetch one abstract by scraping its arxiv.org abs page.

    Args:
        arxiv_id: arXiv identifier; appended verbatim to ``ARXIV_ABS``.
        timeout: Socket timeout in seconds, applied to each attempt.
        max_retries: Total number of attempts (not extra retries). Values
            below 1 are treated as a single attempt.

    Returns:
        The HTML-unescaped, whitespace-normalized ``og:description`` content.

    Raises:
        urllib.error.HTTPError: Immediately on 404/410, or when the last
            attempt ends with any other HTTP error status.
        OSError: When the last attempt fails at the network level (includes
            ``urllib.error.URLError``, timeouts and connection resets).
        http.client.HTTPException: When the last attempt fails while reading
            the response (e.g. ``IncompleteRead``).
        RuntimeError: If the page contains no ``og:description`` meta tag.
    """
    request = urllib.request.Request(ARXIV_ABS + arxiv_id, headers=_BROWSER_HEADERS)
    # Guarantee at least one attempt so ``body`` is always bound after the loop.
    attempts = max(1, max_retries)
    delay = 2.0
    for attempt in range(1, attempts + 1):
        try:
            with urllib.request.urlopen(request, timeout=timeout) as resp:
                body = resp.read().decode("utf-8", errors="replace")
            break
        except urllib.error.HTTPError as err:
            # 404/410 mean the ID does not exist; backing off cannot change that.
            if err.code in (404, 410) or attempt == attempts:
                raise
        except (OSError, http.client.HTTPException):
            # OSError covers URLError, timeouts and resets raised mid-read;
            # HTTPException covers truncated responses (IncompleteRead).
            if attempt == attempts:
                raise
        # Exponential backoff between attempts: 2s, 4s, 8s, ... capped at 32s.
        time.sleep(delay)
        delay = min(delay * 2, 32.0)

    match = _OG_DESC.search(body)
    if not match:
        raise RuntimeError(f"og:description not found for {arxiv_id}")
    # The attribute value is HTML-escaped in the page source.
    return _normalize(html.unescape(match.group(1)))
def fetch_all(
    ids: Iterable[str],
    cache_dir: Path | None = None,
    sleep_between: float = 0.5,
) -> dict[str, str]:
    """Return ``{id: abstract}`` for every ID, using the on-disk cache.

    Cached entries are read from ``<cache_dir>/<id>.txt``; missing ones are
    scraped one by one via ``_fetch_one`` and written back to the cache.

    Args:
        ids: arXiv IDs to load. Duplicates are fetched only once.
        cache_dir: Cache root; falls back to ``DEFAULT_CACHE`` when omitted.
        sleep_between: Seconds to pause between consecutive network fetches
            so the CDN is not hammered.

    Returns:
        Mapping of ID to abstract text: cached entries first (in input
        order), then freshly fetched ones (in input order).

    Raises:
        Whatever ``_fetch_one`` raises for an ID that cannot be fetched;
        entries fetched before the failure stay in the cache.
    """
    cache_dir = cache_dir or DEFAULT_CACHE
    cache_dir.mkdir(parents=True, exist_ok=True)

    out: dict[str, str] = {}
    missing: list[str] = []
    # dict.fromkeys de-duplicates while keeping first-seen order, so a repeated
    # uncached ID is not scraped twice.
    for arxiv_id in dict.fromkeys(ids):
        try:
            out[arxiv_id] = (cache_dir / f"{arxiv_id}.txt").read_text("utf-8")
        except FileNotFoundError:
            missing.append(arxiv_id)

    for j, arxiv_id in enumerate(missing):
        text = _fetch_one(arxiv_id)
        target = cache_dir / f"{arxiv_id}.txt"
        # Old-style IDs such as "hep-th/9901001" contain "/", which makes the
        # cache file live in a sub-directory that has to exist first.
        target.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file and rename, so an interrupted run never
        # leaves a truncated entry that later reads would treat as valid.
        partial = target.with_name(target.name + ".tmp")
        partial.write_text(text, encoding="utf-8")
        partial.replace(target)
        out[arxiv_id] = text
        if j + 1 < len(missing):
            time.sleep(sleep_between)
    return out
def verify_against_manifest(
    abstracts: dict[str, str], manifest_path: Path
) -> dict[str, str]:
    """Check abstract hashes against the manifest's ``hashes`` table.

    Only the IDs present in ``abstracts`` are checked. An ID that the
    manifest does not list counts as a mismatch.

    Returns:
        ``{}`` when every supplied abstract matches, or when the manifest
        records no hashes at all; otherwise ``{id: actual_hash}`` for each
        abstract whose SHA-256 differs from the recorded one.
    """
    with manifest_path.open(encoding="utf-8") as fh:
        recorded = json.load(fh).get("hashes", {})
    if not recorded:
        # Nothing frozen yet: there is nothing to compare against.
        return {}

    computed = {arxiv_id: sha256_text(text) for arxiv_id, text in abstracts.items()}
    return {
        arxiv_id: digest
        for arxiv_id, digest in computed.items()
        if recorded.get(arxiv_id) != digest
    }
def freeze_manifest(
    abstracts: dict[str, str],
    manifest_path: Path,
    source_url: str = ARXIV_ABS,
) -> None:
    """Write a fresh manifest; intended to be run once to lock the snapshot.

    The ``description`` of an existing manifest is carried over when that
    file can be read and parsed as a JSON object. A missing, unreadable or
    corrupt manifest does not block re-freezing; the description is then
    left empty.

    Args:
        abstracts: Mapping of arXiv ID to abstract text to hash.
        manifest_path: Destination file; replaced if it already exists.
        source_url: Recorded as ``snapshot_source`` in the manifest.
    """
    try:
        previous = json.loads(manifest_path.read_text("utf-8"))
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError: invalid JSON or bytes
        # that are not UTF-8 (JSONDecodeError and UnicodeDecodeError).
        previous = None
    description = previous.get("description", "") if isinstance(previous, dict) else ""

    payload = {
        "schema_version": 1,
        "description": description,
        "snapshot_taken_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "snapshot_source": source_url,
        "hash_algo": "sha256",
        # Sorted by ID so the manifest diffs cleanly under version control.
        "hashes": {k: sha256_text(v) for k, v in sorted(abstracts.items())},
    }

    # Write to a sibling temp file and rename, so a crash mid-write cannot
    # leave a truncated manifest behind.
    partial = manifest_path.with_name(manifest_path.name + ".tmp")
    partial.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    partial.replace(manifest_path)
# CLI entry point: fetch every corpus abstract, then either freeze a new
# manifest (--freeze) or verify the abstracts against the existing one.
if __name__ == "__main__":
    import argparse

    here = Path(__file__).parent
    parser = argparse.ArgumentParser()
    parser.add_argument("--corpus", type=Path, default=here / "corpus_ids.json")
    parser.add_argument("--manifest", type=Path, default=here / "manifest.json")
    parser.add_argument(
        "--freeze",
        action="store_true",
        help="Overwrite manifest with hashes of currently fetched abstracts",
    )
    opts = parser.parse_args()

    corpus_ids = json.loads(opts.corpus.read_text("utf-8"))["ids"]
    fetched = fetch_all(corpus_ids)
    print(f"fetched {len(fetched)} / {len(corpus_ids)} abstracts")

    if not opts.freeze:
        mismatches = verify_against_manifest(fetched, opts.manifest)
        if mismatches:
            print(f"HASH MISMATCH on {len(mismatches)} ids: {mismatches}")
            # Exit code 2 signals a snapshot mismatch to callers.
            raise SystemExit(2)
        # An empty hash table also verifies cleanly, so report it separately.
        recorded = json.loads(opts.manifest.read_text("utf-8")).get("hashes")
        if recorded:
            print("manifest verified")
        else:
            print("manifest empty (run with --freeze)")
    else:
        freeze_manifest(fetched, opts.manifest)
        print(f"wrote manifest with {len(fetched)} hashes -> {opts.manifest}")