File size: 6,119 Bytes
dd03ac3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dda3229
 
 
dd03ac3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dda3229
 
 
 
 
 
 
 
 
 
dd03ac3
 
 
dda3229
dd03ac3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""Fetch arxiv abstracts on demand, verify against the frozen manifest.

The repo deliberately does not commit abstract text — see project memory
(no_arxiv_storage). Only IDs + SHA-256 hashes are versioned. Runners
fetch text at eval time and abort on hash mismatch so retrieval numbers
remain reproducible against the exact snapshot.

Cache layout: $PHOTON_EVAL_CACHE (default ~/.cache/photon-route/eval/),
one .txt file per arxiv ID. The cache is content-addressed implicitly:
manifest.hashes[id] is the authoritative SHA-256.

Network: scrapes the og:description meta tag from arxiv.org/abs/<id>
HTML pages (CDN-cached via Google Frontend, no per-IP rate limit in
practice). The official export.arxiv.org/api endpoint is rate-limited
to ~1 req / 3s and easily 429s during eval runs, so it isn't used.
A browser-like User-Agent is required: arxiv.org returns HTTP 406 to
non-browser UAs from datacenter IPs (discovered via an HF Space build
failure on 2026-05-05).
"""

from __future__ import annotations

import hashlib
import html
import json
import os
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Iterable

# Base URL of arXiv abstract pages; an arXiv ID is appended to it verbatim.
ARXIV_ABS = "https://arxiv.org/abs/"

# Cache root: $PHOTON_EVAL_CACHE when set, otherwise a per-user directory.
DEFAULT_CACHE = Path(
    os.environ.get(
        "PHOTON_EVAL_CACHE",
        str(Path.home() / ".cache/photon-route/eval"),
    )
)

# Captures the content attribute of the og:description <meta> tag. Only the
# attribute order (property|name) followed by content, with double quotes,
# is recognised.
_OG_DESC = re.compile(
    r'<meta\s+(?:property|name)="og:description"\s+content="([^"]*)"',
    flags=re.IGNORECASE,
)


def _normalize(text: str) -> str:
    """Collapse every whitespace run in *text* to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def sha256_text(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()


# Request headers that imitate a desktop Chrome browser; sent with every
# abs-page request made by this module.
_BROWSER_HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (X11; Linux x86_64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/124.0.0.0",
            "Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}


def _fetch_one(arxiv_id: str, timeout: float = 30.0, max_retries: int = 4) -> str:
    """Fetch one abstract by scraping its arxiv.org abs page.

    Args:
        arxiv_id: arXiv identifier appended to ``ARXIV_ABS``. It is
            percent-quoted first ("/" is kept), which leaves well-formed IDs
            unchanged.
        timeout: Per-request timeout in seconds passed to ``urlopen``.
        max_retries: Total number of attempts. Values below 1 are treated as
            1 so the request is always tried at least once.

    Returns:
        The og:description content, HTML-unescaped and whitespace-normalized.

    Raises:
        urllib.error.HTTPError, urllib.error.URLError, TimeoutError: the
            error from the last attempt, or immediately for HTTP 404/410.
        RuntimeError: the page was fetched but has no og:description tag.
    """
    url = ARXIV_ABS + urllib.parse.quote(arxiv_id)
    req = urllib.request.Request(url, headers=_BROWSER_HEADERS)
    attempts = max(1, max_retries)
    delay = 2.0
    body = ""
    for attempt in range(1, attempts + 1):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                body = resp.read().decode("utf-8", errors="replace")
            break
        except urllib.error.HTTPError as err:
            # 404/410 mean the page does not exist; backing off cannot help.
            if err.code in (404, 410) or attempt == attempts:
                raise
        except (urllib.error.URLError, TimeoutError):
            if attempt == attempts:
                raise
        # Exponential backoff between attempts: 2s, 4s, 8s, ... capped at 32s.
        time.sleep(delay)
        delay = min(delay * 2, 32.0)
    m = _OG_DESC.search(body)
    if not m:
        raise RuntimeError(f"og:description not found for {arxiv_id}")
    raw = html.unescape(m.group(1))
    return _normalize(raw)


def fetch_all(
    ids: Iterable[str],
    cache_dir: Path | None = None,
    sleep_between: float = 0.5,
) -> dict[str, str]:
    """Return ``{id: abstract}`` for every requested arXiv ID.

    Cached entries are read from ``<cache_dir>/<id>.txt``; missing ones are
    scraped one by one via ``_fetch_one`` and written to the cache.

    Args:
        ids: arXiv IDs to load. Duplicates are collapsed so each ID is read
            or fetched at most once.
        cache_dir: Cache directory; falls back to ``DEFAULT_CACHE``. It is
            created if it does not exist.
        sleep_between: Seconds to sleep between consecutive network fetches
            (not applied after the last one).

    Returns:
        Mapping of ID to abstract text: cached IDs first, then fetched ones,
        each group in first-seen input order.

    Raises:
        Whatever ``_fetch_one`` raises for an ID that cannot be fetched.
    """
    cache_dir = cache_dir or DEFAULT_CACHE
    cache_dir.mkdir(parents=True, exist_ok=True)
    # dict.fromkeys drops duplicates while keeping first-seen order, so a
    # repeated ID is not scraped (and slept on) more than once.
    unique_ids = list(dict.fromkeys(ids))
    out: dict[str, str] = {}
    missing: list[str] = []
    for arxiv_id in unique_ids:
        try:
            out[arxiv_id] = (cache_dir / f"{arxiv_id}.txt").read_text("utf-8")
        except FileNotFoundError:
            missing.append(arxiv_id)
    for j, arxiv_id in enumerate(missing):
        text = _fetch_one(arxiv_id)
        target = cache_dir / f"{arxiv_id}.txt"
        # Old-style IDs such as "hep-th/9901001" contain a slash and therefore
        # map to a sub-directory of the cache.
        target.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file and rename, so an interrupted run never
        # leaves a truncated cache entry behind.
        tmp = target.with_name(target.name + ".tmp")
        tmp.write_text(text, encoding="utf-8")
        tmp.replace(target)
        out[arxiv_id] = text
        if j + 1 < len(missing):
            time.sleep(sleep_between)
    return out


def verify_against_manifest(
    abstracts: dict[str, str], manifest_path: Path
) -> dict[str, str]:
    """Compare abstract hashes with the ``hashes`` table of a manifest file.

    Returns an empty dict when every abstract matches, or when the manifest
    has no hashes at all. Otherwise returns ``{id: actual_hash}`` for each
    abstract whose SHA-256 differs from (or is absent in) the manifest.
    """
    with manifest_path.open(encoding="utf-8") as fh:
        recorded = json.load(fh).get("hashes", {})
    if not recorded:
        return {}
    digests = {
        arxiv_id: sha256_text(text) for arxiv_id, text in abstracts.items()
    }
    return {
        arxiv_id: digest
        for arxiv_id, digest in digests.items()
        if recorded.get(arxiv_id) != digest
    }


def freeze_manifest(
    abstracts: dict[str, str],
    manifest_path: Path,
    source_url: str = ARXIV_ABS,
) -> None:
    """Overwrite *manifest_path* with a manifest hashing *abstracts*.

    The ``description`` field of an existing manifest is carried over (empty
    string if the file is absent or has none). Hashes are SHA-256 digests
    stored in sorted ID order, and the snapshot time is recorded in UTC.
    Intended to be run once to lock the snapshot.
    """
    description = ""
    if manifest_path.exists():
        previous = json.loads(manifest_path.read_text("utf-8"))
        description = previous.get("description", "")
    taken_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    digests: dict[str, str] = {}
    for arxiv_id, text in sorted(abstracts.items()):
        digests[arxiv_id] = sha256_text(text)
    payload = {
        "schema_version": 1,
        "description": description,
        "snapshot_taken_at": taken_at,
        "snapshot_source": source_url,
        "hash_algo": "sha256",
        "hashes": digests,
    }
    serialized = json.dumps(payload, indent=2) + "\n"
    manifest_path.write_text(serialized, encoding="utf-8")


if __name__ == "__main__":
    # CLI entry point: fetch every corpus abstract, then either freeze a new
    # manifest (--freeze) or verify the abstracts against the existing one.
    import argparse

    here = Path(__file__).parent
    parser = argparse.ArgumentParser()
    parser.add_argument("--corpus", type=Path, default=here / "corpus_ids.json")
    parser.add_argument("--manifest", type=Path, default=here / "manifest.json")
    parser.add_argument(
        "--freeze",
        action="store_true",
        help="Overwrite manifest with hashes of currently fetched abstracts",
    )
    args = parser.parse_args()

    ids = json.loads(args.corpus.read_text("utf-8"))["ids"]
    abstracts = fetch_all(ids)
    print(f"fetched {len(abstracts)} / {len(ids)} abstracts")
    if args.freeze:
        freeze_manifest(abstracts, args.manifest)
        print(f"wrote manifest with {len(abstracts)} hashes -> {args.manifest}")
    else:
        bad = verify_against_manifest(abstracts, args.manifest)
        if bad:
            print(f"HASH MISMATCH on {len(bad)} ids: {bad}")
            # Exit code 2 signals a hash mismatch to the caller.
            raise SystemExit(2)
        # An empty manifest verifies trivially; say so instead of claiming
        # that the abstracts were checked.
        manifest_hashes = json.loads(args.manifest.read_text("utf-8")).get("hashes")
        if manifest_hashes:
            print("manifest verified")
        else:
            print("manifest empty (run with --freeze)")