#!/usr/bin/env python3
"""Quick probe: fall back to OpenAlex title search for unmatched papers.

Reads the JSONL file named by ``SRC``, takes up to ``N`` records whose
``openalex_primary_topic`` is missing (2025 records first), searches OpenAlex
by title for each of them, and prints the per-paper outcome plus the overall
hit rate so the quality of the fallback can be judged.
"""
import json
import re
import sys
import time
import urllib.parse
import urllib.request

SRC = "arxiv_cs_2022_2026.topics.jsonl"
MAILTO = "elfsong@outlook.sg"
N = 30

# Runs of anything that is not a lowercase ASCII letter or digit.
_NON_ALNUM = re.compile(r"[^a-z0-9]+")
# In OpenAlex filter syntax "," separates filters and "|" means OR, so both
# must be kept out of a filter value.
_FILTER_META = re.compile(r"[,|]")
# Total number of HTTP attempts made per title before giving up.
_MAX_ATTEMPTS = 4


def norm(s) -> str:
    """Return *s* lowercased with non-alphanumeric runs collapsed to one space.

    ``None`` (or any other falsy value) is treated as the empty string.
    """
    return _NON_ALNUM.sub(" ", (s or "").lower()).strip()


def jacc(a, b) -> float:
    """Return the Jaccard similarity of the normalized word sets of *a* and *b*.

    The result is 0 when neither string contains any word.
    """
    words_a = set(norm(a).split())
    words_b = set(norm(b).split())
    union = words_a | words_b
    if not union:
        return 0.0
    return len(words_a & words_b) / len(union)


def search(title) -> list:
    """Search OpenAlex works by title and return the ``results`` list.

    At most one result is requested (``per-page=1``). Failed requests are
    retried with a linearly growing pause; an empty list is returned when
    every attempt fails or when *title* has nothing to search for.
    """
    # Strip filter metacharacters: a raw "," or "|" in the title would be
    # parsed as filter syntax instead of as part of the search text.
    safe_title = _FILTER_META.sub(" ", title or "").strip()
    if not safe_title:
        return []

    query = urllib.parse.urlencode({
        "filter": "title.search:" + safe_title,
        "select": "id,doi,title,primary_topic,topics,publication_year",
        "per-page": 1,
        "mailto": MAILTO,
    })
    url = "https://api.openalex.org/works?" + query

    for attempt in range(_MAX_ATTEMPTS):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": MAILTO})
            # The context manager closes the HTTP response even on errors.
            with urllib.request.urlopen(req, timeout=40) as resp:
                return json.load(resp).get("results", [])
        except Exception as e:
            # Deliberately broad: this is a best-effort probe, so any
            # network/HTTP/decoding problem is logged and retried.
            print(" retry:", e, file=sys.stderr)
            # No point in waiting after the final attempt.
            if attempt + 1 < _MAX_ATTEMPTS:
                time.sleep(3 * (attempt + 1))
    return []


def main(src: str = SRC, n: int = N) -> None:
    """Run the probe on *src* and print per-paper results and the hit rate.

    Args:
        src: Path of the JSONL file to read; each record needs ``year``,
            ``id`` and ``title`` keys.
        n: Maximum number of unmatched records to test.
    """
    # Collect the unmatched records (those without an OpenAlex primary topic).
    pool = []
    with open(src, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            d = json.loads(line)
            if d.get("openalex_primary_topic") is None:
                pool.append((d["year"], d["id"], d["title"]))

    # Stable sort: 2025 records move to the front, file order is kept otherwise.
    pool.sort(key=lambda x: 0 if x[0] == 2025 else 1)
    sample = pool[:n]
    print(f"unmatched pool={len(pool)}, testing {len(sample)} (2025-first)\n")

    hit = 0
    for yr, aid, title in sample:
        res = search(title)
        if res:
            w = res[0]
            sim = jacc(title, w.get("title"))
            if sim >= 0.7:  # only a near-identical title counts as a hit
                hit += 1
                pt = (w.get("primary_topic") or {}).get("display_name")
                print(f"[HIT {sim:.2f}] {aid} ({yr}) -> {pt}")
                print(f" ours: {title[:70]}")
                print(f" them: {(w.get('title') or '')[:70]}")
            else:
                print(f"[low {sim:.2f}] {aid} ({yr}) top result mismatch")
        else:
            print(f"[MISS ] {aid} ({yr}) no search result")
        time.sleep(0.4)  # pause between consecutive API requests

    # Guard against an empty sample (nothing unmatched) to avoid dividing by 0.
    rate = hit / len(sample) * 100 if sample else 0.0
    print(f"\n命中率: {hit}/{len(sample)} = {rate:.0f}%")


if __name__ == "__main__":
    main()