Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """小测:对未匹配论文用 OpenAlex 标题搜索兜底,量命中率与质量。""" | |
| import json | |
| import re | |
| import sys | |
| import time | |
| import urllib.parse | |
| import urllib.request | |
# Input JSON Lines file; main() parses one JSON record from each line.
SRC = "arxiv_cs_2022_2026.topics.jsonl"
# Contact address sent to OpenAlex as the `mailto` query parameter and as the
# User-Agent header of every request.
MAILTO = "elfsong@outlook.sg"
# Maximum number of unmatched papers tested in one run.
N = 30
def norm(s):
    """Lowercase *s* and squeeze each run of non-alphanumerics into one space.

    ``None`` or an empty string is treated as ``""``. Only ASCII ``a-z`` and
    ``0-9`` survive; every other character acts as a separator. Leading and
    trailing separators are trimmed.
    """
    lowered = s.lower() if s else ""
    collapsed = re.sub(r"[^a-z0-9]+", " ", lowered)
    return collapsed.strip()
def jacc(a, b):
    """Return the Jaccard similarity of the normalized word sets of *a* and *b*.

    Both inputs go through ``norm`` and are split on whitespace; the score is
    the size of the intersection divided by the size of the union. When
    neither input yields any token the result is the integer ``0``.
    """
    tokens_a = set(norm(a).split())
    tokens_b = set(norm(b).split())
    union = tokens_a | tokens_b
    if not union:
        return 0
    return len(tokens_a & tokens_b) / len(union)
def search(title):
    """Return the top OpenAlex work whose title matches *title*.

    Sends one ``title.search`` filter query to the OpenAlex ``/works``
    endpoint, asking for a single result restricted to the fields listed in
    ``select``. Each request is tried up to four times with a growing pause
    (3 s, 6 s, 9 s) between attempts; every failure is reported on stderr.

    Args:
        title: Paper title to look up. ``None`` or a title with no searchable
            text returns ``[]`` without contacting the API.

    Returns:
        The ``results`` list of the JSON response (at most one item because
        ``per-page`` is 1). An empty list is returned both when OpenAlex
        finds nothing and when every attempt fails, so callers cannot tell
        the two cases apart from the return value alone.
    """
    # In the OpenAlex filter syntax "," separates filters and "|" means OR,
    # so a raw title containing either would be parsed as several filters.
    # Replace them with spaces before embedding the title in the filter.
    safe_title = (title or "").replace(",", " ").replace("|", " ").strip()
    if not safe_title:
        return []
    params = urllib.parse.urlencode({
        "filter": "title.search:" + safe_title,
        "select": "id,doi,title,primary_topic,topics,publication_year",
        "per-page": 1,
        "mailto": MAILTO,
    })
    url = "https://api.openalex.org/works?" + params
    max_attempts = 4
    for attempt in range(max_attempts):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": MAILTO})
            # The context manager closes the HTTP response even if JSON
            # parsing raises.
            with urllib.request.urlopen(req, timeout=40) as resp:
                return json.load(resp).get("results", [])
        except Exception as e:  # best-effort lookup: log, back off, retry
            print(" retry:", e, file=sys.stderr)
            # No point sleeping after the final attempt; we are giving up.
            if attempt < max_attempts - 1:
                time.sleep(3 * (attempt + 1))
    return []
def main():
    """Spot-check OpenAlex title search on papers that lack a primary topic.

    Reads ``SRC`` (one JSON object per line), collects the records whose
    ``openalex_primary_topic`` is ``None``, takes up to ``N`` of them with
    2025 papers first, looks each title up via ``search`` and prints a
    per-paper verdict followed by the overall hit rate.
    """
    # Collect unmatched papers as (year, id, title) tuples.
    pool = []
    with open(SRC, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            d = json.loads(line)
            if d.get("openalex_primary_topic") is None:
                pool.append((d["year"], d["id"], d["title"]))
    # Stable sort: 2025 entries move to the front, the rest keep file order.
    pool.sort(key=lambda x: 0 if x[0] == 2025 else 1)
    sample = pool[:N]
    print(f"unmatched pool={len(pool)}, testing {len(sample)} (2025-first)\n")
    hit = 0
    for yr, aid, title in sample:
        res = search(title)
        if res:
            w = res[0]
            sim = jacc(title, w.get("title"))
            if sim >= 0.7:  # only a near-identical title counts as a hit
                hit += 1
                pt = (w.get("primary_topic") or {}).get("display_name")
                print(f"[HIT {sim:.2f}] {aid} ({yr}) -> {pt}")
                print(f" ours: {title[:70]}")
                print(f" them: {(w.get('title') or '')[:70]}")
            else:
                print(f"[low {sim:.2f}] {aid} ({yr}) top result mismatch")
        else:
            print(f"[MISS ] {aid} ({yr}) no search result")
        # Pause between requests to stay polite to the API.
        time.sleep(0.4)
    # Guard against an empty sample (no unmatched papers) to avoid dividing
    # by zero.
    rate = hit / len(sample) * 100 if sample else 0.0
    print(f"\n命中率: {hit}/{len(sample)} = {rate:.0f}%")
# Run the spot check only when this file is executed as a script.
if __name__ == "__main__":
    main()