paper_lifecycle / test_title_fallback.py
elfsong
Add quarterly topic hype-cycle pipeline and slider visualization
82f767a
Raw
History Blame Contribute Delete
2.34 kB
#!/usr/bin/env python3
"""Quick test: fall back to OpenAlex title search for unmatched papers and measure hit rate and quality."""
import json
import re
import sys
import time
import urllib.parse
import urllib.request
# JSON Lines input; main() reads it line by line and parses each line as JSON.
SRC = "arxiv_cs_2022_2026.topics.jsonl"
# Contact address sent with every request, both as the `mailto` query
# parameter and as the User-Agent header (see search()).
MAILTO = "elfsong@outlook.sg"
# Maximum number of unmatched papers main() tests in one run.
N = 30
def norm(s):
    """Return *s* lower-cased with each non-alphanumeric run replaced by one space.

    Only ASCII letters and digits survive; any other character acts as a
    separator. A falsy input (``None`` or ``""``) yields the empty string.
    Leading and trailing spaces are removed from the result.
    """
    lowered = s.lower() if s else ""
    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
    return spaced.strip()
def jacc(a, b):
    """Return the Jaccard similarity of the word sets of two titles.

    Both inputs are passed through ``norm`` and split on whitespace; the
    score is ``|A & B| / |A | B|``. When neither title contributes any
    word, the integer ``0`` is returned instead of dividing by zero.
    """
    words_a = set(norm(a).split())
    words_b = set(norm(b).split())
    union = words_a | words_b
    if not union:
        return 0
    return len(words_a & words_b) / len(union)
def search(title):
    """Query OpenAlex for the single best work matching a paper title.

    Sends a ``title.search`` filter to the ``/works`` endpoint asking for one
    result with a reduced field set, and retries on any failure with a
    linearly growing pause (3 s, 6 s, 9 s) between attempts.

    Args:
        title: Paper title to search for.

    Returns:
        The ``results`` list from the JSON response (one entry was requested
        via ``per-page``), or an empty list if all four attempts failed.
    """
    # In OpenAlex filter syntax "," separates filters and "|" means OR, so a
    # raw title containing either would be parsed as a malformed filter.
    # Replacing them with spaces keeps the searched words unchanged.
    safe_title = title.replace(",", " ").replace("|", " ")
    q = urllib.parse.urlencode({
        "filter": "title.search:" + safe_title,
        "select": "id,doi,title,primary_topic,topics,publication_year",
        "per-page": 1, "mailto": MAILTO})
    url = "https://api.openalex.org/works?" + q
    max_attempts = 4
    for attempt in range(max_attempts):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": MAILTO})
            # Context manager closes the HTTP response even if parsing fails.
            with urllib.request.urlopen(req, timeout=40) as resp:
                return json.load(resp).get("results", [])
        except Exception as e:  # deliberate best-effort: any failure retries
            print(" retry:", e, file=sys.stderr)
            # No point sleeping after the final attempt; just give up.
            if attempt < max_attempts - 1:
                time.sleep(3 * (attempt + 1))
    return []
def main():
    """Sample unmatched papers and report how often title search recovers them.

    Reads ``SRC`` as JSON Lines and collects every record whose
    ``openalex_primary_topic`` is ``None``. The first ``N`` of those, with
    year 2025 ordered first, are each looked up through ``search``. A lookup
    counts as a hit when the top result's title has a ``jacc`` similarity of
    at least 0.7 with ours. One line per paper and a final hit-rate summary
    are printed to stdout.
    """
    # Collect the unmatched records (2025 gets priority below).
    pool = []
    # JSON text is UTF-8 by convention; do not depend on the platform default.
    with open(SRC, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            d = json.loads(line)
            if d.get("openalex_primary_topic") is None:
                pool.append((d["year"], d["id"], d["title"]))
    # list.sort is stable, so file order is kept inside each year group.
    pool.sort(key=lambda x: 0 if x[0] == 2025 else 1)
    sample = pool[:N]
    print(f"unmatched pool={len(pool)}, testing {len(sample)} (2025-first)\n")
    hit = 0
    for yr, aid, title in sample:
        res = search(title)
        if res:
            w = res[0]
            sim = jacc(title, w.get("title"))
            if sim >= 0.7:  # only a closely matching title counts as a hit
                hit += 1
                pt = (w.get("primary_topic") or {}).get("display_name")
                print(f"[HIT {sim:.2f}] {aid} ({yr}) -> {pt}")
                print(f" ours: {title[:70]}")
                print(f" them: {(w.get('title') or '')[:70]}")
            else:
                print(f"[low {sim:.2f}] {aid} ({yr}) top result mismatch")
        else:
            print(f"[MISS ] {aid} ({yr}) no search result")
        time.sleep(0.4)  # pause between consecutive API requests
    # An empty sample (nothing unmatched) would otherwise divide by zero.
    rate = hit / len(sample) * 100 if sample else 0
    print(f"\n命中率: {hit}/{len(sample)} = {rate:.0f}%")
# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()