Feature-Finder / scripts /update_arxiv_pool.py
salma-remyx's picture
Replace with MHPD paper recommender (v1.2 backend) to reuse Zero-GPU allocation
f4145b9 verified
"""
update_arxiv_pool.py — Refresh the daily arXiv candidate pool.
Pulls the last `--days-back` days of submissions in the configured arXiv
categories (see CATEGORIES: cs.AI, cs.LG, cs.CL, cs.CV, cs.DB, cs.IR,
cs.SE, stat.ML) from the arXiv API, embeds each abstract with
all-MiniLM-L6-v2, and writes a parquet file with columns [arxiv_id, title,
abstract, published_at, categories, abstract_embedding]. The Space loads
this parquet at cold-start so per-request candidate filtering is
sub-second CPU work.
Designed to run as a GitHub Actions cron once per day. Pushes the
parquet to a HF Dataset repo so the Space can `hf_hub_download` it.
Usage:
python scripts/update_arxiv_pool.py \\
--days-back 30 \\
--max-results 800 \\
--output arxiv_pool.parquet \\
--push-to-hub remyxai/arxiv_pool_daily
"""
from __future__ import annotations
import argparse
import os
import sys
import time
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
import numpy as np
import pandas as pd
# Query endpoint of the arXiv export API; request parameters are appended
# as a URL query string.
ARXIV_API: str = "http://export.arxiv.org/api/query"

# arXiv categories pulled into the candidate pool, picked to cover the
# domains represented in our MHPD corpus (v1.1 partition_meta shows: 45%
# agents/NLP, ~10% CV, ~10% RL, ~7% data/infra).
#
# History: at launch (2026-05-25) cs.CV and cs.DB were excluded, which
# produced poor retrieval for vortex-data/vortex (a columnar database
# project) — its top candidates were KV-compression LLM papers rather than
# actual storage papers. The list was therefore broadened to the full set
# below.
CATEGORIES: list[str] = [
    # broad AI
    "cs.AI",
    # machine learning
    "cs.LG",
    # NLP / language models
    "cs.CL",
    # computer vision / multimodal
    "cs.CV",
    # databases / storage / query engines
    "cs.DB",
    # information retrieval / RAG
    "cs.IR",
    # software engineering
    "cs.SE",
    # statistical ML
    "stat.ML",
]
# XML namespace map for the Atom feed returned by the arXiv API.
_ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}


def _collapse_ws(node) -> str:
    """Return the element's text with every whitespace run collapsed to one space."""
    text = node.text if node is not None else None
    return " ".join(text.split()) if text else ""


def _arxiv_id_from_url(url: str) -> str:
    """Extract the version-less arXiv id from an ``atom:id`` URL."""
    url = url.strip().rstrip("/")
    # Keep everything after "/abs/" so old-style ids ("hep-th/9901001") retain
    # their archive prefix; fall back to the last path segment otherwise.
    _, found, tail = url.partition("/abs/")
    raw_id = tail if found else url.rsplit("/", 1)[-1]
    # Strip a trailing "vN" only when N is purely numeric.
    base, sep, version = raw_id.rpartition("v")
    return base if sep and base and version.isdigit() else raw_id


def _parse_entry(entry) -> "dict | None":
    """Convert one Atom ``<entry>`` into a paper dict, or None if it has no id."""
    id_node = entry.find("atom:id", _ATOM_NS)
    if id_node is None or not id_node.text:
        return None
    summary = _collapse_ws(entry.find("atom:summary", _ATOM_NS))
    # The arXiv API reports query errors as a feed with a single entry whose
    # id points under /api/errors; surface that instead of ingesting it.
    if "/api/errors" in id_node.text:
        raise RuntimeError(f"arXiv API error: {summary}")
    published_node = entry.find("atom:published", _ATOM_NS)
    published_text = published_node.text if published_node is not None else None
    return {
        "arxiv_id": _arxiv_id_from_url(id_node.text),
        "title": _collapse_ws(entry.find("atom:title", _ATOM_NS)),
        "abstract": summary,
        # Timestamps look like 2025-02-17T18:59:59Z; keep the date part only.
        "published_at": (published_text or "").strip()[:10],
        "categories": [
            c.attrib["term"]
            for c in entry.findall("atom:category", _ATOM_NS)
            if "term" in c.attrib
        ],
    }


def fetch_arxiv(days_back: int, max_results: int) -> list[dict]:
    """Query the arXiv API for recent submissions in the configured categories.

    Results are requested newest-first in pages of up to 100 entries, with a
    3-second pause between pages to respect arXiv's rate limits.

    Args:
        days_back: Size of the submission window in days, ending now (UTC).
        max_results: Upper bound on the number of unique papers returned.

    Returns:
        Unique papers in API order (newest first). Each is a dict with keys
        ``arxiv_id`` (version suffix stripped), ``title``, ``abstract``,
        ``published_at`` (``YYYY-MM-DD``) and ``categories`` (list of terms).

    Raises:
        urllib.error.URLError: If an HTTP request to the API fails.
        xml.etree.ElementTree.ParseError: If a response is not valid XML.
        RuntimeError: If the API answers with an error feed.
    """
    # Sample the clock once so both ends of the window are consistent.
    now = datetime.now(timezone.utc)
    # arXiv uses YYYYMMDDHHMM in its query date format
    window_start = (now - timedelta(days=days_back)).strftime("%Y%m%d%H%M")
    window_end = now.strftime("%Y%m%d%H%M")
    cat_query = " OR ".join(f"cat:{c}" for c in CATEGORIES)
    search_query = (
        f"({cat_query}) AND submittedDate:[{window_start} TO {window_end}]"
    )

    papers: list[dict] = []
    seen: set = set()
    batch_size = 100
    start = 0
    while len(papers) < max_results:
        requested = min(batch_size, max_results - len(papers))
        params = {
            "search_query": search_query,
            "start": start,
            "max_results": requested,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        url = f"{ARXIV_API}?{urllib.parse.urlencode(params)}"
        print(f"[arxiv] Fetching results {start}-{start + requested}…")
        with urllib.request.urlopen(url, timeout=30) as r:
            xml_data = r.read()
        entries = ET.fromstring(xml_data).findall("atom:entry", _ATOM_NS)
        if not entries:
            break
        for entry in entries:
            paper = _parse_entry(entry)
            # De-dup while collecting so max_results counts unique papers.
            if paper is None or paper["arxiv_id"] in seen:
                continue
            seen.add(paper["arxiv_id"])
            papers.append(paper)
        start += len(entries)
        # A short page means the result set is exhausted.
        if len(entries) < requested or len(papers) >= max_results:
            break
        time.sleep(3)  # arXiv rate-limit politeness

    print(f"[arxiv] Fetched {len(papers)} unique papers")
    return papers
def embed_abstracts(papers: list[dict]) -> np.ndarray:
    """Embed every paper's abstract with all-MiniLM-L6-v2.

    Runs on CPU; sentence-transformers handles batching.

    Args:
        papers: Paper dicts; only the ``"abstract"`` key is read.

    Returns:
        A float32 array with one row per paper. Embeddings are NOT
        L2-normalized here. For empty input, a ``(0, 384)`` array is returned
        without loading the model.
    """
    # all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings.
    embedding_dim = 384
    if not papers:
        # Nothing to encode: skip the (slow) model load and return a
        # correctly-shaped empty matrix.
        return np.empty((0, embedding_dim), dtype=np.float32)

    # Imported lazily so the heavy dependency is only paid for when needed.
    from sentence_transformers import SentenceTransformer

    print("[embed] Loading all-MiniLM-L6-v2…")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    abstracts = [p["abstract"] for p in papers]
    print(f"[embed] Encoding {len(abstracts)} abstracts…")
    embs = model.encode(
        abstracts,
        batch_size=32,
        show_progress_bar=True,
        normalize_embeddings=False,  # we normalize at use-time in candidate_filter
    )
    return np.asarray(embs, dtype=np.float32)
def main() -> None:
    """CLI entry point: fetch papers, embed abstracts, write parquet, optionally push.

    Command-line flags:
        --days-back: Submission window in days (default 30).
        --max-results: Maximum number of papers to fetch (default 800).
        --output: Path of the parquet file to write (default arxiv_pool.parquet).
        --push-to-hub: HF Dataset repo id to upload the parquet to; requires
            HF_TOKEN or HUGGINGFACE_TOKEN in the environment.

    Exits with an error message if no papers were fetched, or if a push was
    requested but no token is available.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--days-back", type=int, default=30)
    ap.add_argument("--max-results", type=int, default=800)
    ap.add_argument("--output", default="arxiv_pool.parquet")
    ap.add_argument("--push-to-hub", default=None,
                    help="HF Dataset repo id to push the parquet to (e.g., remyxai/arxiv_pool_daily)")
    args = ap.parse_args()

    papers = fetch_arxiv(args.days_back, args.max_results)
    if not papers:
        sys.exit("No papers fetched — check arXiv API connectivity")

    embeddings = embed_abstracts(papers)
    df = pd.DataFrame(papers)
    # One embedding vector per row, stored as a list-valued column.
    df["abstract_embedding"] = list(embeddings)
    df.to_parquet(args.output, index=False)
    print(f"[output] Wrote {len(df)} rows to {args.output}")

    if args.push_to_hub:
        from huggingface_hub import HfApi

        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
        if not token:
            sys.exit("HF_TOKEN required for --push-to-hub")
        api = HfApi(token=token)
        try:
            api.create_repo(args.push_to_hub, repo_type="dataset", private=False, exist_ok=True)
        except Exception as exc:
            # Best-effort: the repo may already exist even if this call fails,
            # so keep going, but report the failure rather than hide it. A
            # genuine problem will surface in upload_file below.
            print(
                f"[hub] Warning: create_repo failed for {args.push_to_hub}: {exc!r}",
                file=sys.stderr,
            )
        api.upload_file(
            path_or_fileobj=args.output,
            path_in_repo="arxiv_pool.parquet",
            repo_id=args.push_to_hub,
            repo_type="dataset",
            commit_message=f"Refresh arxiv pool ({len(df)} papers, last {args.days_back}d)",
            token=token,
        )
        print(f"[hub] Pushed to https://huggingface.co/datasets/{args.push_to_hub}")


if __name__ == "__main__":
    main()