""" update_arxiv_pool.py — Refresh the daily arXiv candidate pool. Pulls the last `--days-back` of cs.AI/cs.LG/cs.CL submissions from the arXiv API, embeds each abstract with all-MiniLM-L6-v2, writes a parquet file with columns [arxiv_id, title, abstract, published_at, categories, abstract_embedding]. The Space loads this parquet at cold-start so per- request candidate filtering is sub-second CPU work. Designed to run as a GitHub Actions cron once per day. Pushes the parquet to a HF Dataset repo so the Space can `hf_hub_download` it. Usage: python scripts/update_arxiv_pool.py \\ --days-back 30 \\ --max-results 800 \\ --output arxiv_pool.parquet \\ --push-to-hub remyxai/arxiv_pool_daily """ from __future__ import annotations import argparse import os import sys import time import urllib.parse import urllib.request import xml.etree.ElementTree as ET from datetime import datetime, timedelta, timezone import numpy as np import pandas as pd ARXIV_API = "http://export.arxiv.org/api/query" # Categories chosen to cover the domains represented in our MHPD corpus # (v1.1 partition_meta shows: 45% agents/NLP, ~10% CV, ~10% RL, ~7% data/ # infra). Excluding cs.CV and cs.DB at launch (2026-05-25) produced # poor retrieval for vortex-data/vortex (a columnar database project) — # top candidates were KV-compression LLM papers, not actual storage # papers. Broadening to the full set below. 
CATEGORIES = [
    "cs.AI",    # broad AI
    "cs.LG",    # machine learning
    "cs.CL",    # NLP / language models
    "cs.CV",    # computer vision / multimodal
    "cs.DB",    # databases / storage / query engines
    "cs.IR",    # information retrieval / RAG
    "cs.SE",    # software engineering
    "stat.ML",  # statistical ML
]

# XML namespace map for the Atom elements read from the arXiv feed.
_ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}

# Trailing version marker on an arXiv id, e.g. the "v2" in "2502.12345v2".
_VERSION_SUFFIX = re.compile(r"v\d+$")

# Number of entries requested from the arXiv API per page.
_BATCH_SIZE = 100


def _arxiv_id_from_url(arxiv_url: str) -> str:
    """Return the version-less arXiv id contained in an Atom ``<id>`` URL.

    Everything after ``/abs/`` is kept so old-style ids retain their archive
    prefix (``cs/0501001``); only a trailing ``v<digits>`` is removed.

    >>> _arxiv_id_from_url("http://arxiv.org/abs/2502.12345v1")
    '2502.12345'
    >>> _arxiv_id_from_url("http://arxiv.org/abs/cs/0501001v2")
    'cs/0501001'
    """
    url = arxiv_url.strip().rstrip("/")
    _, marker, tail = url.partition("/abs/")
    # Fall back to the last path segment if the URL has no "/abs/" part.
    ident = tail if marker else url.rsplit("/", 1)[-1]
    return _VERSION_SUFFIX.sub("", ident)


def _clean_text(element: ET.Element | None) -> str:
    """Return the element's text with whitespace runs collapsed to one space.

    A missing element or an element without text yields ``""``.
    """
    if element is None or element.text is None:
        return ""
    return " ".join(element.text.split())


def _parse_entry(entry: ET.Element) -> dict | None:
    """Convert one Atom ``<entry>`` into a paper dict.

    Returns:
        A dict with keys ``arxiv_id``, ``title``, ``abstract``,
        ``published_at`` (first 10 characters of ``<published>``) and
        ``categories``, or ``None`` when the entry has no ``<id>``.

    Raises:
        RuntimeError: if the entry is an arXiv API error report (its id
            points under ``/api/errors``) rather than a paper.
    """
    id_url = _clean_text(entry.find("atom:id", _ATOM_NS))
    abstract = _clean_text(entry.find("atom:summary", _ATOM_NS))
    if "/api/errors" in id_url:
        raise RuntimeError(f"arXiv API error: {abstract or id_url}")
    if not id_url:
        return None  # nothing to key the paper on
    return {
        "arxiv_id": _arxiv_id_from_url(id_url),
        "title": _clean_text(entry.find("atom:title", _ATOM_NS)),
        "abstract": abstract,
        "published_at": _clean_text(entry.find("atom:published", _ATOM_NS))[:10],
        "categories": [
            c.attrib["term"] for c in entry.findall("atom:category", _ATOM_NS)
        ],
    }


def _dedupe_by_id(papers: list[dict]) -> list[dict]:
    """Drop repeated ``arxiv_id`` values, keeping the first occurrence in order."""
    unique: dict[str, dict] = {}
    for paper in papers:
        unique.setdefault(paper["arxiv_id"], paper)
    return list(unique.values())


def fetch_arxiv(days_back: int, max_results: int) -> list[dict]:
    """Query the arXiv API for recent submissions in the configured categories.

    Args:
        days_back: Size of the submission window, in days, ending now (UTC).
        max_results: Upper bound on the number of entries requested in total.

    Returns:
        Paper dicts (see ``_parse_entry``), newest first, de-duplicated by
        ``arxiv_id``.

    Raises:
        RuntimeError: if the API answers with an error entry.
        urllib.error.URLError: on network / HTTP failure (not retried).
    """
    now = datetime.now(timezone.utc)
    # arXiv uses YYYYMMDDHHMM in its query date format
    start_date = (now - timedelta(days=days_back)).strftime("%Y%m%d%H%M")
    end_date = now.strftime("%Y%m%d%H%M")

    cat_query = " OR ".join(f"cat:{c}" for c in CATEGORIES)
    search_query = f"({cat_query}) AND submittedDate:[{start_date} TO {end_date}]"

    results: list[dict] = []
    start = 0
    while len(results) < max_results:
        page_size = min(_BATCH_SIZE, max_results - len(results))
        params = {
            "search_query": search_query,
            "start": start,
            "max_results": page_size,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        url = f"{ARXIV_API}?{urllib.parse.urlencode(params)}"
        print(f"[arxiv] Fetching results {start}-{start + page_size}…")
        with urllib.request.urlopen(url, timeout=30) as resp:
            root = ET.fromstring(resp.read())

        entries = root.findall("atom:entry", _ATOM_NS)
        if not entries:
            break

        for entry in entries:
            paper = _parse_entry(entry)
            if paper is not None:
                results.append(paper)

        start += len(entries)
        if len(entries) < page_size:
            break  # short page: the result set is exhausted
        time.sleep(3)  # arXiv rate-limit politeness

    unique = _dedupe_by_id(results)
    print(f"[arxiv] Fetched {len(unique)} unique papers")
    return unique


def embed_abstracts(papers: list[dict]) -> np.ndarray:
    """Embed every abstract with all-MiniLM-L6-v2 and return a float32 array.

    Args:
        papers: Paper dicts; only the ``"abstract"`` key is read.

    Returns:
        The encoder output converted to ``float32``, one row per paper, in
        input order. Vectors are left un-normalized.
    """
    # Imported lazily so the module can be imported without the heavy dependency.
    from sentence_transformers import SentenceTransformer

    print("[embed] Loading all-MiniLM-L6-v2…")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    abstracts = [p["abstract"] for p in papers]
    print(f"[embed] Encoding {len(abstracts)} abstracts…")
    embs = model.encode(
        abstracts,
        batch_size=32,
        show_progress_bar=True,
        normalize_embeddings=False,  # we normalize at use-time in candidate_filter
    )
    return embs.astype(np.float32)


def main():
    """CLI entry point: fetch papers, embed them, write parquet, optionally push.

    Exits with a message when no papers are fetched, or when
    ``--push-to-hub`` is given without ``HF_TOKEN`` / ``HUGGINGFACE_TOKEN``
    set in the environment.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--days-back", type=int, default=30)
    ap.add_argument("--max-results", type=int, default=800)
    ap.add_argument("--output", default="arxiv_pool.parquet")
    ap.add_argument(
        "--push-to-hub",
        default=None,
        help="HF Dataset repo id to push the parquet to (e.g., remyxai/arxiv_pool_daily)",
    )
    args = ap.parse_args()

    papers = fetch_arxiv(args.days_back, args.max_results)
    if not papers:
        sys.exit("No papers fetched — check arXiv API connectivity")

    embeddings = embed_abstracts(papers)

    df = pd.DataFrame(papers)
    # One embedding vector per row, stored as an object column of arrays.
    df["abstract_embedding"] = list(embeddings)
    df.to_parquet(args.output, index=False)
    print(f"[output] Wrote {len(df)} rows to {args.output}")

    if args.push_to_hub:
        from huggingface_hub import HfApi

        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
        if not token:
            sys.exit("HF_TOKEN required for --push-to-hub")
        api = HfApi(token=token)
        try:
            api.create_repo(
                args.push_to_hub,
                repo_type="dataset",
                private=False,
                exist_ok=True,
            )
        except Exception as exc:
            # Best-effort: still attempt the upload, but surface the reason
            # instead of hiding it.
            print(
                f"[hub] Warning: could not create/verify repo {args.push_to_hub}: {exc}",
                file=sys.stderr,
            )
        api.upload_file(
            path_or_fileobj=args.output,
            path_in_repo="arxiv_pool.parquet",
            repo_id=args.push_to_hub,
            repo_type="dataset",
            commit_message=f"Refresh arxiv pool ({len(df)} papers, last {args.days_back}d)",
            token=token,
        )
        print(f"[hub] Pushed to https://huggingface.co/datasets/{args.push_to_hub}")


if __name__ == "__main__":
    main()