Spaces:
Paused
Paused
| """ | |
| update_arxiv_pool.py — Refresh the daily arXiv candidate pool. | |
| Pulls the last `--days-back` days of submissions in the categories listed | |
| in CATEGORIES from the arXiv API, embeds each abstract with all-MiniLM-L6-v2, and writes a parquet | |
| file with columns [arxiv_id, title, abstract, published_at, categories, | |
| abstract_embedding]. The Space loads this parquet at cold-start so per- | |
| request candidate filtering is sub-second CPU work. | |
| Designed to run as a GitHub Actions cron once per day. Pushes the | |
| parquet to a HF Dataset repo so the Space can `hf_hub_download` it. | |
| Usage: | |
| python scripts/update_arxiv_pool.py \\ | |
| --days-back 30 \\ | |
| --max-results 800 \\ | |
| --output arxiv_pool.parquet \\ | |
| --push-to-hub remyxai/arxiv_pool_daily | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import os | |
| import sys | |
| import time | |
| import urllib.parse | |
| import urllib.request | |
| import xml.etree.ElementTree as ET | |
| from datetime import datetime, timedelta, timezone | |
| import numpy as np | |
| import pandas as pd | |
# Query endpoint of the arXiv API. HTTPS (same host and path as the legacy
# http:// URL) so the query and the Atom response that ends up in the
# published pool are not transferred in clear text.
ARXIV_API = "https://export.arxiv.org/api/query"

# Categories chosen to cover the domains represented in our MHPD corpus
# (v1.1 partition_meta shows: 45% agents/NLP, ~10% CV, ~10% RL, ~7% data/
# infra). Excluding cs.CV and cs.DB at launch (2026-05-25) produced
# poor retrieval for vortex-data/vortex (a columnar database project) —
# top candidates were KV-compression LLM papers, not actual storage
# papers. Broadening to the full set below.
CATEGORIES = [
    "cs.AI",    # broad AI
    "cs.LG",    # machine learning
    "cs.CL",    # NLP / language models
    "cs.CV",    # computer vision / multimodal
    "cs.DB",    # databases / storage / query engines
    "cs.IR",    # information retrieval / RAG
    "cs.SE",    # software engineering
    "stat.ML",  # statistical ML
]
def _arxiv_id_from_url(abs_url: str) -> str:
    """Return the version-less arXiv identifier embedded in an abstract URL."""
    trimmed = abs_url.strip().rstrip("/")
    # Keep everything after "/abs/" so old-style identifiers retain their
    # archive prefix ("cs/9901001") instead of collapsing to "9901001".
    if "/abs/" in trimmed:
        ident = trimmed.split("/abs/", 1)[1]
    else:
        ident = trimmed.rsplit("/", 1)[-1]
    # Only strip a trailing "v<digits>"; any other "v" belongs to the id.
    base, sep, version = ident.rpartition("v")
    if sep and base and version.isdigit():
        return base
    return ident


def _parse_entry(entry: ET.Element, ns: dict) -> dict | None:
    """Turn one Atom <entry> into a paper record, or None if it has no id."""
    abs_url = entry.findtext("atom:id", default="", namespaces=ns).strip()
    if not abs_url:
        return None
    summary = " ".join(
        entry.findtext("atom:summary", default="", namespaces=ns).split()
    )
    # arXiv reports request errors as a normal feed holding a single entry
    # whose id lives under /api/errors; never store that as a paper.
    if "/api/errors" in abs_url:
        raise RuntimeError(f"arXiv API returned an error: {summary or abs_url}")
    # Atom text is hard-wrapped with indentation; collapse every run of
    # whitespace so titles/abstracts do not carry embedded padding.
    title = " ".join(
        entry.findtext("atom:title", default="", namespaces=ns).split()
    )
    published = entry.findtext("atom:published", default="", namespaces=ns)[:10]
    cats = [
        c.attrib["term"]
        for c in entry.findall("atom:category", ns)
        if "term" in c.attrib
    ]
    return {
        "arxiv_id": _arxiv_id_from_url(abs_url),
        "title": title,
        "abstract": summary,
        "published_at": published,
        "categories": cats,
    }


def fetch_arxiv(days_back: int, max_results: int) -> list[dict]:
    """Query the arXiv API for recent submissions in the configured categories.

    Pages through the Atom feed newest-first, at most 100 entries per
    request, pausing between requests, until ``max_results`` unique papers
    have been collected or the feed runs out.

    Args:
        days_back: Width, in days, of the submission-date window ending now (UTC).
        max_results: Upper bound on the number of unique papers returned.

    Returns:
        One dict per paper with keys ``arxiv_id`` (version suffix removed),
        ``title``, ``abstract``, ``published_at`` (first 10 characters of the
        Atom ``published`` value) and ``categories`` (list of category terms),
        in feed order.

    Raises:
        RuntimeError: If arXiv answers with an error feed instead of results.
        Network and XML parse errors from ``urllib`` / ``ElementTree``
        propagate unchanged.
    """
    if max_results <= 0:
        # Nothing requested: skip building the query and hitting the network.
        print("[arxiv] Fetched 0 unique papers")
        return []

    now = datetime.now(timezone.utc)
    cutoff = now - timedelta(days=days_back)
    # arXiv uses YYYYMMDDHHMM in its query date format
    start_date = cutoff.strftime("%Y%m%d%H%M")
    end_date = now.strftime("%Y%m%d%H%M")
    cat_query = " OR ".join(f"cat:{c}" for c in CATEGORIES)
    search_query = f"({cat_query}) AND submittedDate:[{start_date} TO {end_date}]"

    ns = {"atom": "http://www.w3.org/2005/Atom"}
    batch_size = 100
    papers: list[dict] = []
    seen: set[str] = set()
    start = 0
    while len(papers) < max_results:
        page_size = min(batch_size, max_results - len(papers))
        params = {
            "search_query": search_query,
            "start": start,
            "max_results": page_size,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        url = f"{ARXIV_API}?{urllib.parse.urlencode(params)}"
        print(f"[arxiv] Fetching results {start}-{start + page_size}…")
        with urllib.request.urlopen(url, timeout=30) as r:
            root = ET.fromstring(r.read())
        entries = root.findall("atom:entry", ns)
        if not entries:
            break
        for entry in entries:
            paper = _parse_entry(entry, ns)
            # De-dup while collecting so max_results counts unique papers
            # (the same paper can reappear across pages).
            if paper is None or paper["arxiv_id"] in seen:
                continue
            seen.add(paper["arxiv_id"])
            papers.append(paper)
        start += len(entries)
        # A short page means the feed is exhausted; a full pool means we are
        # done. Either way there is no next request to be polite about.
        if len(entries) < page_size or len(papers) >= max_results:
            break
        time.sleep(3)  # arXiv rate-limit politeness

    unique = papers[:max_results]
    print(f"[arxiv] Fetched {len(unique)} unique papers")
    return unique
def embed_abstracts(papers: list[dict]) -> np.ndarray:
    """Encode the ``abstract`` field of every paper with all-MiniLM-L6-v2.

    The model is loaded on each call via sentence-transformers, which also
    takes care of batching. Returns whatever ``encode`` produces, cast to
    float32; vectors are deliberately left un-normalized.
    """
    from sentence_transformers import SentenceTransformer

    print("[embed] Loading all-MiniLM-L6-v2…")
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    texts = [paper["abstract"] for paper in papers]
    print(f"[embed] Encoding {len(texts)} abstracts…")

    encode_options = {
        "batch_size": 32,
        "show_progress_bar": True,
        # we normalize at use-time in candidate_filter
        "normalize_embeddings": False,
    }
    vectors = encoder.encode(texts, **encode_options)
    return vectors.astype(np.float32)
def main() -> None:
    """Fetch the pool, embed it, write the parquet and optionally push it.

    Command-line flags: ``--days-back``, ``--max-results``, ``--output`` and
    ``--push-to-hub`` (HF Dataset repo id). Pushing needs a token in the
    ``HF_TOKEN`` or ``HUGGINGFACE_TOKEN`` environment variable.

    Exits with an error message when no papers were fetched or when a push
    is requested without a token.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--days-back", type=int, default=30)
    ap.add_argument("--max-results", type=int, default=800)
    ap.add_argument("--output", default="arxiv_pool.parquet")
    ap.add_argument("--push-to-hub", default=None,
                    help="HF Dataset repo id to push the parquet to (e.g., remyxai/arxiv_pool_daily)")
    args = ap.parse_args()

    # Validate the push prerequisites before the slow fetch/embed steps so a
    # misconfigured job fails in seconds instead of after all the work.
    token = None
    if args.push_to_hub:
        from huggingface_hub import HfApi

        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
        if not token:
            sys.exit("HF_TOKEN required for --push-to-hub")

    papers = fetch_arxiv(args.days_back, args.max_results)
    if not papers:
        sys.exit("No papers fetched — check arXiv API connectivity")

    embeddings = embed_abstracts(papers)
    df = pd.DataFrame(papers)
    # One embedding array per row, stored as an object column.
    df["abstract_embedding"] = list(embeddings)
    df.to_parquet(args.output, index=False)
    print(f"[output] Wrote {len(df)} rows to {args.output}")

    if not args.push_to_hub:
        return

    api = HfApi(token=token)
    try:
        api.create_repo(args.push_to_hub, repo_type="dataset", private=False, exist_ok=True)
    except Exception as exc:
        # Best-effort: repo creation may fail while the upload still works,
        # so report the failure instead of hiding it and let the upload decide.
        print(f"[hub] create_repo failed ({exc!r}); attempting upload anyway",
              file=sys.stderr)
    api.upload_file(
        path_or_fileobj=args.output,
        path_in_repo="arxiv_pool.parquet",
        repo_id=args.push_to_hub,
        repo_type="dataset",
        commit_message=f"Refresh arxiv pool ({len(df)} papers, last {args.days_back}d)",
        token=token,
    )
    print(f"[hub] Pushed to https://huggingface.co/datasets/{args.push_to_hub}")
# Run the refresh only when this file is executed as a script, so importing
# the module does not trigger any work.
if __name__ == "__main__":
    main()