Spaces:

remyxai
/

Feature-Finder

Paused

App Files Files Community

Feature-Finder / scripts /update_arxiv_pool.py

salma-remyx

Replace with MHPD paper recommender (v1.2 backend) to reuse Zero-GPU allocation

f4145b9 verified 9 days ago

raw

history blame contribute delete

6.68 kB

	"""
	update_arxiv_pool.py — Refresh the daily arXiv candidate pool.

	Pulls the last `--days-back` days of submissions in the configured
	CATEGORIES (cs.AI, cs.LG, cs.CL, cs.CV, cs.DB, cs.IR, cs.SE, stat.ML) from
	the arXiv API, embeds each abstract with all-MiniLM-L6-v2, and writes a
	parquet file with columns [arxiv_id, title, abstract, published_at,
	categories, abstract_embedding]. The Space loads this parquet at cold-start
	so per-request candidate filtering is sub-second CPU work.

	Designed to run as a GitHub Actions cron once per day. Pushes the
	parquet to a HF Dataset repo so the Space can `hf_hub_download` it.

	Usage:
	python scripts/update_arxiv_pool.py \\
	--days-back 30 \\
	--max-results 800 \\
	--output arxiv_pool.parquet \\
	--push-to-hub remyxai/arxiv_pool_daily
	"""
	from __future__ import annotations

	import argparse
	import os
	import sys
	import time
	import urllib.parse
	import urllib.request
	import xml.etree.ElementTree as ET
	from datetime import datetime, timedelta, timezone

	import numpy as np
	import pandas as pd

	# Endpoint that fetch_arxiv() sends its paged queries to.
	ARXIV_API = "http://export.arxiv.org/api/query"

	# arXiv categories OR-ed together in the search query.
	#
	# The set is meant to cover the domains represented in our MHPD corpus
	# (v1.1 partition_meta shows: 45% agents/NLP, ~10% CV, ~10% RL, ~7% data/
	# infra). At launch (2026-05-25) cs.CV and cs.DB were excluded, which gave
	# poor retrieval for vortex-data/vortex (a columnar database project): the
	# top candidates were KV-compression LLM papers rather than storage papers.
	# The list was therefore broadened to the full set below.
	CATEGORIES = [
	    # broad AI
	    "cs.AI",
	    # machine learning
	    "cs.LG",
	    # NLP / language models
	    "cs.CL",
	    # computer vision / multimodal
	    "cs.CV",
	    # databases / storage / query engines
	    "cs.DB",
	    # information retrieval / RAG
	    "cs.IR",
	    # software engineering
	    "cs.SE",
	    # statistical ML
	    "stat.ML",
	]


	def fetch_arxiv(days_back: int, max_results: int) -> list[dict]:
	    """Query the arXiv API for recent submissions in the configured categories.

	    Pages through the API newest-first (sorted by submittedDate, descending)
	    in pages of at most 100 entries until ``max_results`` unique papers have
	    been collected or the feed runs out. Entries that lack an id, title,
	    abstract or publication date are skipped instead of aborting the run.

	    Args:
	        days_back: Length of the submission window in days, ending now (UTC).
	        max_results: Upper bound on the number of unique papers returned.
	            A value <= 0 returns an empty list without touching the network.

	    Returns:
	        One dict per paper, in API order, with keys ``arxiv_id`` (version
	        suffix removed), ``title``, ``abstract``, ``published_at``
	        (first 10 characters of the Atom ``published`` timestamp) and
	        ``categories`` (list of category terms).

	    Raises:
	        OSError: If an HTTP request fails or times out (includes
	            ``urllib.error.URLError``).
	        xml.etree.ElementTree.ParseError: If a response body is not valid XML.
	    """
	    window_end = datetime.now(timezone.utc)
	    window_start = window_end - timedelta(days=days_back)
	    # arXiv uses YYYYMMDDHHMM in its query date format
	    start_date = window_start.strftime("%Y%m%d%H%M")
	    end_date = window_end.strftime("%Y%m%d%H%M")
	    cat_query = " OR ".join(f"cat:{c}" for c in CATEGORIES)
	    search_query = f"({cat_query}) AND submittedDate:[{start_date} TO {end_date}]"
	    ns = {"atom": "http://www.w3.org/2005/Atom"}

	    batch_size = 100
	    start = 0
	    skipped = 0
	    seen: set[str] = set()
	    unique: list[dict] = []
	    # De-duplicate while paging so that max_results caps *unique* papers.
	    while len(unique) < max_results:
	        page_size = min(batch_size, max_results - len(unique))
	        params = {
	            "search_query": search_query,
	            "start": start,
	            "max_results": page_size,
	            "sortBy": "submittedDate",
	            "sortOrder": "descending",
	        }
	        url = f"{ARXIV_API}?{urllib.parse.urlencode(params)}"
	        print(f"[arxiv] Fetching results {start}-{start + page_size}…")
	        with urllib.request.urlopen(url, timeout=30) as r:
	            xml_data = r.read()

	        entries = ET.fromstring(xml_data).findall("atom:entry", ns)
	        if not entries:
	            break

	        for entry in entries:
	            paper = _parse_arxiv_entry(entry, ns)
	            if paper is None:
	                skipped += 1
	                continue
	            if paper["arxiv_id"] in seen:
	                continue
	            seen.add(paper["arxiv_id"])
	            unique.append(paper)

	        # Advance by what was actually returned so no entry is skipped when a
	        # page smaller than batch_size was requested.
	        start += len(entries)
	        if len(entries) < page_size:
	            break  # short page: the feed is exhausted
	        time.sleep(3)  # arXiv rate-limit politeness

	    if skipped:
	        print(f"[arxiv] Skipped {skipped} malformed entries")
	    print(f"[arxiv] Fetched {len(unique)} unique papers")
	    return unique


	def _entry_text(entry: ET.Element, tag: str, ns: dict[str, str]) -> str:
	    """Return the whitespace-collapsed text of child ``tag``, or "" if absent."""
	    node = entry.find(tag, ns)
	    if node is None or node.text is None:
	        return ""
	    # split()/join collapses the newline-plus-indent runs arXiv uses to wrap
	    # long titles and abstracts into single spaces.
	    return " ".join(node.text.split())


	def _arxiv_id_from_url(arxiv_url: str) -> str:
	    """Extract the arXiv id from an abs URL, dropping a trailing ``vN``."""
	    trimmed = arxiv_url.rstrip("/")
	    # URL looks like http://arxiv.org/abs/2502.12345v1. Old-style ids contain
	    # a slash (hep-th/9901001v1), so cut at "/abs/" rather than the last "/".
	    _, marker, tail = trimmed.partition("/abs/")
	    raw_id = tail if marker else trimmed.split("/")[-1]
	    # Only strip a suffix that really is a version: "v" followed by digits.
	    base, sep, version = raw_id.rpartition("v")
	    if sep and base and version.isdigit():
	        return base
	    return raw_id


	def _parse_arxiv_entry(entry: ET.Element, ns: dict[str, str]) -> "dict | None":
	    """Convert one Atom ``<entry>`` into a paper dict; None if fields are missing."""
	    arxiv_url = _entry_text(entry, "atom:id", ns)
	    title = _entry_text(entry, "atom:title", ns)
	    summary = _entry_text(entry, "atom:summary", ns)
	    published = _entry_text(entry, "atom:published", ns)
	    if not (arxiv_url and title and summary and published):
	        return None
	    cats = [
	        c.attrib["term"]
	        for c in entry.findall("atom:category", ns)
	        if "term" in c.attrib
	    ]
	    return {
	        "arxiv_id": _arxiv_id_from_url(arxiv_url),
	        "title": title,
	        "abstract": summary,
	        "published_at": published[:10],
	        "categories": cats,
	    }


	def embed_abstracts(papers: list[dict]) -> np.ndarray:
	    """Encode the ``abstract`` field of every paper with all-MiniLM-L6-v2.

	    Args:
	        papers: Paper dicts; each must contain an ``"abstract"`` key.

	    Returns:
	        The array produced by ``SentenceTransformer.encode`` for the abstracts
	        (in input order), cast to float32. Vectors are left un-normalized.
	    """
	    # Imported here rather than at module level, so importing this module
	    # does not require sentence-transformers to be installed.
	    from sentence_transformers import SentenceTransformer

	    print("[embed] Loading all-MiniLM-L6-v2…")
	    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

	    texts = []
	    for paper in papers:
	        texts.append(paper["abstract"])
	    print(f"[embed] Encoding {len(texts)} abstracts…")

	    encode_options = {
	        "batch_size": 32,
	        "show_progress_bar": True,
	        # we normalize at use-time in candidate_filter
	        "normalize_embeddings": False,
	    }
	    vectors = encoder.encode(texts, **encode_options)
	    return vectors.astype(np.float32)


	def main() -> None:
	    """CLI entry point: fetch papers, embed abstracts, write and optionally push the parquet.

	    Command-line options:
	        --days-back     Submission window in days (default 30).
	        --max-results   Cap passed to ``fetch_arxiv`` (default 800).
	        --output        Path of the parquet file to write (default
	                        ``arxiv_pool.parquet``).
	        --push-to-hub   HF Dataset repo id to upload the parquet to; when
	                        omitted nothing is uploaded.

	    Exits with an error message if no papers were fetched, or if
	    ``--push-to-hub`` is given but neither ``HF_TOKEN`` nor
	    ``HUGGINGFACE_TOKEN`` is set in the environment.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--days-back", type=int, default=30)
	    ap.add_argument("--max-results", type=int, default=800)
	    ap.add_argument("--output", default="arxiv_pool.parquet")
	    ap.add_argument("--push-to-hub", default=None,
	                    help="HF Dataset repo id to push the parquet to (e.g., remyxai/arxiv_pool_daily)")
	    args = ap.parse_args()

	    papers = fetch_arxiv(args.days_back, args.max_results)
	    if not papers:
	        sys.exit("No papers fetched — check arXiv API connectivity")

	    embeddings = embed_abstracts(papers)
	    df = pd.DataFrame(papers)
	    # list(...) splits the 2-D array into one vector per row so each cell of
	    # the column holds that paper's embedding.
	    df["abstract_embedding"] = list(embeddings)
	    df.to_parquet(args.output, index=False)
	    print(f"[output] Wrote {len(df)} rows to {args.output}")

	    if args.push_to_hub:
	        from huggingface_hub import HfApi
	        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
	        if not token:
	            sys.exit("HF_TOKEN required for --push-to-hub")
	        api = HfApi(token=token)
	        try:
	            api.create_repo(args.push_to_hub, repo_type="dataset", private=False, exist_ok=True)
	        except Exception as exc:
	            # Still best-effort (the upload below is attempted regardless and
	            # will raise if the repo is truly unusable), but report the failure
	            # instead of hiding it so auth/permission problems are diagnosable.
	            print(
	                f"[hub] create_repo failed for {args.push_to_hub}: {exc!r}; "
	                "attempting upload anyway",
	                file=sys.stderr,
	            )
	        api.upload_file(
	            path_or_fileobj=args.output,
	            path_in_repo="arxiv_pool.parquet",
	            repo_id=args.push_to_hub,
	            repo_type="dataset",
	            commit_message=f"Refresh arxiv pool ({len(df)} papers, last {args.days_back}d)",
	            token=token,
	        )
	        print(f"[hub] Pushed to https://huggingface.co/datasets/{args.push_to_hub}")


	# Run the pool refresh only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()