"""
update_arxiv_pool.py — Refresh the daily arXiv candidate pool.

Pulls the last `--days-back` days of submissions in the configured
categories (see `CATEGORIES`) from the arXiv API, embeds each abstract
with all-MiniLM-L6-v2, and writes a parquet file with columns [arxiv_id,
title, abstract, published_at, categories, abstract_embedding]. The Space
loads this parquet at cold-start so per-request candidate filtering is
sub-second CPU work.

Designed to run as a GitHub Actions cron once per day. Pushes the
parquet to a HF Dataset repo so the Space can `hf_hub_download` it.

Usage:
  python scripts/update_arxiv_pool.py \\
      --days-back 30 \\
      --max-results 800 \\
      --output arxiv_pool.parquet \\
      --push-to-hub remyxai/arxiv_pool_daily
"""
from __future__ import annotations

import argparse
import os
import sys
import time
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd

# Base URL of the arXiv query endpoint; request parameters are appended as a
# query string by the fetch code.
ARXIV_API = "http://export.arxiv.org/api/query"

# arXiv categories pulled into the pool. The set is meant to cover the
# domains represented in our MHPD corpus (v1.1 partition_meta shows: 45%
# agents/NLP, ~10% CV, ~10% RL, ~7% data/infra).
#
# History: the launch configuration (2026-05-25) left out cs.CV and cs.DB,
# and retrieval for vortex-data/vortex (a columnar database project) was
# poor as a result: the top candidates were KV-compression LLM papers
# rather than actual storage papers. The list was broadened to the full
# set below in response.
CATEGORIES = [
    # broad AI
    "cs.AI",
    # machine learning
    "cs.LG",
    # NLP / language models
    "cs.CL",
    # computer vision / multimodal
    "cs.CV",
    # databases / storage / query engines
    "cs.DB",
    # information retrieval / RAG
    "cs.IR",
    # software engineering
    "cs.SE",
    # statistical ML
    "stat.ML",
]


# XML namespace map for the Atom feed returned by the arXiv API.
_ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}


def _strip_version(raw_id: str) -> str:
    """Drop a trailing ``vN`` version suffix from an arXiv identifier."""
    base, sep, version = raw_id.rpartition("v")
    # Only strip when what follows the last "v" is purely numeric, so an id
    # that merely contains the letter "v" is left untouched.
    if sep and base and version.isdigit():
        return base
    return raw_id


def _entry_text(entry: ET.Element, tag: str) -> "str | None":
    """Return the whitespace-collapsed text of an Atom child, or None if absent."""
    node = entry.find(f"atom:{tag}", _ATOM_NS)
    if node is None or node.text is None:
        return None
    # Feed text is hard-wrapped; collapse newlines *and* the indentation that
    # follows them into single spaces.
    return " ".join(node.text.split())


def _parse_entry(entry: ET.Element) -> "dict | None":
    """Convert one Atom ``<entry>`` into a paper dict, or None if unusable."""
    id_url = _entry_text(entry, "id")
    title = _entry_text(entry, "title")
    summary = _entry_text(entry, "summary")
    published = _entry_text(entry, "published")
    if not (id_url and title and summary and published):
        return None
    # Paper ids look like http://arxiv.org/abs/2502.12345v1. Anything that is
    # not under /abs/ (e.g. an API error entry) is not a paper.
    if "/abs/" not in id_url:
        return None
    raw_id = id_url.rstrip("/").partition("/abs/")[2]
    categories = [
        c.attrib["term"]
        for c in entry.findall("atom:category", _ATOM_NS)
        if "term" in c.attrib
    ]
    return {
        "arxiv_id": _strip_version(raw_id),
        "title": title,
        "abstract": summary,
        "published_at": published[:10],  # keep the YYYY-MM-DD date part only
        "categories": categories,
    }


def _fetch_page(url: str, attempts: int = 3, timeout: float = 30) -> bytes:
    """GET *url* and return the body, retrying transient network failures."""
    for attempt in range(1, attempts + 1):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                return resp.read()
        except OSError as err:  # URLError / HTTPError / timeouts are all OSError
            if attempt == attempts:
                raise
            wait = 3 * attempt
            print(f"[arxiv] Request failed ({err}); retrying in {wait}s…")
            time.sleep(wait)
    raise ValueError("attempts must be >= 1")


def fetch_arxiv(days_back: int, max_results: int) -> list[dict]:
    """Query the arXiv API for recent submissions in the configured categories.

    Args:
        days_back: Size of the submission window, in days, ending now (UTC).
        max_results: Upper bound on the number of unique papers returned.

    Returns:
        A list of dicts with keys ``arxiv_id`` (version suffix removed),
        ``title``, ``abstract``, ``published_at`` (``YYYY-MM-DD``) and
        ``categories`` (list of category terms), newest submissions first and
        de-duplicated by ``arxiv_id``.

    Raises:
        OSError: If a page still cannot be fetched after the retries.
        xml.etree.ElementTree.ParseError: If a response is not valid XML.
    """
    # Sample the clock once so both ends of the window are consistent.
    now = datetime.now(timezone.utc)
    # arXiv uses YYYYMMDDHHMM in its query date format
    window_start = (now - timedelta(days=days_back)).strftime("%Y%m%d%H%M")
    window_end = now.strftime("%Y%m%d%H%M")
    cat_query = " OR ".join(f"cat:{c}" for c in CATEGORIES)
    search_query = f"({cat_query}) AND submittedDate:[{window_start} TO {window_end}]"

    papers: list[dict] = []
    seen: set[str] = set()
    skipped = 0
    batch_size = 100
    start = 0
    while len(papers) < max_results:
        page_size = min(batch_size, max_results - len(papers))
        params = {
            "search_query": search_query,
            "start": start,
            "max_results": page_size,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        url = f"{ARXIV_API}?{urllib.parse.urlencode(params)}"
        print(f"[arxiv] Fetching results {start}-{start + page_size}…")

        root = ET.fromstring(_fetch_page(url))
        entries = root.findall("atom:entry", _ATOM_NS)
        if not entries:
            break

        for entry in entries:
            paper = _parse_entry(entry)
            if paper is None:
                skipped += 1
                continue
            # De-dup while fetching so duplicates do not count against
            # max_results.
            if paper["arxiv_id"] in seen:
                continue
            seen.add(paper["arxiv_id"])
            papers.append(paper)

        # Advance by what was actually returned, not by the nominal batch size.
        start += len(entries)
        if len(entries) < page_size:
            break
        time.sleep(3)   # arXiv rate-limit politeness

    if skipped:
        print(f"[arxiv] Skipped {skipped} malformed or non-paper entries")
    print(f"[arxiv] Fetched {len(papers)} unique papers")
    return papers


def embed_abstracts(papers: list[dict]) -> np.ndarray:
    """Encode the ``abstract`` field of every paper with all-MiniLM-L6-v2.

    Args:
        papers: Paper dicts; each must carry an ``"abstract"`` key.

    Returns:
        The array produced by ``SentenceTransformer.encode`` for the
        abstracts, cast to ``float32``. Vectors are left un-normalized.
    """
    # Imported lazily so the heavy dependency is only loaded when embedding.
    from sentence_transformers import SentenceTransformer

    print("[embed] Loading all-MiniLM-L6-v2…")
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    abstracts = []
    for paper in papers:
        abstracts.append(paper["abstract"])
    print(f"[embed] Encoding {len(abstracts)} abstracts…")

    encode_options = {
        "batch_size": 32,
        "show_progress_bar": True,
        # we normalize at use-time in candidate_filter
        "normalize_embeddings": False,
    }
    vectors = encoder.encode(abstracts, **encode_options)
    return vectors.astype(np.float32)


def main() -> None:
    """Fetch recent papers, embed them, write the parquet, optionally push it.

    Command-line flags:
        --days-back: Submission window in days (default 30).
        --max-results: Cap on the number of papers fetched (default 800).
        --output: Path of the parquet file to write.
        --push-to-hub: HF Dataset repo id to upload the parquet to; requires
            ``HF_TOKEN`` (or ``HUGGINGFACE_TOKEN``) in the environment.

    Exits with an error message if no papers were fetched, or if a push was
    requested without a token.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--days-back", type=int, default=30)
    ap.add_argument("--max-results", type=int, default=800)
    ap.add_argument("--output", default="arxiv_pool.parquet")
    ap.add_argument("--push-to-hub", default=None,
                    help="HF Dataset repo id to push the parquet to (e.g., remyxai/arxiv_pool_daily)")
    args = ap.parse_args()

    papers = fetch_arxiv(args.days_back, args.max_results)
    if not papers:
        sys.exit("No papers fetched — check arXiv API connectivity")

    embeddings = embed_abstracts(papers)
    df = pd.DataFrame(papers)
    # One embedding vector per row, stored as a list-valued column.
    df["abstract_embedding"] = list(embeddings)
    df.to_parquet(args.output, index=False)
    print(f"[output] Wrote {len(df)} rows to {args.output}")

    if args.push_to_hub:
        from huggingface_hub import HfApi
        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
        if not token:
            sys.exit("HF_TOKEN required for --push-to-hub")
        api = HfApi(token=token)
        try:
            api.create_repo(args.push_to_hub, repo_type="dataset", private=False, exist_ok=True)
        except Exception as err:
            # Repo creation stays best-effort (the upload below will fail
            # loudly if the repo is really unusable), but report the error
            # instead of hiding it so auth/permission problems are visible
            # in the job log.
            print(
                f"[hub] create_repo failed for {args.push_to_hub} ({err!r}); "
                "attempting upload anyway",
                file=sys.stderr,
            )
        api.upload_file(
            path_or_fileobj=args.output,
            path_in_repo="arxiv_pool.parquet",
            repo_id=args.push_to_hub,
            repo_type="dataset",
            commit_message=f"Refresh arxiv pool ({len(df)} papers, last {args.days_back}d)",
            token=token,
        )
        print(f"[hub] Pushed to https://huggingface.co/datasets/{args.push_to_hub}")


# Script entry point: run the refresh only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()