# scripts/sync_lmarena.py from __future__ import annotations import json import os import re from datetime import datetime, timezone from pathlib import Path from typing import Optional, Tuple from huggingface_hub import HfApi, hf_hub_download UPSTREAM_SPACE = "lmarena-ai/lmarena-leaderboard" # source Space LOCAL_DIR = Path("data/lmarena") LOCAL_CSV = LOCAL_DIR / "leaderboard_table_latest.csv" LOCAL_META = LOCAL_DIR / "sync_meta.json" LEADERBOARD_RE = re.compile(r"^leaderboard_table_(\d{8})\.csv$") def _pick_latest_leaderboard_file(files: list[str]) -> Optional[Tuple[str, str]]: """ Returns (filename, yyyymmdd) for the newest leaderboard_table_YYYYMMDD.csv found. """ candidates: list[Tuple[str, str]] = [] for f in files: m = LEADERBOARD_RE.match(f) if m: candidates.append((f, m.group(1))) if not candidates: return None # Sort by date string; YYYYMMDD sorts lexicographically correctly candidates.sort(key=lambda x: x[1]) return candidates[-1] def main() -> int: token = os.getenv("HF_TOKEN") # optional for public, but recommended for rate limits api = HfApi(token=token) # list files from the upstream *space repo* files = api.list_repo_files(repo_id=UPSTREAM_SPACE, repo_type="space") latest = _pick_latest_leaderboard_file(files) if not latest: raise RuntimeError( f"No leaderboard_table_YYYYMMDD.csv found in upstream Space: {UPSTREAM_SPACE}" ) filename, yyyymmdd = latest # Download the raw file to a temp location (hub cache) then copy to our repo path downloaded_path = hf_hub_download( repo_id=UPSTREAM_SPACE, repo_type="space", filename=filename, token=token, ) LOCAL_DIR.mkdir(parents=True, exist_ok=True) # Copy file contents to our tracked path Path(downloaded_path).replace(LOCAL_CSV) if False else LOCAL_CSV.write_bytes(Path(downloaded_path).read_bytes()) meta = { "source_space": UPSTREAM_SPACE, "source_filename": filename, "source_date": yyyymmdd, "synced_at_utc": datetime.now(timezone.utc).isoformat(), } LOCAL_META.write_text(json.dumps(meta, indent=2), encoding="utf-8") print(f"[OK] Synced {filename} -> {LOCAL_CSV}") print(json.dumps(meta, indent=2)) return 0 if __name__ == "__main__": raise SystemExit(main())