Spaces:

ZENLLC
/

LMArenaLeaderboard

Sleeping

File size: 2,413 Bytes

4f09bb6

# scripts/sync_lmarena.py
from __future__ import annotations

import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

from huggingface_hub import HfApi, hf_hub_download

# Repo id of the Space whose files are listed and downloaded by this script.
UPSTREAM_SPACE = "lmarena-ai/lmarena-leaderboard"  # source Space

# Local output locations; all of them live under LOCAL_DIR.
LOCAL_DIR = Path("data/lmarena")
LOCAL_CSV = LOCAL_DIR.joinpath("leaderboard_table_latest.csv")
LOCAL_META = LOCAL_DIR.joinpath("sync_meta.json")

# Matches names of the form leaderboard_table_YYYYMMDD.csv and captures the
# eight-digit stamp as group 1.
LEADERBOARD_RE = re.compile(r"^leaderboard_table_(\d{8})\.csv$")


def _pick_latest_leaderboard_file(files: list[str]) -> Optional[Tuple[str, str]]:
    """
    Choose the newest dated leaderboard CSV among the given file names.

    Only names accepted by LEADERBOARD_RE are considered. The result is a
    (filename, yyyymmdd) tuple for the entry with the greatest date stamp,
    or None when no name matches the pattern.
    """
    hits = (LEADERBOARD_RE.match(name) for name in files)
    # hit.string is the exact name that was handed to match()
    dated = [(hit.string, hit.group(1)) for hit in hits if hit]

    if not dated:
        return None

    # YYYYMMDD stamps compare correctly as plain strings. max() keeps the first
    # maximal item it meets, so scanning in reverse makes the last of any equal
    # stamps win.
    return max(reversed(dated), key=lambda pair: pair[1])


def main() -> int:
    """
    Sync the newest upstream leaderboard CSV into the local data directory.

    Lists the files of UPSTREAM_SPACE, picks the newest
    leaderboard_table_YYYYMMDD.csv, downloads it, copies its bytes to
    LOCAL_CSV, and writes a small provenance record to LOCAL_META. The
    HF_TOKEN environment variable is used as the access token when set.

    Returns:
        0 on success.

    Raises:
        RuntimeError: if no leaderboard_table_YYYYMMDD.csv exists upstream.
    """
    token = os.getenv("HF_TOKEN")  # optional for public, but recommended for rate limits
    api = HfApi(token=token)

    # list files from the upstream *space repo*
    files = api.list_repo_files(repo_id=UPSTREAM_SPACE, repo_type="space")

    latest = _pick_latest_leaderboard_file(files)
    if latest is None:
        raise RuntimeError(
            f"No leaderboard_table_YYYYMMDD.csv found in upstream Space: {UPSTREAM_SPACE}"
        )

    filename, yyyymmdd = latest

    # Download the raw file (it lands in the hub cache), then copy it to our repo path
    downloaded_path = hf_hub_download(
        repo_id=UPSTREAM_SPACE,
        repo_type="space",
        filename=filename,
        token=token,
    )

    LOCAL_DIR.mkdir(parents=True, exist_ok=True)

    # Copy the bytes instead of moving the file, so the downloaded copy is
    # left in place and only our tracked path is (over)written.
    LOCAL_CSV.write_bytes(Path(downloaded_path).read_bytes())

    # Provenance of this sync: where the data came from and when it was fetched.
    meta = {
        "source_space": UPSTREAM_SPACE,
        "source_filename": filename,
        "source_date": yyyymmdd,
        "synced_at_utc": datetime.now(timezone.utc).isoformat(),
    }
    LOCAL_META.write_text(json.dumps(meta, indent=2), encoding="utf-8")

    print(f"[OK] Synced {filename} -> {LOCAL_CSV}")
    print(json.dumps(meta, indent=2))
    return 0


# Script entry point: run the sync and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)