# Source: Hugging Face Space agarwalanu3103/clarify-rl (status: Running)
# File size: 9,145 bytes · commit 099bec8

#!/usr/bin/env python
"""Run a vLLM-powered eval inside an HF Job and push results to the Hub.

Why this exists
---------------
HF Inference Router does NOT serve fine-tuned community uploads — only
provider-listed models (verified via `model_not_supported` 400 from the
router for our own GRPO checkpoint). So our trained Qwen3 GRPO models
must be evaluated via vLLM that we host ourselves. The cheapest /
cleanest path is one short HF Job per checkpoint that:

  1. Bootstraps the project repo (this file's source repo) from the
     public HF Space `agarwalanu3103/clarify-rl` so it has scenarios,
     `run_eval.py`, and `inference.py` available locally.
  2. Boots the vLLM OpenAI-compatible HTTP server as a child process,
     loading the fine-tuned model from its Hub repo.
  3. Connects to the env Space WS exactly like the submission validator.
  4. Replays N eval scenarios via `scripts/run_eval.py --mode api`.
  5. Pushes the results JSON to the model repo's `evals/` folder so the
     submission/validator and `make_plots.py` can find it without us
     shipping artifacts back to the laptop.

Usage (inside an HF Job, via scripts/launch_eval_job.sh):

  HF_TOKEN=hf_xxx \\
  MODEL_NAME=agarwalanu3103/clarify-rl-grpo-qwen3-0-6b \\
  ENV_BASE_URL=https://agarwalanu3103-clarify-rl.hf.space \\
  LIMIT=50 \\
  python scripts/eval_with_vllm.py

Env vars consumed:
  HF_TOKEN          required, write token of the account hosting the eval.
  MODEL_NAME        required, full Hub repo id of the model to evaluate.
  ENV_BASE_URL      env Space URL
                    (default: https://agarwalanu3103-clarify-rl.hf.space).
  LIMIT             N scenarios to run (default 50).
  EVAL_LABEL        optional suffix for the output filename (default n{LIMIT}).
  PUSH_TO_REPO      where to upload eval JSON; defaults to MODEL_NAME.
  REPO_SPACE_ID     Space holding `inference.py` + `scripts/` + `scenarios/`.
                    Default: agarwalanu3103/clarify-rl.
  GPU_MEM_UTIL      vLLM gpu memory utilisation (default 0.85).
  MAX_MODEL_LEN     vLLM max model len (default 4096).
  VLLM_PORT         first port to try for the vLLM server (default 8000).
"""

from __future__ import annotations

import json
import os
import shutil
import socket
import subprocess
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path

# Optional dependency: when `truststore` is importable, call its
# `inject_into_ssl()` hook once at import time; when it is not installed the
# script carries on without it.
# NOTE(review): presumably this makes `ssl` use the OS trust store for the
# HTTPS calls below — confirm against the truststore docs.
try:
    import truststore  # type: ignore[import-not-found]

    truststore.inject_into_ssl()
except ImportError:
    # truststore is not installed; nothing to inject.
    pass


def _read_env() -> dict:
    """Build the job configuration from environment variables.

    Numeric settings are parsed first, then the two required variables are
    checked, and finally the derived defaults are filled in:
    ``PUSH_TO_REPO`` falls back to ``MODEL_NAME`` and ``EVAL_LABEL`` falls
    back to ``n{LIMIT}``.

    Returns:
        A dict keyed by the environment variable names, with ``LIMIT``,
        ``MAX_MODEL_LEN`` and ``VLLM_PORT`` as ints and ``GPU_MEM_UTIL`` as
        a float.

    Raises:
        ValueError: a numeric variable does not parse as int/float.
        SystemExit: ``HF_TOKEN`` or ``MODEL_NAME`` is unset or empty.
    """
    getenv = os.environ.get

    # Parse the numeric knobs up front so a malformed value surfaces before
    # the required-variable checks.
    limit = int(getenv("LIMIT", "50"))
    gpu_mem_util = float(getenv("GPU_MEM_UTIL", "0.85"))
    max_model_len = int(getenv("MAX_MODEL_LEN", "4096"))
    vllm_port = int(getenv("VLLM_PORT", "8000"))

    hf_token = getenv("HF_TOKEN")
    if not hf_token:
        raise SystemExit("HF_TOKEN is required (write token).")

    model_name = getenv("MODEL_NAME")
    if not model_name:
        raise SystemExit("MODEL_NAME is required (Hub repo id of the trained model).")

    return {
        "HF_TOKEN": hf_token,
        "MODEL_NAME": model_name,
        "ENV_BASE_URL": getenv("ENV_BASE_URL", "https://agarwalanu3103-clarify-rl.hf.space"),
        "LIMIT": limit,
        # An unset or empty label falls back to "n<LIMIT>".
        "EVAL_LABEL": getenv("EVAL_LABEL", "") or f"n{limit}",
        # An unset or empty push target means "upload next to the model".
        "PUSH_TO_REPO": getenv("PUSH_TO_REPO", "") or model_name,
        "REPO_SPACE_ID": getenv("REPO_SPACE_ID", "agarwalanu3103/clarify-rl"),
        "GPU_MEM_UTIL": gpu_mem_util,
        "MAX_MODEL_LEN": max_model_len,
        "VLLM_PORT": vllm_port,
    }


def _bootstrap_repo(space_id: str, token: str) -> Path:
    """Download a fresh snapshot of the project Space into /tmp/clarify-rl.

    Any existing checkout at that path is removed first. After the download
    the snapshot is checked for the files the eval needs.

    Args:
        space_id: Hub id of the Space to snapshot.
        token: Hub token passed to ``snapshot_download``.

    Returns:
        Path of the local checkout.

    Raises:
        FileNotFoundError: a required file is absent from the snapshot.
    """
    from huggingface_hub import snapshot_download

    checkout = Path("/tmp/clarify-rl")
    if checkout.exists():
        # Start from a clean directory rather than layering onto an old one.
        shutil.rmtree(checkout)

    print(f"[boot] downloading Space {space_id} → {checkout}", flush=True)
    snapshot_download(
        repo_id=space_id,
        repo_type="space",
        local_dir=str(checkout),
        token=token,
    )

    # Report the first required file that did not arrive, if any.
    required = ("inference.py", "scripts/run_eval.py", "scenarios/eval_held_out.json")
    missing = next((rel for rel in required if not (checkout / rel).exists()), None)
    if missing is not None:
        raise FileNotFoundError(f"Bootstrap failed — missing {missing} in Space {space_id}")

    top_level = sorted(entry.name for entry in checkout.iterdir())
    print(f"[boot] repo ready: {top_level}", flush=True)
    return checkout


def _free_port(start: int) -> int:
    """Return the first port in ``[start, start + 50)`` bindable on 127.0.0.1.

    Each candidate is probed by binding a throwaway TCP socket, which is
    closed again before returning.

    Raises:
        RuntimeError: none of the 50 candidate ports could be bound.
    """
    for candidate in range(start, start + 50):
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.bind(("127.0.0.1", candidate))
        except OSError:
            # Port is taken (or otherwise unbindable); try the next one.
            continue
        else:
            return candidate
        finally:
            probe.close()
    raise RuntimeError(f"No free port near {start}")


def _start_vllm(model_name: str, port: int, gpu_mem_util: float, max_len: int):
    """Launch the vLLM OpenAI-compatible API server as a child process.

    The child's stdout and stderr are both redirected to ``vllm_server.log``
    in the current working directory (truncated on each launch).

    Args:
        model_name: value passed to ``--model``.
        port: value passed to ``--port``.
        gpu_mem_util: value passed to ``--gpu-memory-utilization``.
        max_len: value passed to ``--max-model-len``.

    Returns:
        ``(proc, log_path)``: the ``subprocess.Popen`` handle and the path of
        the log file.
    """
    log_path = Path("vllm_server.log")
    cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        model_name,
        "--host",
        "0.0.0.0",
        "--port",
        str(port),
        "--gpu-memory-utilization",
        str(gpu_mem_util),
        "--max-model-len",
        str(max_len),
        "--dtype",
        "bfloat16",
        "--enforce-eager",
    ]
    print(f"[vllm] launching: {' '.join(cmd)}", flush=True)
    # The child inherits its own copy of the log descriptor when it is
    # spawned, so the parent's handle can be closed right after Popen returns
    # instead of being left for the garbage collector.
    with log_path.open("w") as log:
        proc = subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT)
    return proc, log_path


def _wait_for_vllm(port: int, timeout_s: float = 600) -> None:
    """Block until the local vLLM server answers ``GET /v1/models`` with 200.

    The endpoint on 127.0.0.1 is polled roughly every 5 seconds. Elapsed time
    is measured with the monotonic clock so wall-clock adjustments cannot
    shorten or extend the wait, and the final sleep is trimmed so the
    function does not overshoot the deadline.

    Args:
        port: port the server listens on.
        timeout_s: maximum number of seconds to wait.

    Raises:
        RuntimeError: the server did not return 200 within ``timeout_s``;
            the message carries the last error or HTTP status observed.
    """
    url = f"http://127.0.0.1:{port}/v1/models"
    print(f"[vllm] waiting for {url} (≤{timeout_s:.0f}s) ...", flush=True)
    started = time.monotonic()
    deadline = started + timeout_s
    poll_interval_s = 5
    last_err = ""
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    body = resp.read().decode()
                    print(
                        f"[vllm] ready after {time.monotonic() - started:.1f}s — {body[:200]}",
                        flush=True,
                    )
                    return
                # A non-200 success status is still "not ready"; remember it
                # so the timeout message is not empty.
                last_err = f"HTTP {resp.status}"
        except (urllib.error.URLError, ConnectionError, TimeoutError) as exc:
            # Expected while the server is still loading; keep polling.
            last_err = str(exc)
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(poll_interval_s, remaining))
    raise RuntimeError(f"vLLM did not start within {timeout_s}s. Last error: {last_err}")


def _run_eval(cfg: dict, repo: Path, port: int) -> Path:
    """Run ``scripts/run_eval.py --mode api`` from the checkout.

    The child process runs with ``repo`` as its working directory and gets a
    copy of the current environment with ``API_BASE_URL`` pointed at the
    server on 127.0.0.1:``port``, plus ``MODEL_NAME``, ``HF_TOKEN`` and
    ``ENV_BASE_URL`` taken from ``cfg``.

    Args:
        cfg: job configuration; reads ``MODEL_NAME``, ``EVAL_LABEL``,
            ``HF_TOKEN``, ``ENV_BASE_URL`` and ``LIMIT``.
        repo: root of the local project checkout.
        port: port of the local OpenAI-compatible server.

    Returns:
        Path passed to ``--out``:
        ``<repo>/outputs/eval_<model-name-lowercased>_<label>.json``.

    Raises:
        RuntimeError: ``run_eval.py`` exited with a non-zero status.
    """
    model_slug = Path(cfg["MODEL_NAME"]).name.lower()
    out_path = repo / "outputs" / f"eval_{model_slug}_{cfg['EVAL_LABEL']}.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    child_env = {
        **os.environ,
        "API_BASE_URL": f"http://127.0.0.1:{port}/v1",
        "MODEL_NAME": cfg["MODEL_NAME"],
        # vllm ignores this; the openai client wants a key
        "HF_TOKEN": cfg["HF_TOKEN"],
        "ENV_BASE_URL": cfg["ENV_BASE_URL"],
    }

    script = repo / "scripts" / "run_eval.py"
    cmd = [sys.executable, str(script)]
    cmd += ["--mode", "api"]
    cmd += ["--out", str(out_path)]
    cmd += ["--limit", str(cfg["LIMIT"])]
    print(f"[eval] running: {' '.join(cmd)}", flush=True)

    completed = subprocess.run(cmd, env=child_env, cwd=str(repo))
    if completed.returncode == 0:
        return out_path
    raise RuntimeError(f"run_eval.py exited with {completed.returncode}")


def _push_to_hub(cfg: dict, eval_json: Path) -> None:
    """Upload the eval results file into the ``evals/`` folder of a model repo.

    Args:
        cfg: job configuration; reads ``HF_TOKEN``, ``PUSH_TO_REPO`` and
            ``EVAL_LABEL`` (the label is used in the commit message).
        eval_json: local results file; its basename is kept in the repo.
    """
    from huggingface_hub import HfApi

    hub = HfApi(token=cfg["HF_TOKEN"])
    repo_id = cfg["PUSH_TO_REPO"]
    path_in_repo = f"evals/{eval_json.name}"

    print(f"[push] uploading {eval_json} → {repo_id}:{path_in_repo}", flush=True)
    hub.upload_file(
        path_or_fileobj=str(eval_json),
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="model",
        commit_message=f"eval: {cfg['EVAL_LABEL']}",
    )
    print(
        f"[push] done — see https://huggingface.co/{repo_id}/blob/main/{path_in_repo}",
        flush=True,
    )


def _print_log_tail(log_path: Path, max_lines: int = 80) -> None:
    """Print the last ``max_lines`` lines of ``log_path``; never raises on I/O errors."""
    try:
        # errors="replace" keeps stray undecodable bytes from hiding the log.
        tail = log_path.read_text(errors="replace").splitlines()[-max_lines:]
    except OSError as exc:
        print(f"[vllm] could not read {log_path}: {exc}", flush=True)
        return
    print("\n".join(tail), flush=True)


def _stop_server(proc: subprocess.Popen) -> None:
    """Terminate ``proc`` if it is still running, escalating to kill after 15s."""
    if proc.poll() is not None:
        return
    print("[vllm] terminating server ...", flush=True)
    proc.terminate()
    try:
        proc.wait(timeout=15)
    except subprocess.TimeoutExpired:
        proc.kill()
        # Reap the killed child so it does not linger as a zombie.
        proc.wait()


def main() -> None:
    """Run the whole job: bootstrap, serve, evaluate, upload, summarise.

    Steps, in order: read the configuration from the environment, snapshot
    the project Space, start the vLLM server on a free port, wait for it to
    become ready, run the eval script, upload the results JSON, and print
    its ``summary`` section. The server process is stopped on every exit
    path.

    Raises:
        Whatever the individual steps raise. A startup failure is re-raised
        after the tail of the server log has been printed. A failure while
        printing the summary is reported but not raised, because the results
        have already been uploaded at that point.
    """
    cfg = _read_env()
    banner = "=" * 70
    print(banner, flush=True)
    print(f"clarify-rl vllm eval | model={cfg['MODEL_NAME']} | n={cfg['LIMIT']}", flush=True)
    print(f"env={cfg['ENV_BASE_URL']}  push_to={cfg['PUSH_TO_REPO']}", flush=True)
    print(banner, flush=True)

    repo = _bootstrap_repo(cfg["REPO_SPACE_ID"], cfg["HF_TOKEN"])

    port = _free_port(cfg["VLLM_PORT"])
    proc, log_path = _start_vllm(cfg["MODEL_NAME"], port, cfg["GPU_MEM_UTIL"], cfg["MAX_MODEL_LEN"])
    try:
        try:
            _wait_for_vllm(port, timeout_s=600)
        except Exception:
            # Surface the server's own output before propagating the failure.
            print("[vllm] failed to start. Last 80 log lines:", flush=True)
            _print_log_tail(log_path, 80)
            raise

        eval_json = _run_eval(cfg, repo, port)
        _push_to_hub(cfg, eval_json)

        # The summary is a convenience only: report a problem, but do not
        # fail a job whose results are already uploaded.
        try:
            payload = json.loads(eval_json.read_text())
            print(json.dumps({"summary": payload.get("summary", {})}, indent=2), flush=True)
        except Exception as exc:
            print(f"[eval] could not print summary: {exc}", flush=True)
    finally:
        _stop_server(proc)


# Script entry point: run the eval job only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()