# clarify-rl/scripts/eval_with_vllm.py
# Uploaded by: agarwalanu3103
# Last commit message: "plots: add training progression + diagnostics, drop W&B links"
# Commit: 099bec8 (verified)
#!/usr/bin/env python
"""Run a vLLM-powered eval inside an HF Job and push results to the Hub.
Why this exists
---------------
HF Inference Router does NOT serve fine-tuned community uploads — only
provider-listed models (verified via `model_not_supported` 400 from the
router for our own GRPO checkpoint). So our trained Qwen3 GRPO models
must be evaluated via vLLM that we host ourselves. The cheapest /
cleanest path is one short HF Job per checkpoint that:
1. Bootstraps the project repo (this file's source repo) from the
public HF Space `agarwalanu3103/clarify-rl` so it has scenarios,
`run_eval.py`, and `inference.py` available locally.
2. Boots the vLLM OpenAI-compatible HTTP server as a child process, loading
the fine-tuned model from its Hub repo.
3. Connects to the env Space WS exactly like the submission validator.
4. Replays N eval scenarios via `scripts/run_eval.py --mode api`.
5. Pushes the results JSON to the model repo's `evals/` folder so the
submission/validator and `make_plots.py` can find it without us
shipping artifacts back to the laptop.
Usage (inside an HF Job, via scripts/launch_eval_job.sh):
HF_TOKEN=hf_xxx \\
MODEL_NAME=agarwalanu3103/clarify-rl-grpo-qwen3-0-6b \\
ENV_BASE_URL=https://agarwalanu3103-clarify-rl.hf.space \\
LIMIT=50 \\
python scripts/eval_with_vllm.py
Env vars consumed:
HF_TOKEN required, write token of the account hosting the eval.
MODEL_NAME required, full Hub repo id of the model to evaluate.
ENV_BASE_URL env Space URL (default: agarwalanu3103-clarify-rl).
LIMIT N scenarios to run (default 50).
EVAL_LABEL optional suffix for the output filename (default n{LIMIT}).
PUSH_TO_REPO where to upload eval JSON; defaults to MODEL_NAME.
REPO_SPACE_ID Space holding `inference.py` + `scripts/` + `scenarios/`.
Default: agarwalanu3103/clarify-rl.
GPU_MEM_UTIL vLLM gpu memory utilisation (default 0.85).
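MAX_MODEL_LEN vLLM max model len (default 4096).
VLLM_PORT first port to try for the vLLM server (default 8000); the first
free port in [VLLM_PORT, VLLM_PORT + 50) is used.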
"""
from __future__ import annotations
import json
import os
import shutil
import socket
import subprocess
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path
# Optional dependency: when the `truststore` package is importable, call its
# `inject_into_ssl()` hook at import time, before this script opens any
# connection. Presumably this makes TLS verification use the operating
# system's trust store — confirm against the truststore documentation.
try:
    import truststore  # type: ignore[import-not-found]

    truststore.inject_into_ssl()
except ImportError:
    # truststore is not installed; continue with Python's default TLS setup.
    pass
def _read_env() -> dict:
    """Collect the job configuration from environment variables.

    Returns:
        A dict with the keys ``HF_TOKEN``, ``MODEL_NAME``, ``ENV_BASE_URL``,
        ``LIMIT`` (int), ``EVAL_LABEL``, ``PUSH_TO_REPO``, ``REPO_SPACE_ID``,
        ``GPU_MEM_UTIL`` (float), ``MAX_MODEL_LEN`` (int) and ``VLLM_PORT``
        (int). ``PUSH_TO_REPO`` falls back to ``MODEL_NAME`` and
        ``EVAL_LABEL`` falls back to ``n<LIMIT>`` when unset or empty.

    Raises:
        SystemExit: if ``HF_TOKEN`` or ``MODEL_NAME`` is unset or empty.
        ValueError: if a numeric variable cannot be parsed (raised by
            ``int`` / ``float`` before the required-variable checks run).
    """
    getenv = os.environ.get

    # Keys are inserted in a fixed order; numeric values are parsed eagerly.
    settings: dict = {}
    settings["HF_TOKEN"] = getenv("HF_TOKEN")
    settings["MODEL_NAME"] = getenv("MODEL_NAME")
    settings["ENV_BASE_URL"] = getenv(
        "ENV_BASE_URL", "https://agarwalanu3103-clarify-rl.hf.space"
    )
    settings["LIMIT"] = int(getenv("LIMIT", "50"))
    settings["EVAL_LABEL"] = getenv("EVAL_LABEL", "")
    settings["PUSH_TO_REPO"] = getenv("PUSH_TO_REPO", "")
    settings["REPO_SPACE_ID"] = getenv("REPO_SPACE_ID", "agarwalanu3103/clarify-rl")
    settings["GPU_MEM_UTIL"] = float(getenv("GPU_MEM_UTIL", "0.85"))
    settings["MAX_MODEL_LEN"] = int(getenv("MAX_MODEL_LEN", "4096"))
    settings["VLLM_PORT"] = int(getenv("VLLM_PORT", "8000"))

    # Mandatory variables, checked in this order, each with its exit message.
    mandatory = (
        ("HF_TOKEN", "HF_TOKEN is required (write token)."),
        ("MODEL_NAME", "MODEL_NAME is required (Hub repo id of the trained model)."),
    )
    for key, message in mandatory:
        if not settings[key]:
            raise SystemExit(message)

    # Derived defaults for values that were left unset or empty.
    settings["PUSH_TO_REPO"] = settings["PUSH_TO_REPO"] or settings["MODEL_NAME"]
    settings["EVAL_LABEL"] = settings["EVAL_LABEL"] or f"n{settings['LIMIT']}"
    return settings
def _bootstrap_repo(space_id: str, token: str) -> Path:
    """Snapshot the project Space so this job has run_eval.py + scenarios.

    Any existing ``/tmp/clarify-rl`` directory is removed first, then the
    Space is downloaded into it with ``huggingface_hub.snapshot_download``.

    Args:
        space_id: Hub id of the Space to download.
        token: Hugging Face token passed to ``snapshot_download``.

    Returns:
        Path of the local snapshot directory (``/tmp/clarify-rl``).

    Raises:
        FileNotFoundError: if the snapshot lacks ``inference.py``,
            ``scripts/run_eval.py`` or ``scenarios/eval_held_out.json``.
    """
    # Imported lazily so the module can be loaded without huggingface_hub.
    from huggingface_hub import snapshot_download

    target = Path("/tmp/clarify-rl")
    if target.exists():
        # Start from a clean directory rather than layering onto an old one.
        shutil.rmtree(target)

    # The source and destination were previously printed back-to-back with no
    # separator, which produced an unreadable fused string in the job log.
    print(f"[boot] downloading Space {space_id} → {target}", flush=True)
    snapshot_download(
        repo_id=space_id,
        repo_type="space",
        local_dir=str(target),
        token=token,
    )

    # Verify expected files exist.
    must_have = ["inference.py", "scripts/run_eval.py", "scenarios/eval_held_out.json"]
    for rel in must_have:
        if not (target / rel).exists():
            raise FileNotFoundError(f"Bootstrap failed — missing {rel} in Space {space_id}")

    print(f"[boot] repo ready: {sorted(p.name for p in target.iterdir())}", flush=True)
    return target
def _free_port(start: int) -> int:
    """Return the first port in ``[start, start + 50)`` that can be bound.

    Each candidate is probed by binding a throwaway TCP socket to
    ``127.0.0.1``; the socket is closed again before returning, so the port
    is only known to have been free at the time of the probe.

    Raises:
        RuntimeError: if none of the 50 candidate ports could be bound.
    """
    for candidate in range(start, start + 50):
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.bind(("127.0.0.1", candidate))
        except OSError:
            # Port is taken (or otherwise unusable); try the next one.
            continue
        else:
            return candidate
        finally:
            probe.close()
    raise RuntimeError(f"No free port near {start}")
def _start_vllm(
    model_name: str, port: int, gpu_mem_util: float, max_len: int
) -> tuple[subprocess.Popen, Path]:
    """Launch the vLLM OpenAI-compatible API server as a child process.

    The server is started with the current interpreter via
    ``python -m vllm.entrypoints.openai.api_server``; its stdout and stderr
    are both written to ``vllm_server.log`` in the current working directory
    (the file is truncated on each launch).

    Args:
        model_name: value passed to ``--model``.
        port: value passed to ``--port``.
        gpu_mem_util: value passed to ``--gpu-memory-utilization``.
        max_len: value passed to ``--max-model-len``.

    Returns:
        ``(proc, log_path)`` — the running ``subprocess.Popen`` handle and
        the path of the log file. The caller is responsible for stopping
        the process.
    """
    log_path = Path("vllm_server.log")
    cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        model_name,
        "--host",
        "0.0.0.0",
        "--port",
        str(port),
        "--gpu-memory-utilization",
        str(gpu_mem_util),
        "--max-model-len",
        str(max_len),
        "--dtype",
        "bfloat16",
        "--enforce-eager",
    ]
    print(f"[vllm] launching: {' '.join(cmd)}", flush=True)

    # The child receives its own copy of the log file descriptor when it is
    # spawned, so the parent's handle can (and should) be closed right away
    # instead of staying open for the rest of the job. The `with` block also
    # closes it if Popen itself fails.
    with log_path.open("w") as log:
        proc = subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT)
    return proc, log_path
def _wait_for_vllm(port: int, timeout_s: float = 600) -> None:
    """Block until the local vLLM server answers ``GET /v1/models`` with 200.

    The endpoint ``http://127.0.0.1:<port>/v1/models`` is polled every five
    seconds (each request has its own 5 s timeout) until it responds with
    HTTP 200 or ``timeout_s`` seconds have elapsed.

    Args:
        port: local port the server was asked to listen on.
        timeout_s: overall time budget in seconds.

    Raises:
        RuntimeError: if no 200 response was seen within ``timeout_s``; the
            message includes the most recent failure.
    """
    # Local import: http.client is only needed for its exception base class.
    import http.client

    url = f"http://127.0.0.1:{port}/v1/models"
    print(f"[vllm] waiting for {url} (≤{timeout_s:.0f}s) ...", flush=True)

    # Monotonic clock: the deadline must not move if the wall clock is
    # adjusted while the job is waiting.
    started = time.monotonic()
    last_err = ""
    while time.monotonic() - started < timeout_s:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    # Body is only echoed to the log, so never fail on bad bytes.
                    body = resp.read().decode(errors="replace")
                    print(
                        f"[vllm] ready after {time.monotonic() - started:.1f}s — {body[:200]}",
                        flush=True,
                    )
                    return
                last_err = f"HTTP {resp.status}"
        except (OSError, http.client.HTTPException) as exc:
            # OSError covers URLError/HTTPError, ConnectionError, TimeoutError
            # and socket timeouts; HTTPException covers malformed or truncated
            # responses from a server that is still starting up. All of these
            # mean "not ready yet", so keep polling.
            last_err = str(exc)
        time.sleep(5)
    raise RuntimeError(f"vLLM did not start within {timeout_s}s. Last error: {last_err}")
def _run_eval(cfg: dict, repo: Path, port: int) -> Path:
    """Run ``scripts/run_eval.py --mode api`` against the local server.

    The eval script is executed with the current interpreter, with ``repo``
    as its working directory and a copy of this process's environment in
    which ``API_BASE_URL``, ``MODEL_NAME``, ``HF_TOKEN`` and ``ENV_BASE_URL``
    are overridden.

    Args:
        cfg: configuration dict; reads ``MODEL_NAME``, ``EVAL_LABEL``,
            ``HF_TOKEN``, ``ENV_BASE_URL`` and ``LIMIT``.
        repo: root of the bootstrapped project checkout.
        port: local port of the OpenAI-compatible server.

    Returns:
        The path passed to the script via ``--out``:
        ``<repo>/outputs/eval_<model-name-lowercased>_<label>.json``.

    Raises:
        RuntimeError: if the eval script exits with a non-zero status.
    """
    # Only the last path component of the repo id goes into the filename.
    model_slug = Path(cfg["MODEL_NAME"]).name.lower()
    results_dir = repo / "outputs"
    out_path = results_dir / f"eval_{model_slug}_{cfg['EVAL_LABEL']}.json"
    results_dir.mkdir(parents=True, exist_ok=True)

    child_env = {
        **os.environ,
        "API_BASE_URL": f"http://127.0.0.1:{port}/v1",
        "MODEL_NAME": cfg["MODEL_NAME"],
        # vllm ignores this; openai client wants a key
        "HF_TOKEN": cfg["HF_TOKEN"],
        "ENV_BASE_URL": cfg["ENV_BASE_URL"],
    }

    argv = [sys.executable, str(repo / "scripts" / "run_eval.py")]
    argv += ["--mode", "api"]
    argv += ["--out", str(out_path)]
    argv += ["--limit", str(cfg["LIMIT"])]

    print(f"[eval] running: {' '.join(argv)}", flush=True)
    exit_code = subprocess.run(argv, env=child_env, cwd=str(repo)).returncode
    if exit_code != 0:
        raise RuntimeError(f"run_eval.py exited with {exit_code}")
    return out_path
def _push_to_hub(cfg: dict, eval_json: Path) -> None:
    """Upload the eval results file to ``evals/`` in the target model repo.

    Args:
        cfg: configuration dict; reads ``HF_TOKEN`` (used to authenticate),
            ``PUSH_TO_REPO`` (destination model repo id) and ``EVAL_LABEL``
            (used in the commit message).
        eval_json: local results file; it is uploaded under the same
            filename as ``evals/<name>``.
    """
    # Imported lazily so the module can be loaded without huggingface_hub.
    from huggingface_hub import HfApi

    api = HfApi(token=cfg["HF_TOKEN"])
    target = f"evals/{eval_json.name}"

    # The local path and the repo id were previously printed back-to-back
    # with no separator, which fused them into one unreadable string.
    print(f"[push] uploading {eval_json} → {cfg['PUSH_TO_REPO']}:{target}", flush=True)
    api.upload_file(
        path_or_fileobj=str(eval_json),
        path_in_repo=target,
        repo_id=cfg["PUSH_TO_REPO"],
        repo_type="model",
        commit_message=f"eval: {cfg['EVAL_LABEL']}",
    )
    print(f"[push] done — see https://huggingface.co/{cfg['PUSH_TO_REPO']}/blob/main/{target}", flush=True)
def main() -> None:
    """Run the whole job: bootstrap, start vLLM, evaluate, upload, clean up.

    Steps, in order: read the configuration from the environment, download
    the project Space, pick a free port, launch the vLLM server, wait for it
    to become ready, run the eval script, upload the results file, and print
    the ``summary`` section of the results. The vLLM process is always
    stopped on the way out, whether or not an earlier step failed.

    Raises:
        Whatever the individual steps raise; a server start-up failure is
        re-raised after the tail of the server log has been printed.
    """
    cfg = _read_env()
    print("=" * 70, flush=True)
    print(f"clarify-rl vllm eval | model={cfg['MODEL_NAME']} | n={cfg['LIMIT']}", flush=True)
    print(f"env={cfg['ENV_BASE_URL']} push_to={cfg['PUSH_TO_REPO']}", flush=True)
    print("=" * 70, flush=True)

    repo = _bootstrap_repo(cfg["REPO_SPACE_ID"], cfg["HF_TOKEN"])
    port = _free_port(cfg["VLLM_PORT"])
    proc, log_path = _start_vllm(cfg["MODEL_NAME"], port, cfg["GPU_MEM_UTIL"], cfg["MAX_MODEL_LEN"])
    try:
        try:
            _wait_for_vllm(port, timeout_s=600)
        except Exception:
            print("[vllm] failed to start. Last 80 log lines:", flush=True)
            try:
                tail = log_path.read_text(errors="replace").splitlines()[-80:]
                print("\n".join(tail), flush=True)
            except Exception as log_err:
                # Best effort only: say why the log is missing instead of
                # hiding it, then fall through to re-raise the real failure.
                print(f"[vllm] could not read {log_path}: {log_err}", flush=True)
            raise

        eval_json = _run_eval(cfg, repo, port)
        _push_to_hub(cfg, eval_json)

        # The upload already succeeded; echoing the summary is a convenience
        # and must never fail the job.
        try:
            payload = json.loads(eval_json.read_text())
            print(json.dumps({"summary": payload.get("summary", {})}, indent=2), flush=True)
        except Exception as summary_err:
            print(f"[eval] could not print summary from {eval_json}: {summary_err}", flush=True)
    finally:
        if proc.poll() is None:
            print("[vllm] terminating server ...", flush=True)
            proc.terminate()
            try:
                proc.wait(timeout=15)
            except subprocess.TimeoutExpired:
                # Graceful shutdown timed out: force-kill, then reap the
                # child so it is not left behind as a zombie.
                proc.kill()
                proc.wait()
# Script entry point: run the job only when executed directly, not on import.
if __name__ == "__main__":
    main()