Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| """Run a vLLM-powered eval inside an HF Job and push results to the Hub. | |
| Why this exists | |
| --------------- | |
| HF Inference Router does NOT serve fine-tuned community uploads — only | |
| provider-listed models (verified via `model_not_supported` 400 from the | |
| router for our own GRPO checkpoint). So our trained Qwen3 GRPO models | |
| must be evaluated via vLLM that we host ourselves. The cheapest / | |
| cleanest path is one short HF Job per checkpoint that: | |
| 1. Bootstraps the project repo (this file's source repo) from the | |
| public HF Space `agarwalanu3103/clarify-rl` so it has scenarios, | |
| `run_eval.py`, and `inference.py` available locally. | |
| 2. Boots the vLLM OpenAI-compatible HTTP server as a child process, | |
| loading the fine-tuned model from its Hub repo. | |
| 3. Connects to the env Space WS exactly like the submission validator. | |
| 4. Replays N eval scenarios via `scripts/run_eval.py --mode api`. | |
| 5. Pushes the results JSON to the model repo's `evals/` folder so the | |
| submission/validator and `make_plots.py` can find it without us | |
| shipping artifacts back to the laptop. | |
| Usage (inside an HF Job, via scripts/launch_eval_job.sh): | |
| HF_TOKEN=hf_xxx \\ | |
| MODEL_NAME=agarwalanu3103/clarify-rl-grpo-qwen3-0-6b \\ | |
| ENV_BASE_URL=https://agarwalanu3103-clarify-rl.hf.space \\ | |
| LIMIT=50 \\ | |
| python scripts/eval_with_vllm.py | |
| Env vars consumed: | |
| HF_TOKEN required, write token of the account hosting the eval. | |
| MODEL_NAME required, full Hub repo id of the model to evaluate. | |
| ENV_BASE_URL env Space URL (default: https://agarwalanu3103-clarify-rl.hf.space). | |
| LIMIT N scenarios to run (default 50). | |
| EVAL_LABEL optional suffix for the output filename (default n{LIMIT}). | |
| PUSH_TO_REPO where to upload eval JSON; defaults to MODEL_NAME. | |
| REPO_SPACE_ID Space holding `inference.py` + `scripts/` + `scenarios/`. | |
| Default: agarwalanu3103/clarify-rl. | |
| GPU_MEM_UTIL vLLM gpu memory utilisation (default 0.85). | |
| MAX_MODEL_LEN vLLM max model len (default 4096). | |
| VLLM_PORT first local port to try for the vLLM server (default 8000). | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import shutil | |
| import socket | |
| import subprocess | |
| import sys | |
| import time | |
| import urllib.error | |
| import urllib.request | |
| from pathlib import Path | |
# `truststore` is an optional dependency. When it is installed, let it patch
# the standard `ssl` module before any HTTPS traffic happens; when it is not,
# continue with Python's default SSL configuration.
try:
    import truststore  # type: ignore[import-not-found]

    truststore.inject_into_ssl()
except ImportError:
    pass
| def _read_env() -> dict: | |
| cfg = { | |
| "HF_TOKEN": os.environ.get("HF_TOKEN"), | |
| "MODEL_NAME": os.environ.get("MODEL_NAME"), | |
| "ENV_BASE_URL": os.environ.get( | |
| "ENV_BASE_URL", "https://agarwalanu3103-clarify-rl.hf.space" | |
| ), | |
| "LIMIT": int(os.environ.get("LIMIT", "50")), | |
| "EVAL_LABEL": os.environ.get("EVAL_LABEL", ""), | |
| "PUSH_TO_REPO": os.environ.get("PUSH_TO_REPO", ""), | |
| "REPO_SPACE_ID": os.environ.get("REPO_SPACE_ID", "agarwalanu3103/clarify-rl"), | |
| "GPU_MEM_UTIL": float(os.environ.get("GPU_MEM_UTIL", "0.85")), | |
| "MAX_MODEL_LEN": int(os.environ.get("MAX_MODEL_LEN", "4096")), | |
| "VLLM_PORT": int(os.environ.get("VLLM_PORT", "8000")), | |
| } | |
| if not cfg["HF_TOKEN"]: | |
| raise SystemExit("HF_TOKEN is required (write token).") | |
| if not cfg["MODEL_NAME"]: | |
| raise SystemExit("MODEL_NAME is required (Hub repo id of the trained model).") | |
| if not cfg["PUSH_TO_REPO"]: | |
| cfg["PUSH_TO_REPO"] = cfg["MODEL_NAME"] | |
| if not cfg["EVAL_LABEL"]: | |
| cfg["EVAL_LABEL"] = f"n{cfg['LIMIT']}" | |
| return cfg | |
def _bootstrap_repo(space_id: str, token: str) -> Path:
    """Download a fresh snapshot of the project Space for this job.

    Any existing ``/tmp/clarify-rl`` directory is removed first, then the
    Space is downloaded there and checked for the files the eval needs.

    Args:
        space_id: Hub id of the Space to download.
        token: Hugging Face token passed to ``snapshot_download``.

    Returns:
        Path of the local snapshot directory.

    Raises:
        FileNotFoundError: if one of the required files is absent after the
            download.
    """
    from huggingface_hub import snapshot_download

    checkout = Path("/tmp/clarify-rl")
    if checkout.exists():
        # Never reuse a previous download.
        shutil.rmtree(checkout)

    print(f"[boot] downloading Space {space_id} → {checkout}", flush=True)
    snapshot_download(
        repo_id=space_id,
        repo_type="space",
        local_dir=str(checkout),
        token=token,
    )

    # Report the first required file that is missing, in this order.
    required = ("inference.py", "scripts/run_eval.py", "scenarios/eval_held_out.json")
    missing = next((rel for rel in required if not (checkout / rel).exists()), None)
    if missing is not None:
        raise FileNotFoundError(f"Bootstrap failed — missing {missing} in Space {space_id}")

    top_level = sorted(entry.name for entry in checkout.iterdir())
    print(f"[boot] repo ready: {top_level}", flush=True)
    return checkout
| def _free_port(start: int) -> int: | |
| p = start | |
| while p < start + 50: | |
| with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: | |
| try: | |
| s.bind(("127.0.0.1", p)) | |
| return p | |
| except OSError: | |
| p += 1 | |
| raise RuntimeError(f"No free port near {start}") | |
| def _start_vllm(model_name: str, port: int, gpu_mem_util: float, max_len: int): | |
| log_path = Path("vllm_server.log") | |
| log = log_path.open("w") | |
| cmd = [ | |
| sys.executable, | |
| "-m", | |
| "vllm.entrypoints.openai.api_server", | |
| "--model", | |
| model_name, | |
| "--host", | |
| "0.0.0.0", | |
| "--port", | |
| str(port), | |
| "--gpu-memory-utilization", | |
| str(gpu_mem_util), | |
| "--max-model-len", | |
| str(max_len), | |
| "--dtype", | |
| "bfloat16", | |
| "--enforce-eager", | |
| ] | |
| print(f"[vllm] launching: {' '.join(cmd)}", flush=True) | |
| proc = subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT) | |
| return proc, log_path | |
| def _wait_for_vllm(port: int, timeout_s: float = 600) -> None: | |
| url = f"http://127.0.0.1:{port}/v1/models" | |
| print(f"[vllm] waiting for {url} (≤{timeout_s:.0f}s) ...", flush=True) | |
| t0 = time.time() | |
| last_err = "" | |
| while time.time() - t0 < timeout_s: | |
| try: | |
| with urllib.request.urlopen(url, timeout=5) as resp: | |
| if resp.status == 200: | |
| body = resp.read().decode() | |
| print(f"[vllm] ready after {time.time() - t0:.1f}s — {body[:200]}", flush=True) | |
| return | |
| except (urllib.error.URLError, ConnectionError, TimeoutError) as exc: | |
| last_err = str(exc) | |
| time.sleep(5) | |
| raise RuntimeError(f"vLLM did not start within {timeout_s}s. Last error: {last_err}") | |
| def _run_eval(cfg: dict, repo: Path, port: int) -> Path: | |
| out_path = repo / "outputs" / f"eval_{Path(cfg['MODEL_NAME']).name.lower()}_{cfg['EVAL_LABEL']}.json" | |
| out_path.parent.mkdir(parents=True, exist_ok=True) | |
| env = os.environ.copy() | |
| env["API_BASE_URL"] = f"http://127.0.0.1:{port}/v1" | |
| env["MODEL_NAME"] = cfg["MODEL_NAME"] | |
| env["HF_TOKEN"] = cfg["HF_TOKEN"] # vllm ignores this; openai client wants a key | |
| env["ENV_BASE_URL"] = cfg["ENV_BASE_URL"] | |
| cmd = [ | |
| sys.executable, | |
| str(repo / "scripts" / "run_eval.py"), | |
| "--mode", | |
| "api", | |
| "--out", | |
| str(out_path), | |
| "--limit", | |
| str(cfg["LIMIT"]), | |
| ] | |
| print(f"[eval] running: {' '.join(cmd)}", flush=True) | |
| res = subprocess.run(cmd, env=env, cwd=str(repo)) | |
| if res.returncode != 0: | |
| raise RuntimeError(f"run_eval.py exited with {res.returncode}") | |
| return out_path | |
def _push_to_hub(cfg: dict, eval_json: Path) -> None:
    """Upload the eval results file to ``evals/`` in a Hub model repo.

    Args:
        cfg: job configuration; reads ``HF_TOKEN``, ``PUSH_TO_REPO`` and
            ``EVAL_LABEL`` (the label becomes the commit message).
        eval_json: local results file; its file name is kept on the Hub.
    """
    from huggingface_hub import HfApi

    api = HfApi(token=cfg["HF_TOKEN"])
    remote_path = f"evals/{eval_json.name}"
    print(
        f"[push] uploading {eval_json} → {cfg['PUSH_TO_REPO']}:{remote_path}",
        flush=True,
    )

    upload_args = {
        "path_or_fileobj": str(eval_json),
        "path_in_repo": remote_path,
        "repo_id": cfg["PUSH_TO_REPO"],
        "repo_type": "model",
        "commit_message": f"eval: {cfg['EVAL_LABEL']}",
    }
    api.upload_file(**upload_args)

    print(
        f"[push] done — see https://huggingface.co/{cfg['PUSH_TO_REPO']}/blob/main/{remote_path}",
        flush=True,
    )
def main() -> None:
    """Run the whole eval job: bootstrap, serve, evaluate, upload, clean up.

    Steps, in order: read configuration from the environment, download the
    project Space, pick a free port, start vLLM, wait for it to answer, run
    the eval script, upload the results file, and print the results'
    ``summary`` section. The vLLM process is stopped on the way out whether or
    not the steps succeeded.

    Raises:
        Whatever the individual steps raise. A vLLM startup failure is
        re-raised after the tail of the server log has been printed.
    """
    cfg = _read_env()
    banner = "=" * 70
    print(banner, flush=True)
    print(f"clarify-rl vllm eval | model={cfg['MODEL_NAME']} | n={cfg['LIMIT']}", flush=True)
    print(f"env={cfg['ENV_BASE_URL']} push_to={cfg['PUSH_TO_REPO']}", flush=True)
    print(banner, flush=True)

    repo = _bootstrap_repo(cfg["REPO_SPACE_ID"], cfg["HF_TOKEN"])
    port = _free_port(cfg["VLLM_PORT"])
    proc, log_path = _start_vllm(
        cfg["MODEL_NAME"], port, cfg["GPU_MEM_UTIL"], cfg["MAX_MODEL_LEN"]
    )
    try:
        try:
            _wait_for_vllm(port, timeout_s=600)
        except Exception:
            print("[vllm] failed to start. Last 80 log lines:", flush=True)
            # Best-effort diagnostics: never let a log-reading problem mask
            # the startup failure itself, but do say why the tail is missing.
            try:
                tail = log_path.read_text().splitlines()[-80:]
                print("\n".join(tail), flush=True)
            except Exception as exc:
                print(f"[vllm] could not read {log_path}: {exc}", flush=True)
            raise

        eval_json = _run_eval(cfg, repo, port)
        _push_to_hub(cfg, eval_json)

        # The results are already uploaded at this point, so a problem while
        # echoing the summary is reported but does not fail the job.
        try:
            payload = json.loads(eval_json.read_text())
            print(json.dumps({"summary": payload.get("summary", {})}, indent=2), flush=True)
        except Exception as exc:
            print(f"[eval] could not print summary from {eval_json}: {exc}", flush=True)
    finally:
        if proc.poll() is None:
            print("[vllm] terminating server ...", flush=True)
            proc.terminate()
            try:
                proc.wait(timeout=15)
            except subprocess.TimeoutExpired:
                proc.kill()
                # Reap the killed child so it does not linger as a zombie;
                # keep the wait bounded so cleanup can never hang the job.
                try:
                    proc.wait(timeout=15)
                except subprocess.TimeoutExpired:
                    print("[vllm] server still running after kill(); giving up", flush=True)


if __name__ == "__main__":
    main()