#!/usr/bin/env python3
"""
eval_local.py — run the spec_rl RL-eval loop OFFLINE against the local stub.

Purpose
-------
Prove the *shape* of the RL evaluation loop with NO Prime Intellect credits and
NO GPU: drive the spec_rl HumanEval code task's rollouts against the local,
stdlib OpenAI-compatible stub (scripts/stub_server.py) and compute the SAME
reward the verifiers environment computes (`@vf.reward code_reward`) — run the
model's candidate code against the problem's unit tests and return the FRACTION
of assertions that pass (dense RL signal; the pass@1 eval stays binary).

At the venue the same loop points at the DFlash-speculated vLLM endpoint instead
of the stub. Because greedy speculative decoding is lossless, the reward curve
is identical; only the cost per rollout drops. This script lets us validate the
loop end-to-end before any credits are spent.

Reward logic is NOT reimplemented here — it is imported verbatim from
`environments/spec_rl/spec_rl.py` (`fraction_passing`, `passes`, `STOP`,
`load_problems`), so what runs locally is byte-identical to what the verifiers
env scores at the venue.

Two execution paths (auto-selected, reported in the output)
-----------------------------------------------------------
1. "verifiers" — if `verifiers` imports AND `spec_rl.load_environment()`
   constructs cleanly AND the endpoint exposes /v1/chat/completions AND the
   `openai` client is importable, drive the real
   `vf.SingleTurnEnv.evaluate(...)`. This is the true RL-eval API. If the
   evaluate call itself fails or yields no rewards, the script falls back to
   path 2.
2. "manual" — otherwise, a minimal hand-rolled rollout loop: build the same
   chat prompt, call the endpoint, trim at STOP, score with
   `spec_rl.fraction_passing`. This is the path that actually runs against the
   canned-completion stub (which serves only /v1/completions), and it is
   reported as such.

Note on the stub: it returns a fixed canned completion for EVERY prompt, so the
real HumanEval tests will almost always fail (reward 0.0). That is expected and
correct — the point here is to prove the loop runs end-to-end offline without
erroring, not to produce a real pass@1. Real rewards come from Laguna at the
venue.

SAFETY: scoring executes model-generated code in a timed subprocess (see
spec_rl.passes). Locally the "code" is the stub's harmless canned snippet. Run
RL rollouts only in the disposable venue sandbox, never against real data.

Usage
-----
    # start a stub first:  make stub    (baseline, :8000)
    #                 or:  make stub-b  (dflash, :8001)
    python scripts/eval_local.py --base-url http://localhost:8000 --model laguna --n 5
"""
from __future__ import annotations

import argparse
import json
import os
import sys
import urllib.error
import urllib.request
from pathlib import Path

# ---------------------------------------------------------------------------
# Import the spec_rl env module so reward logic is shared, not duplicated. The
# env lives in a sibling tree (environments/spec_rl/spec_rl.py); add it to the
# path. spec_rl is import-safe even when `verifiers` is absent (its vf import is
# guarded), so this works on the Mac with no GPU and no verifiers.
# ---------------------------------------------------------------------------
_HERE = Path(__file__).resolve()
_REPO = _HERE.parents[1]  # .../laguna-hack
_GPU_HW = _HERE.parents[2]  # .../gpu_and_inference_hw
_SPEC_RL_DIR = _GPU_HW / "environments" / "spec_rl"
if str(_SPEC_RL_DIR) not in sys.path:
    sys.path.insert(0, str(_SPEC_RL_DIR))

import spec_rl  # noqa: E402 — shared reward core (passes, STOP, load_problems, ...)

DEFAULT_OUT = _REPO / "results" / "eval_local.json"

# System prompt mirrors spec_rl.load_environment so the manual loop sends the
# exact same instruction the verifiers env would.
SYSTEM_PROMPT = (
    "You are an expert Python programmer. You will be given a function "
    "signature and docstring. Complete the function body only. Do not repeat "
    "the signature, do not add explanations, and do not wrap the code in "
    "markdown fences. Output only the indented function body."
)


# ---------------------------------------------------------------------------
# Endpoint helpers (stdlib urllib only — matches the rest of the harness).
# ---------------------------------------------------------------------------
def _post_json(url: str, payload: dict, timeout: int = 600) -> dict:
    """POST *payload* as a JSON body to *url* and return the decoded JSON reply.

    Args:
        url: Full endpoint URL.
        payload: JSON-serialisable request body.
        timeout: Socket timeout in seconds, passed to ``urlopen``.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The server could not be reached.
        json.JSONDecodeError: The response body was not valid JSON.
    """
    data = json.dumps(payload).encode()
    req = urllib.request.Request(
        url, data=data, headers={"Content-Type": "application/json"}
    )
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return json.loads(r.read().decode())


def _endpoint_has_chat(base_url: str, model: str = "probe") -> bool:
    """True if the endpoint answers /v1/chat/completions (vLLM does; stub does not).

    Sends a one-token greedy probe request and classifies the outcome.

    Args:
        base_url: Endpoint root, with or without a trailing slash.
        model: Model name to put in the probe. Callers should pass the model
            they actually intend to use: vLLM's OpenAI-compatible server
            answers an *unknown model name* with HTTP 404, which is
            indistinguishable (by status code) from a missing route. The
            default ``"probe"`` is kept only for backward compatibility.

    Returns:
        ``True`` when the route exists, ``False`` on a 404 or when the endpoint
        cannot be reached at all (connection error, timeout, bad JSON, ...).
    """
    url = base_url.rstrip("/") + "/v1/chat/completions"
    probe = {
        "model": model,
        "messages": [{"role": "user", "content": "ping"}],
        "max_tokens": 1,
        "temperature": 0.0,
    }
    try:
        _post_json(url, probe, timeout=10)
        return True
    except urllib.error.HTTPError as e:
        # 4xx/5xx still means the route exists and parsed our body; only a
        # 404 means "no chat endpoint here" (the stub returns 404 for it).
        return e.code != 404
    except Exception:
        # Deliberate best-effort: any transport-level failure is treated as
        # "no chat endpoint" so the caller can fall back.
        return False


def complete_chat(base_url: str, model: str, user_content: str, max_tokens: int) -> str:
    """Greedy chat completion (Laguna/vLLM path).

    Sends ``SYSTEM_PROMPT`` plus *user_content* to /v1/chat/completions with
    ``temperature=0.0`` and ``spec_rl.STOP`` as stop sequences.

    Returns:
        The first choice's message content, or ``""`` when it is empty/None.
    """
    url = base_url.rstrip("/") + "/v1/chat/completions"
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,  # greedy => deterministic => lossless-comparable
        "stop": spec_rl.STOP,
    }
    obj = _post_json(url, payload)
    return obj["choices"][0]["message"]["content"] or ""


def complete_text(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Greedy text completion (the stub path; also valid for vLLM completions).

    Sends *prompt* to /v1/completions with ``temperature=0.0`` and
    ``spec_rl.STOP`` as stop sequences.

    Returns:
        The first choice's text, or ``""`` when it is empty/None.
    """
    url = base_url.rstrip("/") + "/v1/completions"
    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stop": spec_rl.STOP,
    }
    obj = _post_json(url, payload)
    return obj["choices"][0]["text"] or ""


def _trim_at_stop(text: str) -> str:
    """Cut *text* at the earliest occurrence of any ``spec_rl.STOP`` sequence.

    Each truncation only shortens the text, and later stop sequences are
    searched in the already-shortened text, so the result ends at the smallest
    index at which any stop sequence begins. Text without a stop sequence is
    returned unchanged.
    """
    for stop in spec_rl.STOP:
        idx = text.find(stop)
        if idx != -1:
            text = text[:idx]
    return text


# ---------------------------------------------------------------------------
# Path 1 — drive the real verifiers env, if (and only if) it constructs cleanly
# AND the endpoint speaks chat. Returns a results dict, or None to fall back.
# ---------------------------------------------------------------------------
def try_verifiers(base_url: str, model: str, n: int) -> dict | None:
    """Run the eval through the real verifiers environment when possible.

    Args:
        base_url: OpenAI-compatible endpoint root.
        model: Model name to evaluate.
        n: Number of examples to evaluate.

    Returns:
        ``{"driver": "verifiers", "mean_reward": float, "per_example": [...]}``
        on success, or ``None`` when any prerequisite is missing or the
        evaluation fails, in which case the caller should use the manual loop.
    """
    try:
        import verifiers as vf  # noqa: F401
    except Exception:
        return None

    # load_environment() builds a vf.SingleTurnEnv. In some verifiers versions
    # the symbols spec_rl references (e.g. vf.Dataset) may not exist; guard the
    # whole construction so a mismatch falls back to the manual loop instead of
    # crashing the eval.
    try:
        env = spec_rl.load_environment(num_examples=n)
    except Exception as e:  # AttributeError/ImportError/etc. -> manual fallback
        print(f"[eval_local] verifiers env did not construct ({type(e).__name__}: {e});"
              " falling back to manual rollout loop.")
        return None

    # Probe with the real model name: an unknown model would itself yield a 404.
    if not _endpoint_has_chat(base_url, model):
        print("[eval_local] endpoint has no /v1/chat/completions (the local stub "
              "serves only /v1/completions); using manual rollout loop instead.")
        return None

    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        print("[eval_local] openai client not available; using manual rollout loop.")
        return None

    client = OpenAI(base_url=base_url.rstrip("/") + "/v1", api_key="EMPTY")

    # evaluate() is guarded for the same reason construction is: a signature or
    # API mismatch between verifiers versions should fall back to the manual
    # loop, not abort the eval.
    try:
        out = env.evaluate(client=client, model=model, num_examples=n,
                           save_results=False)
    except Exception as e:
        print(f"[eval_local] verifiers evaluate() failed ({type(e).__name__}: {e});"
              " falling back to manual rollout loop.")
        return None

    # Normalize verifiers' GenerateOutputs into our flat per-example shape.
    rewards = list(getattr(out, "reward", []) or [])
    completions = list(getattr(out, "completion", []) or [])
    infos = list(getattr(out, "info", []) or [])

    if not rewards and n > 0:
        # Nothing to report: rather than emit a misleading mean_reward=0.0 with
        # n=0, let the manual loop produce real scores.
        print("[eval_local] verifiers returned no rewards; "
              "falling back to manual rollout loop.")
        return None

    per_example = []
    for i, r in enumerate(rewards):
        info = infos[i] if i < len(infos) else {}
        per_example.append({
            "index": i,
            "task_id": (info or {}).get("task_id", f"example_{i}"),
            "score": float(r),
            "completion": completions[i] if i < len(completions) else "",
        })
    mean = sum(p["score"] for p in per_example) / len(per_example) if per_example else 0.0
    return {"driver": "verifiers", "mean_reward": mean, "per_example": per_example}


# ---------------------------------------------------------------------------
# Path 2 — manual rollout loop (the offline / stub path). Reuses
# spec_rl.fraction_passing and spec_rl.STOP so the reward logic is shared with
# the env rather than duplicated here.
# ---------------------------------------------------------------------------
def manual_rollouts(base_url: str, model: str, n: int, max_tokens: int) -> dict:
    """Run *n* rollouts by hand and score each with ``spec_rl.fraction_passing``.

    Uses /v1/chat/completions when the endpoint offers it, otherwise
    /v1/completions (the stub path).

    Args:
        base_url: OpenAI-compatible endpoint root.
        model: Model name sent with every request.
        n: Number of problems requested from ``spec_rl.load_problems``.
        max_tokens: Generation cap per rollout.

    Returns:
        ``{"driver": "manual", "transport": "chat" | "completions",
        "mean_reward": float, "per_example": [...]}``. ``mean_reward`` is 0.0
        when no problems were loaded.
    """
    problems = spec_rl.load_problems(n)
    # Probe with the real model name: an unknown model would itself yield a 404.
    use_chat = _endpoint_has_chat(base_url, model)
    transport = "chat" if use_chat else "completions"
    print(f"[eval_local] manual loop: {len(problems)} examples via /v1/{transport} "
          f"at {base_url} (model={model})")

    per_example = []
    for i, prob in enumerate(problems):
        if use_chat:
            raw = complete_chat(base_url, model, prob["prompt"], max_tokens)
        else:
            # Stub path: it ignores the prompt and returns a canned body, so we
            # send the bare code prompt the same way humaneval_subset.py does.
            raw = complete_text(base_url, model, prob["prompt"], max_tokens)
        completion = _trim_at_stop(raw)

        # Reward: rebuild the problem from its own fields (never trust the
        # model to echo it) and let spec_rl score the completion against the
        # problem's unit tests.
        problem = {
            "prompt": prob["prompt"],
            "test": prob["test"],
            "entry_point": prob["entry_point"],
        }
        score = spec_rl.fraction_passing(problem, completion)
        per_example.append({
            "index": i,
            "task_id": prob["task_id"],
            "score": score,
            "completion": completion,
        })
        print(f" [{i+1}/{len(problems)}] {prob['task_id']}: "
              f"reward={score:.3f}")

    mean = sum(p["score"] for p in per_example) / len(per_example) if per_example else 0.0
    return {
        "driver": "manual",
        "transport": transport,
        "mean_reward": mean,
        "per_example": per_example,
    }


def main() -> int:
    """Parse CLI arguments, run the eval, print and write the JSON summary.

    Returns:
        Process exit code (0 on success). Invalid arguments exit via argparse.
    """
    parser = argparse.ArgumentParser(
        description="Run the spec_rl RL-eval loop offline against the local stub "
                    "(or any OpenAI-compatible endpoint) and compute the reward."
    )
    parser.add_argument("--base-url", default="http://localhost:8000",
                        help="OpenAI-compatible endpoint (stub :8000 / dflash stub :8001 / vLLM).")
    parser.add_argument("--model", default="laguna")
    parser.add_argument("--n", type=int, default=5,
                        help="Number of HumanEval problems (rollouts).")
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument("--out", default=str(DEFAULT_OUT),
                        help="Where to write the small JSON summary.")
    parser.add_argument("--force-manual", action="store_true",
                        help="Skip the verifiers path; always use the manual rollout loop.")
    args = parser.parse_args()

    # A negative rollout count is meaningless; reject it up front instead of
    # passing it on to the problem loader.
    if args.n < 0:
        parser.error("--n must be non-negative")

    result = None
    if not args.force_manual:
        result = try_verifiers(args.base_url, args.model, args.n)
    if result is None:
        result = manual_rollouts(args.base_url, args.model, args.n, args.max_tokens)

    summary = {
        "base_url": args.base_url,
        "model": args.model,
        "n": len(result["per_example"]),
        "driver": result["driver"],
        # The verifiers driver sets no transport; it always talks chat.
        "transport": result.get("transport", "chat"),
        "mean_reward": result["mean_reward"],
        "scores": [ex["score"] for ex in result["per_example"]],
        "per_example": [
            {"task_id": ex["task_id"], "score": ex["score"]}
            for ex in result["per_example"]
        ],
    }

    # Console output omits the per-example list; the file keeps it.
    print(json.dumps(
        {k: v for k, v in summary.items() if k != "per_example"}, indent=2
    ))
    print(f"[eval_local] driver={summary['driver']} "
          f"mean_reward={summary['mean_reward']:.3f} n={summary['n']}")

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print(f"[eval_local] wrote {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())