| |
| """ |
| eval_local.py — run the spec_rl RL-eval loop OFFLINE against the local stub. |
| |
| Purpose |
| ------- |
| Prove the *shape* of the RL evaluation loop with NO Prime Intellect credits and |
| NO GPU: drive the spec_rl HumanEval code task's rollouts against the local, |
| stdlib OpenAI-compatible stub (scripts/stub_server.py) and compute the SAME |
| reward the verifiers environment computes (`@vf.reward code_reward`) — run the |
| model's candidate code against the problem's unit tests and return the FRACTION |
| of assertions that pass (dense RL signal; the pass@1 eval stays binary). |
| |
| At the venue the same loop points at the DFlash-speculated vLLM endpoint instead |
| of the stub. Because greedy speculative decoding is lossless, the reward curve is |
| identical; only the cost per rollout drops. This script lets us validate the loop |
| end-to-end before any credits are spent. |
| |
| Reward logic is NOT reimplemented here — it is imported verbatim from |
| `environments/spec_rl/spec_rl.py` (`fraction_passing`, `passes`, `STOP`, |
| `load_problems`), so what runs locally is byte-identical to what the verifiers |
| env scores at the venue. |
| |
| Two execution paths (auto-selected, reported in the output) |
| ----------------------------------------------------------- |
| 1. "verifiers" — if `verifiers` imports AND `spec_rl.load_environment()` |
| constructs cleanly AND the endpoint exposes /v1/chat/completions, drive the |
| real `vf.SingleTurnEnv.evaluate(...)`. This is the true RL-eval API. |
| 2. "manual" — otherwise, a minimal hand-rolled rollout loop: build the same |
| chat prompt, call the endpoint, trim at STOP, score with spec_rl.fraction_passing. |
| This is the path that actually runs against the canned-completion stub |
| (which serves only /v1/completions), and it is reported as such. |
| |
| Note on the stub: it returns a fixed canned completion for EVERY prompt, so the |
| real HumanEval tests will almost always fail (reward 0.0). That is expected and |
| correct — the point here is to prove the loop runs end-to-end offline without |
| erroring, not to produce a real pass@1. Real rewards come from Laguna at the venue. |
| |
| SAFETY: scoring executes model-generated code in a timed subprocess (see |
| spec_rl.passes). Locally the "code" is the stub's harmless canned snippet. Run RL |
| rollouts only in the disposable venue sandbox, never against real data. |
| |
| Usage |
| ----- |
| # start a stub first: make stub (baseline, :8000) |
| # or: make stub-b (dflash, :8001) |
| python scripts/eval_local.py --base-url http://localhost:8000 --model laguna --n 5 |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import sys |
| import urllib.error |
| import urllib.request |
| from pathlib import Path |
|
|
| |
| |
| |
| |
| |
| |
# Locate the spec_rl environment relative to this file and make it importable.
# Absolute, symlink-resolved path of this script.
_HERE = Path(__file__).resolve()
# One directory above the directory that contains this script.
_REPO = _HERE.parents[1]
# Two directories above the directory that contains this script.
_GPU_HW = _HERE.parents[2]
_SPEC_RL_DIR = _GPU_HW.joinpath("environments", "spec_rl")
# Prepend (once) so the `import spec_rl` that follows resolves to that directory.
if str(_SPEC_RL_DIR) not in sys.path:
    sys.path.insert(0, str(_SPEC_RL_DIR))
|
|
| import spec_rl |
|
|
# Default destination of the JSON summary (used as the --out default in main()).
DEFAULT_OUT = _REPO.joinpath("results", "eval_local.json")
|
|
| |
| |
# System message sent with every request on the chat-completions path
# (see complete_chat). One sentence per source line; the adjacent literals are
# concatenated at compile time into a single string.
SYSTEM_PROMPT = (
    "You are an expert Python programmer. "
    "You will be given a function signature and docstring. "
    "Complete the function body only. "
    "Do not repeat the signature, do not add explanations, "
    "and do not wrap the code in markdown fences. "
    "Output only the indented function body."
)
|
|
|
|
| |
| |
| |
def _post_json(url: str, payload: dict, timeout: int = 600) -> dict:
    """POST *payload* as a JSON body to *url* and return the decoded JSON reply.

    Args:
        url: Full endpoint URL.
        payload: JSON-serialisable request body.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        The response body parsed with ``json.loads``.

    Errors raised by ``urlopen`` (e.g. ``urllib.error.HTTPError``) or by the
    JSON decoding are not caught here; they propagate to the caller.
    """
    body = json.dumps(payload).encode()
    request = urllib.request.Request(
        url,
        data=body,  # supplying a body makes urllib issue a POST
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    return json.loads(raw.decode())
|
|
|
|
def _endpoint_has_chat(base_url: str) -> bool:
    """Report whether *base_url* serves ``/v1/chat/completions``.

    A one-token probe request is sent with a 10-second timeout:

    * the request succeeds            -> True
    * HTTP error other than 404       -> True (the route answered, it merely
      rejected the probe)
    * HTTP 404                        -> False
    * any other failure (connection refused, timeout, bad JSON, ...) -> False
    """
    chat_url = base_url.rstrip("/") + "/v1/chat/completions"
    ping = {
        "model": "probe",
        "messages": [{"role": "user", "content": "ping"}],
        "max_tokens": 1,
        "temperature": 0.0,
    }
    try:
        _post_json(chat_url, ping, timeout=10)
    except urllib.error.HTTPError as http_err:
        # Only "not found" is read as "this route does not exist".
        return http_err.code != 404
    except Exception:
        # Deliberately broad: an unreachable or misbehaving endpoint simply
        # counts as "no chat route".
        return False
    return True
|
|
|
|
def complete_chat(base_url: str, model: str, user_content: str, max_tokens: int) -> str:
    """Request one greedy chat completion and return the assistant's text.

    Args:
        base_url: Endpoint root; a trailing ``/`` is tolerated.
        model: Model name forwarded in the request.
        user_content: Text of the user message (SYSTEM_PROMPT is sent as the
            system message).
        max_tokens: Generation cap forwarded in the request.

    Returns:
        ``choices[0].message.content`` of the reply, or ``""`` when that
        field is empty or null.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    request_body = {
        "model": model,
        "messages": conversation,
        "max_tokens": max_tokens,
        "temperature": 0.0,  # greedy decoding
        "stop": spec_rl.STOP,
    }
    reply = _post_json(base_url.rstrip("/") + "/v1/chat/completions", request_body)
    content = reply["choices"][0]["message"]["content"]
    return content if content else ""
|
|
|
|
def complete_text(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request one greedy text completion from ``/v1/completions``.

    This is the transport used when the endpoint has no chat route (the
    local stub).

    Args:
        base_url: Endpoint root; a trailing ``/`` is tolerated.
        model: Model name forwarded in the request.
        prompt: Raw prompt text.
        max_tokens: Generation cap forwarded in the request.

    Returns:
        ``choices[0].text`` of the reply, or ``""`` when that field is empty
        or null.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,  # greedy decoding
        "stop": spec_rl.STOP,
    }
    reply = _post_json(base_url.rstrip("/") + "/v1/completions", request_body)
    generated = reply["choices"][0]["text"]
    return generated if generated else ""
|
|
|
|
| def _trim_at_stop(text: str) -> str: |
| """Cut at the first STOP sequence, mirroring the env's code_passes reward.""" |
| for stop in spec_rl.STOP: |
| idx = text.find(stop) |
| if idx != -1: |
| text = text[:idx] |
| return text |
|
|
|
|
| |
| |
| |
| |
def try_verifiers(base_url: str, model: str, n: int) -> dict | None:
    """Attempt the evaluation through the real verifiers environment.

    Preconditions are checked in this order; the first one that fails makes
    the function return ``None`` so the caller can fall back to the manual
    rollout loop:

    1. ``verifiers`` is importable,
    2. ``spec_rl.load_environment(num_examples=n)`` constructs without raising,
    3. the endpoint answers ``/v1/chat/completions``,
    4. the ``openai`` client package is importable.

    Args:
        base_url: Endpoint root; ``/v1`` is appended for the OpenAI client.
        model: Model name passed to ``env.evaluate``.
        n: Number of examples to evaluate.

    Returns:
        ``None`` on fallback, otherwise a dict with ``driver`` ("verifiers"),
        ``mean_reward`` and ``per_example`` (index, task_id, score,
        completion for each reward returned).

    Exceptions raised by ``env.evaluate`` itself are not caught.
    """
    try:
        import verifiers  # noqa: F401  -- availability probe only
    except Exception:
        return None

    try:
        env = spec_rl.load_environment(num_examples=n)
    except Exception as exc:
        print(f"[eval_local] verifiers env did not construct ({type(exc).__name__}: {exc});"
              " falling back to manual rollout loop.")
        return None

    if not _endpoint_has_chat(base_url):
        print("[eval_local] endpoint has no /v1/chat/completions (the local stub "
              "serves only /v1/completions); using manual rollout loop instead.")
        return None

    try:
        from openai import OpenAI
    except Exception:
        print("[eval_local] openai client not available; using manual rollout loop.")
        return None

    client = OpenAI(base_url=base_url.rstrip("/") + "/v1", api_key="EMPTY")
    outcome = env.evaluate(client=client, model=model, num_examples=n, save_results=False)

    def _column(field: str) -> list:
        # Read a result attribute defensively: a missing or falsy attribute
        # becomes an empty list.
        return list(getattr(outcome, field, []) or [])

    rewards = _column("reward")
    completions = _column("completion")
    infos = _column("info")

    # Pad the optional columns so each reward lines up with an info dict and
    # a completion; rewards alone decide how many rows are produced.
    infos += [{}] * (len(rewards) - len(infos))
    completions += [""] * (len(rewards) - len(completions))

    per_example = [
        {
            "index": idx,
            "task_id": (info or {}).get("task_id", f"example_{idx}"),
            "score": float(reward),
            "completion": completion,
        }
        for idx, (reward, info, completion) in enumerate(zip(rewards, infos, completions))
    ]

    if per_example:
        mean = sum(row["score"] for row in per_example) / len(per_example)
    else:
        mean = 0.0
    return {"driver": "verifiers", "mean_reward": mean, "per_example": per_example}
|
|
|
|
| |
| |
| |
| |
def manual_rollouts(base_url: str, model: str, n: int, max_tokens: int) -> dict:
    """Run the hand-rolled rollout loop and score every completion.

    For each problem from ``spec_rl.load_problems(n)`` the endpoint is asked
    for a completion (chat route when available, otherwise text completions),
    the reply is cut at the stop sequences, and the result is scored with
    ``spec_rl.fraction_passing``. Progress is printed per example.

    Args:
        base_url: Endpoint root.
        model: Model name forwarded in each request.
        n: Number of problems to load.
        max_tokens: Generation cap forwarded in each request.

    Returns:
        A dict with ``driver`` ("manual"), ``transport`` ("chat" or
        "completions"), ``mean_reward`` (0.0 when there are no examples) and
        ``per_example`` (index, task_id, score, completion).
    """
    problems = spec_rl.load_problems(n)
    total = len(problems)

    # Probe once and pick the matching request function for the whole run.
    if _endpoint_has_chat(base_url):
        transport, generate = "chat", complete_chat
    else:
        transport, generate = "completions", complete_text
    print(f"[eval_local] manual loop: {total} examples via /v1/{transport} "
          f"at {base_url} (model={model})")

    per_example = []
    for position, prob in enumerate(problems, start=1):
        completion = _trim_at_stop(generate(base_url, model, prob["prompt"], max_tokens))

        # Hand the scorer only the three fields copied here.
        scoring_input = {key: prob[key] for key in ("prompt", "test", "entry_point")}
        score = spec_rl.fraction_passing(scoring_input, completion)

        per_example.append({
            "index": position - 1,
            "task_id": prob["task_id"],
            "score": score,
            "completion": completion,
        })
        print(f" [{position}/{total}] {prob['task_id']}: "
              f"reward={score:.3f}")

    if per_example:
        mean = sum(row["score"] for row in per_example) / len(per_example)
    else:
        mean = 0.0
    return {
        "driver": "manual",
        "transport": transport,
        "mean_reward": mean,
        "per_example": per_example,
    }
|
|
|
|
def main() -> int:
    """Command-line entry point: run the evaluation and write a JSON summary.

    The verifiers path is tried first unless ``--force-manual`` is given; when
    it is skipped or returns ``None`` the manual rollout loop runs instead.
    The summary (without completions) is printed and written to ``--out``,
    creating parent directories as needed.

    Returns:
        0 on completion.
    """
    parser = argparse.ArgumentParser(
        description="Run the spec_rl RL-eval loop offline against the local stub "
                    "(or any OpenAI-compatible endpoint) and compute the reward."
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8000",
        help="OpenAI-compatible endpoint (stub :8000 / dflash stub :8001 / vLLM).",
    )
    parser.add_argument("--model", default="laguna")
    parser.add_argument(
        "--n", type=int, default=5,
        help="Number of HumanEval problems (rollouts).",
    )
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument(
        "--out",
        default=str(DEFAULT_OUT),
        help="Where to write the small JSON summary.",
    )
    parser.add_argument(
        "--force-manual",
        action="store_true",
        help="Skip the verifiers path; always use the manual rollout loop.",
    )
    args = parser.parse_args()

    result = None if args.force_manual else try_verifiers(args.base_url, args.model, args.n)
    if result is None:
        result = manual_rollouts(args.base_url, args.model, args.n, args.max_tokens)

    rows = result["per_example"]
    summary = {
        "base_url": args.base_url,
        "model": args.model,
        "n": len(rows),
        "driver": result["driver"],
        # Only the manual driver reports a transport; default to "chat" otherwise.
        "transport": result.get("transport", "chat"),
        "mean_reward": result["mean_reward"],
        "scores": [row["score"] for row in rows],
        "per_example": [
            {"task_id": row["task_id"], "score": row["score"]} for row in rows
        ],
    }

    # Console output omits the per-example list; the file keeps it.
    headline = dict(summary)
    del headline["per_example"]
    print(json.dumps(headline, indent=2))
    print(f"[eval_local] driver={summary['driver']} "
          f"mean_reward={summary['mean_reward']:.3f} n={summary['n']}")

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(summary, indent=2))
    print(f"[eval_local] wrote {out_path}")
    return 0
|
|
|
|
# Script entry: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
|