lean-laguna / scripts /eval_local.py
art87able's picture
Lean Laguna: lossless DFlash speculative decoding on Laguna XS.2 (harness, environment, results)
0a55ff6
#!/usr/bin/env python3
"""
eval_local.py — run the spec_rl RL-eval loop OFFLINE against the local stub.
Purpose
-------
Prove the *shape* of the RL evaluation loop with NO Prime Intellect credits and
NO GPU: drive the spec_rl HumanEval code task's rollouts against the local,
stdlib OpenAI-compatible stub (scripts/stub_server.py) and compute the SAME
reward the verifiers environment computes (`@vf.reward code_reward`) — run the
model's candidate code against the problem's unit tests and return the FRACTION
of assertions that pass (dense RL signal; the pass@1 eval stays binary).
At the venue the same loop points at the DFlash-speculated vLLM endpoint instead
of the stub. Because greedy speculative decoding is lossless, the reward curve is
identical; only the cost per rollout drops. This script lets us validate the loop
end-to-end before any credits are spent.
Reward logic is NOT reimplemented here — it is imported verbatim from
`environments/spec_rl/spec_rl.py` (`fraction_passing`, `passes`, `STOP`,
`load_problems`), so what runs locally is byte-identical to what the verifiers
env scores at the venue.
Two execution paths (auto-selected, reported in the output)
-----------------------------------------------------------
1. "verifiers" — if `verifiers` imports AND `spec_rl.load_environment()`
constructs cleanly AND the endpoint exposes /v1/chat/completions, drive the
real `vf.SingleTurnEnv.evaluate(...)`. This is the true RL-eval API.
2. "manual" — otherwise, a minimal hand-rolled rollout loop: build the same
chat prompt, call the endpoint, trim at STOP, score with spec_rl.fraction_passing.
This is the path that actually runs against the canned-completion stub
(which serves only /v1/completions), and it is reported as such.
Note on the stub: it returns a fixed canned completion for EVERY prompt, so the
real HumanEval tests will almost always fail (reward 0.0). That is expected and
correct — the point here is to prove the loop runs end-to-end offline without
erroring, not to produce a real pass@1. Real rewards come from Laguna at the venue.
SAFETY: scoring executes model-generated code in a timed subprocess (see
spec_rl.passes). Locally the "code" is the stub's harmless canned snippet. Run RL
rollouts only in the disposable venue sandbox, never against real data.
Usage
-----
# start a stub first: make stub (baseline, :8000)
# or: make stub-b (dflash, :8001)
python scripts/eval_local.py --base-url http://localhost:8000 --model laguna --n 5
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import urllib.error
import urllib.request
from pathlib import Path
# ---------------------------------------------------------------------------
# Import the spec_rl env module so reward logic is shared, not duplicated. The
# env lives in a sibling tree (environments/spec_rl/spec_rl.py); add it to the
# path. spec_rl is import-safe even when `verifiers` is absent (its vf import is
# guarded), so this works on the Mac with no GPU and no verifiers.
# ---------------------------------------------------------------------------
# Resolve this script's absolute location so the derived paths below do not
# depend on the current working directory.
_HERE = Path(__file__).resolve()
_REPO = _HERE.parents[1] # .../laguna-hack
_GPU_HW = _HERE.parents[2] # .../gpu_and_inference_hw
_SPEC_RL_DIR = _GPU_HW / "environments" / "spec_rl"
# Put the env directory at the front of sys.path (only once) so the plain
# `import spec_rl` below can find the module; this must happen before the import.
if str(_SPEC_RL_DIR) not in sys.path:
    sys.path.insert(0, str(_SPEC_RL_DIR))
import spec_rl # noqa: E402 — shared reward core (passes, STOP, load_problems, ...)
# Default destination of the JSON summary; main() exposes it as the --out default.
DEFAULT_OUT = _REPO / "results" / "eval_local.json"
# System prompt mirrors spec_rl.load_environment so the manual loop sends the
# exact same instruction the verifiers env would.
# NOTE(review): the env's own prompt is not visible from this file — confirm the
# two strings are still identical whenever spec_rl.load_environment changes.
# Only complete_chat() sends this prompt; the /v1/completions path (complete_text)
# posts the bare code prompt without it.
SYSTEM_PROMPT = (
    "You are an expert Python programmer. You will be given a function "
    "signature and docstring. Complete the function body only. Do not repeat "
    "the signature, do not add explanations, and do not wrap the code in "
    "markdown fences. Output only the indented function body."
)
# ---------------------------------------------------------------------------
# Endpoint helpers (stdlib urllib only — matches the rest of the harness).
# ---------------------------------------------------------------------------
def _post_json(url: str, payload: dict, timeout: int = 600) -> dict:
    """POST ``payload`` as a JSON body to ``url`` and return the decoded reply.

    Args:
        url: Full URL of the endpoint to call.
        payload: JSON-serialisable request body.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The connection could not be established.
        ValueError: The response body was not valid JSON.
    """
    body = json.dumps(payload).encode()
    request = urllib.request.Request(
        url,
        data=body,  # supplying data makes urllib issue a POST
        headers={"Content-Type": "application/json"},
    )
    # Read the whole body while the connection is open, decode afterwards.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    return json.loads(raw.decode())
def _is_model_not_found(err: urllib.error.HTTPError) -> bool:
    """Return True if a 404 response body describes an unknown *model*.

    OpenAI-compatible servers can answer a request on an existing route with
    404 when the requested model name is not served. The error object is
    looked for either at the top level of the JSON body or under an
    ``"error"`` key. Any body that cannot be read or parsed counts as
    "not a model error".
    """
    try:
        body = json.loads(err.read().decode("utf-8", "replace"))
    except (OSError, ValueError, AttributeError):
        return False
    if not isinstance(body, dict):
        return False
    detail = body.get("error", body)
    if not isinstance(detail, dict):
        return False
    if detail.get("code") == "model_not_found":
        return True
    message = detail.get("message")
    return isinstance(message, str) and "model" in message.lower()


def _endpoint_has_chat(base_url: str, model: str = "probe") -> bool:
    """Report whether the endpoint serves ``/v1/chat/completions``.

    Sends a one-token greedy chat request and interprets the outcome:

    * a successful reply                      -> True
    * an HTTP error other than 404            -> True (the route exists)
    * a 404 whose body is a model-not-found   -> True (the route exists, only
      the probe's model name is unknown to the server)
    * any other 404                           -> False (no chat route)
    * any other failure (connection refused, timeout, bad JSON) -> False

    Args:
        base_url: Endpoint root, with or without a trailing slash.
        model: Model name to put in the probe request. The default is a
            placeholder; callers that know the served model name can pass it
            so the probe does not depend on interpreting a 404 body.

    Returns:
        True if the chat route appears to exist, False otherwise.
    """
    url = base_url.rstrip("/") + "/v1/chat/completions"
    probe = {
        "model": model,
        "messages": [{"role": "user", "content": "ping"}],
        "max_tokens": 1,
        "temperature": 0.0,
    }
    try:
        _post_json(url, probe, timeout=10)
        return True
    except urllib.error.HTTPError as e:
        # 4xx/5xx still means the route exists and parsed our body. A 404 is
        # ambiguous: it can mean "no such route" or "route exists but the
        # probe's model name is not served". Only the former means "no chat
        # endpoint here".
        if e.code != 404:
            return True
        return _is_model_not_found(e)
    except Exception:
        # Best-effort probe: an unreachable or misbehaving endpoint simply
        # counts as "no chat route" so the caller can fall back.
        return False
def complete_chat(base_url: str, model: str, user_content: str, max_tokens: int) -> str:
    """Request one greedy chat completion and return the assistant text.

    Args:
        base_url: Endpoint root, with or without a trailing slash.
        model: Model name placed in the request.
        user_content: Text sent as the user message, after SYSTEM_PROMPT.
        max_tokens: Generation cap forwarded to the endpoint.

    Returns:
        The first choice's message content, or ``""`` when it is empty/None.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    request_body = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        # temperature 0 => greedy => deterministic => lossless-comparable
        "temperature": 0.0,
        "stop": spec_rl.STOP,
    }
    reply = _post_json(base_url.rstrip("/") + "/v1/chat/completions", request_body)
    content = reply["choices"][0]["message"]["content"]
    return content if content else ""
def complete_text(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request one greedy text completion from ``/v1/completions``.

    This is the transport used when the endpoint has no chat route.

    Args:
        base_url: Endpoint root, with or without a trailing slash.
        model: Model name placed in the request.
        prompt: Raw prompt text sent as-is (no system prompt is added).
        max_tokens: Generation cap forwarded to the endpoint.

    Returns:
        The first choice's ``text``, or ``""`` when it is empty/None.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stop": spec_rl.STOP,
    }
    reply = _post_json(base_url.rstrip("/") + "/v1/completions", request_body)
    generated = reply["choices"][0]["text"]
    return generated if generated else ""
def _trim_at_stop(text: str) -> str:
    """Truncate ``text`` before the STOP sequences found in it.

    Each sequence in ``spec_rl.STOP`` is applied in order: if it occurs in
    what is left of the text, everything from its first occurrence onward is
    dropped. Text containing no STOP sequence is returned unchanged.
    """
    remaining = text
    for marker in spec_rl.STOP:
        cut = remaining.find(marker)
        if cut < 0:
            continue  # this marker does not occur; try the next one
        remaining = remaining[:cut]
    return remaining
# ---------------------------------------------------------------------------
# Path 1 — drive the real verifiers env, if (and only if) it constructs cleanly
# AND the endpoint speaks chat. Returns a results dict, or None to fall back.
# ---------------------------------------------------------------------------
def try_verifiers(base_url: str, model: str, n: int) -> dict | None:
    """Run the evaluation through the real verifiers environment if possible.

    The function bails out with ``None`` (so the caller can use the manual
    loop) when any prerequisite is missing, checked in this order:
    ``verifiers`` imports, ``spec_rl.load_environment`` constructs, the
    endpoint has a chat route, and the ``openai`` client imports.

    Args:
        base_url: Endpoint root, with or without a trailing slash.
        model: Model name passed to ``env.evaluate``.
        n: Number of examples to build the env with and to evaluate.

    Returns:
        ``{"driver": "verifiers", "mean_reward": float, "per_example": [...]}``
        or ``None`` when the verifiers path is unavailable.
    """
    try:
        import verifiers as vf  # noqa: F401
    except Exception:
        return None

    # Building the env may hit symbols that a given verifiers version lacks;
    # treat any construction failure as "use the manual loop", not a crash.
    try:
        env = spec_rl.load_environment(num_examples=n)
    except Exception as e:
        print(f"[eval_local] verifiers env did not construct ({type(e).__name__}: {e});"
              " falling back to manual rollout loop.")
        return None

    if not _endpoint_has_chat(base_url):
        print("[eval_local] endpoint has no /v1/chat/completions (the local stub "
              "serves only /v1/completions); using manual rollout loop instead.")
        return None

    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        print("[eval_local] openai client not available; using manual rollout loop.")
        return None

    client = OpenAI(base_url=base_url.rstrip("/") + "/v1", api_key="EMPTY")
    out = env.evaluate(client=client, model=model, num_examples=n, save_results=False)

    # Flatten the evaluate() output into one record per reward. Missing or
    # None attributes become empty lists.
    rewards = list(getattr(out, "reward", []) or [])
    completions = list(getattr(out, "completion", []) or [])
    infos = list(getattr(out, "info", []) or [])

    def _at(seq, idx, default):
        # Positional lookup that tolerates lists shorter than ``rewards``.
        return seq[idx] if idx < len(seq) else default

    per_example = [
        {
            "index": idx,
            "task_id": (_at(infos, idx, {}) or {}).get("task_id", f"example_{idx}"),
            "score": float(reward),
            "completion": _at(completions, idx, ""),
        }
        for idx, reward in enumerate(rewards)
    ]

    if per_example:
        mean = sum(item["score"] for item in per_example) / len(per_example)
    else:
        mean = 0.0
    return {"driver": "verifiers", "mean_reward": mean, "per_example": per_example}
# ---------------------------------------------------------------------------
# Path 2 — manual rollout loop (the offline / stub path). Reuses
# spec_rl.fraction_passing and spec_rl.STOP so the reward is identical to the
# env's @vf.reward.
# ---------------------------------------------------------------------------
def manual_rollouts(base_url: str, model: str, n: int, max_tokens: int) -> dict:
    """Run the hand-rolled rollout loop and score every completion.

    For each problem from ``spec_rl.load_problems(n)`` the endpoint is asked
    for one completion (chat route if available, otherwise
    ``/v1/completions``), the text is trimmed at STOP, and
    ``spec_rl.fraction_passing`` produces the score.

    Args:
        base_url: Endpoint root, with or without a trailing slash.
        model: Model name placed in every request.
        n: Number of problems requested from ``spec_rl.load_problems``.
        max_tokens: Generation cap forwarded to the endpoint.

    Returns:
        ``{"driver": "manual", "transport": "chat" | "completions",
        "mean_reward": float, "per_example": [...]}``.
    """
    problems = spec_rl.load_problems(n)
    use_chat = _endpoint_has_chat(base_url)
    if use_chat:
        transport = "chat"
    else:
        transport = "completions"
    # Both transports take the same arguments, so pick the sender once. The
    # completions sender posts the bare code prompt (the stub path).
    send = complete_chat if use_chat else complete_text
    total = len(problems)
    print(f"[eval_local] manual loop: {total} examples via /v1/{transport} "
          f"at {base_url} (model={model})")

    per_example = []
    for position, prob in enumerate(problems):
        raw = send(base_url, model, prob["prompt"], max_tokens)
        completion = _trim_at_stop(raw)
        # Score against the problem's own fields (never anything echoed by the
        # model); the scoring itself is delegated to spec_rl.
        problem = {key: prob[key] for key in ("prompt", "test", "entry_point")}
        score = spec_rl.fraction_passing(problem, completion)
        per_example.append({
            "index": position,
            "task_id": prob["task_id"],
            "score": score,
            "completion": completion,
        })
        print(f" [{position+1}/{total}] {prob['task_id']}: "
              f"reward={score:.3f}")

    if per_example:
        mean = sum(item["score"] for item in per_example) / len(per_example)
    else:
        mean = 0.0
    return {
        "driver": "manual",
        "transport": transport,
        "mean_reward": mean,
        "per_example": per_example,
    }
def main() -> int:
    """Parse CLI arguments, run the evaluation, and write the JSON summary.

    The verifiers driver is tried first unless ``--force-manual`` is given;
    if it is unavailable the manual rollout loop runs instead. The summary
    (without per-example rows) is printed, and the full summary is written to
    ``--out``, creating parent directories as needed.

    Returns:
        Process exit code ``0``.
    """
    parser = argparse.ArgumentParser(
        description="Run the spec_rl RL-eval loop offline against the local stub "
                    "(or any OpenAI-compatible endpoint) and compute the reward."
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8000",
        help="OpenAI-compatible endpoint (stub :8000 / dflash stub :8001 / vLLM).",
    )
    parser.add_argument("--model", default="laguna")
    parser.add_argument(
        "--n", type=int, default=5, help="Number of HumanEval problems (rollouts)."
    )
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument(
        "--out", default=str(DEFAULT_OUT), help="Where to write the small JSON summary."
    )
    parser.add_argument(
        "--force-manual",
        action="store_true",
        help="Skip the verifiers path; always use the manual rollout loop.",
    )
    args = parser.parse_args()

    # Prefer the verifiers driver; None means "not available, go manual".
    result = None if args.force_manual else try_verifiers(args.base_url, args.model, args.n)
    if result is None:
        result = manual_rollouts(args.base_url, args.model, args.n, args.max_tokens)

    rows = result["per_example"]
    summary = {
        "base_url": args.base_url,
        "model": args.model,
        "n": len(rows),
        "driver": result["driver"],
        # The verifiers driver reports no transport; it always uses chat.
        "transport": result.get("transport", "chat"),
        "mean_reward": result["mean_reward"],
        "scores": [row["score"] for row in rows],
        "per_example": [
            {"task_id": row["task_id"], "score": row["score"]} for row in rows
        ],
    }

    # Console output omits the per-example rows; the file keeps them.
    headline = {key: value for key, value in summary.items() if key != "per_example"}
    print(json.dumps(headline, indent=2))
    print(f"[eval_local] driver={summary['driver']} "
          f"mean_reward={summary['mean_reward']:.3f} n={summary['n']}")

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(summary, indent=2))
    print(f"[eval_local] wrote {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())