Lean Laguna: lossless DFlash speculative decoding on Laguna XS.2 (harness, environment, results)

0a55ff6 about 5 hours ago

13.1 kB

	#!/usr/bin/env python3
	"""
	eval_local.py — run the spec_rl RL-eval loop OFFLINE against the local stub.

	Purpose
	-------
	Prove the shape of the RL evaluation loop with NO Prime Intellect credits and
	NO GPU: drive the spec_rl HumanEval code task's rollouts against the local,
	stdlib OpenAI-compatible stub (scripts/stub_server.py) and compute the SAME
	reward the verifiers environment computes (`@vf.reward code_reward`) — run the
	model's candidate code against the problem's unit tests and return the FRACTION
	of assertions that pass (dense RL signal; the pass@1 eval stays binary).

	At the venue the same loop points at the DFlash-speculated vLLM endpoint instead
	of the stub. Because greedy speculative decoding is lossless, the reward curve is
	identical; only the cost per rollout drops. This script lets us validate the loop
	end-to-end before any credits are spent.

	Reward logic is NOT reimplemented here — it is imported verbatim from
	`environments/spec_rl/spec_rl.py` (`fraction_passing`, `passes`, `STOP`,
	`load_problems`), so what runs locally is byte-identical to what the verifiers
	env scores at the venue.

	Two execution paths (auto-selected, reported in the output)
	-----------------------------------------------------------
	1. "verifiers" — if `verifiers` imports AND `spec_rl.load_environment()`
	constructs cleanly AND the endpoint exposes /v1/chat/completions, drive the
	real `vf.SingleTurnEnv.evaluate(...)`. This is the true RL-eval API.
	2. "manual" — otherwise, a minimal hand-rolled rollout loop: build the same
	chat prompt, call the endpoint, trim at STOP, score with spec_rl.fraction_passing.
	This is the path that actually runs against the canned-completion stub
	(which serves only /v1/completions), and it is reported as such.

	Note on the stub: it returns a fixed canned completion for EVERY prompt, so the
	real HumanEval tests will almost always fail (reward 0.0). That is expected and
	correct — the point here is to prove the loop runs end-to-end offline without
	erroring, not to produce a real pass@1. Real rewards come from Laguna at the venue.

	SAFETY: scoring executes model-generated code in a timed subprocess (see
	spec_rl.passes). Locally the "code" is the stub's harmless canned snippet. Run RL
	rollouts only in the disposable venue sandbox, never against real data.

	Usage
	-----
	# start a stub first: make stub (baseline, :8000)
	# or: make stub-b (dflash, :8001)
	python scripts/eval_local.py --base-url http://localhost:8000 --model laguna --n 5
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import sys
	import urllib.error
	import urllib.request
	from pathlib import Path

	# ---------------------------------------------------------------------------
	# Import the spec_rl env module so reward logic is shared, not duplicated. The
	# env lives in a sibling tree (environments/spec_rl/spec_rl.py); add it to the
	# path. spec_rl is import-safe even when `verifiers` is absent (its vf import is
	# guarded), so this works on the Mac with no GPU and no verifiers.
	# ---------------------------------------------------------------------------
	# Resolve the repo layout relative to this script so it works from any cwd.
	_HERE = Path(__file__).resolve()
	_REPO = _HERE.parents[1]  # .../laguna-hack
	_GPU_HW = _HERE.parents[2]  # .../gpu_and_inference_hw
	_SPEC_RL_DIR = _GPU_HW / "environments" / "spec_rl"
	# Put the env directory at the front of sys.path (once) so the plain
	# `import spec_rl` below resolves to environments/spec_rl/spec_rl.py.
	if str(_SPEC_RL_DIR) not in sys.path:
	    sys.path.insert(0, str(_SPEC_RL_DIR))

	import spec_rl  # noqa: E402 — shared reward core (passes, STOP, load_problems, ...)

	# Default location of the JSON summary written by main().
	DEFAULT_OUT = _REPO / "results" / "eval_local.json"

	# System prompt mirrors spec_rl.load_environment so the manual loop sends the
	# exact same instruction the verifiers env would.
	# One instruction per entry; joined with single spaces into the system message
	# sent ahead of every chat rollout.
	SYSTEM_PROMPT = " ".join(
	    [
	        "You are an expert Python programmer.",
	        "You will be given a function signature and docstring.",
	        "Complete the function body only.",
	        "Do not repeat the signature, do not add explanations, and do not "
	        "wrap the code in markdown fences.",
	        "Output only the indented function body.",
	    ]
	)


	# ---------------------------------------------------------------------------
	# Endpoint helpers (stdlib urllib only — matches the rest of the harness).
	# ---------------------------------------------------------------------------
	def _post_json(url: str, payload: dict, timeout: int = 600) -> dict:
	data = json.dumps(payload).encode()
	req = urllib.request.Request(
	url, data=data, headers={"Content-Type": "application/json"}
	)
	with urllib.request.urlopen(req, timeout=timeout) as r:
	return json.loads(r.read().decode())


	def _endpoint_has_chat(base_url: str) -> bool:
	"""True if the endpoint answers /v1/chat/completions (vLLM does; stub does not)."""
	url = base_url.rstrip("/") + "/v1/chat/completions"
	probe = {
	"model": "probe",
	"messages": [{"role": "user", "content": "ping"}],
	"max_tokens": 1,
	"temperature": 0.0,
	}
	try:
	_post_json(url, probe, timeout=10)
	return True
	except urllib.error.HTTPError as e:
	# 4xx/5xx still means the route exists and parsed our body; only a
	# 404 means "no chat endpoint here" (the stub returns 404 for it).
	return e.code != 404
	except Exception:
	return False


	def complete_chat(base_url: str, model: str, user_content: str, max_tokens: int) -> str:
	    """Greedy chat completion (Laguna/vLLM path).

	    Args:
	        base_url: Endpoint root, with or without a trailing slash.
	        model: Model name sent in the request.
	        user_content: Text of the user message; SYSTEM_PROMPT is sent ahead of it.
	        max_tokens: Generation cap forwarded to the endpoint.

	    Returns:
	        The first choice's message content, or "" when the content is
	        missing/empty (e.g. ``null``).

	    Raises:
	        urllib.error.URLError: The request failed (propagated from _post_json).
	        KeyError / IndexError: The response lacks the expected ``choices`` shape.
	    """
	    url = base_url.rstrip("/") + "/v1/chat/completions"
	    payload = {
	        "model": model,
	        "messages": [
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user", "content": user_content},
	        ],
	        "max_tokens": max_tokens,
	        "temperature": 0.0,  # greedy => deterministic => lossless-comparable
	        "stop": spec_rl.STOP,
	    }
	    obj = _post_json(url, payload)
	    return obj["choices"][0]["message"]["content"] or ""


	def complete_text(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
	    """Greedy text completion (the stub path; also valid for vLLM completions).

	    Args:
	        base_url: Endpoint root, with or without a trailing slash.
	        model: Model name sent in the request.
	        prompt: Raw prompt text posted to /v1/completions.
	        max_tokens: Generation cap forwarded to the endpoint.

	    Returns:
	        The first choice's ``text``, or "" when it is missing/empty.

	    Raises:
	        urllib.error.URLError: The request failed (propagated from _post_json).
	        KeyError / IndexError: The response lacks the expected ``choices`` shape.
	    """
	    url = base_url.rstrip("/") + "/v1/completions"
	    payload = {
	        "model": model,
	        "prompt": prompt,
	        "max_tokens": max_tokens,
	        "temperature": 0.0,  # greedy, same as the chat path
	        "stop": spec_rl.STOP,
	    }
	    obj = _post_json(url, payload)
	    return obj["choices"][0]["text"] or ""


	def _trim_at_stop(text: str) -> str:
	"""Cut at the first STOP sequence, mirroring the env's code_passes reward."""
	for stop in spec_rl.STOP:
	idx = text.find(stop)
	if idx != -1:
	text = text[:idx]
	return text


	# ---------------------------------------------------------------------------
	# Path 1 — drive the real verifiers env, if (and only if) it constructs cleanly
	# AND the endpoint speaks chat. Returns a results dict, or None to fall back.
	# ---------------------------------------------------------------------------
	def try_verifiers(base_url: str, model: str, n: int) -> dict | None:
	    """Run the eval through the real verifiers environment when possible.

	    Every precondition failure returns None so the caller can fall back to the
	    manual rollout loop: `verifiers` not importable, the spec_rl environment
	    failing to construct, the endpoint lacking /v1/chat/completions, the
	    `openai` client missing, `evaluate()` raising, or no rewards coming back.

	    Args:
	        base_url: Endpoint root, with or without a trailing slash.
	        model: Model name passed to ``env.evaluate``.
	        n: Number of examples to load and evaluate.

	    Returns:
	        ``{"driver": "verifiers", "mean_reward": float, "per_example": [...]}``
	        on success, otherwise None.
	    """
	    try:
	        import verifiers as vf  # noqa: F401
	    except Exception:
	        return None
	    # load_environment() builds a vf.SingleTurnEnv. In some verifiers versions
	    # the symbols spec_rl references (e.g. vf.Dataset) may not exist; guard the
	    # whole construction so a mismatch falls back to the manual loop instead of
	    # crashing the eval.
	    try:
	        env = spec_rl.load_environment(num_examples=n)
	    except Exception as e:  # AttributeError/ImportError/etc. -> manual fallback
	        print(f"[eval_local] verifiers env did not construct ({type(e).__name__}: {e});"
	              " falling back to manual rollout loop.")
	        return None
	    if not _endpoint_has_chat(base_url):
	        print("[eval_local] endpoint has no /v1/chat/completions (the local stub "
	              "serves only /v1/completions); using manual rollout loop instead.")
	        return None
	    try:
	        from openai import OpenAI  # type: ignore
	    except Exception:
	        print("[eval_local] openai client not available; using manual rollout loop.")
	        return None

	    client = OpenAI(base_url=base_url.rstrip("/") + "/v1", api_key="EMPTY")
	    # evaluate()'s signature is version-dependent too, so a failure here gets
	    # the same "report and fall back" treatment as construction above.
	    try:
	        out = env.evaluate(client=client, model=model, num_examples=n, save_results=False)
	    except Exception as e:
	        print(f"[eval_local] verifiers evaluate() failed ({type(e).__name__}: {e});"
	              " falling back to manual rollout loop.")
	        return None

	    # Normalize verifiers' GenerateOutputs into our flat per-example shape.
	    rewards = list(getattr(out, "reward", []) or [])
	    completions = list(getattr(out, "completion", []) or [])
	    infos = list(getattr(out, "info", []) or [])
	    per_example = []
	    for i, r in enumerate(rewards):
	        info = infos[i] if i < len(infos) else {}
	        per_example.append({
	            "index": i,
	            "task_id": (info or {}).get("task_id", f"example_{i}"),
	            "score": float(r),
	            "completion": completions[i] if i < len(completions) else "",
	        })
	    if not per_example:
	        # No rewards extracted (e.g. the output object uses other attribute
	        # names): do not report an empty run as a successful verifiers eval.
	        print("[eval_local] verifiers returned no rewards; "
	              "falling back to manual rollout loop.")
	        return None
	    mean = sum(p["score"] for p in per_example) / len(per_example)
	    return {"driver": "verifiers", "mean_reward": mean, "per_example": per_example}


	# ---------------------------------------------------------------------------
	# Path 2 — manual rollout loop (the offline / stub path). Reuses
	# spec_rl.fraction_passing and spec_rl.STOP so the reward is identical to the
	# env's @vf.reward.
	# ---------------------------------------------------------------------------
	def manual_rollouts(base_url: str, model: str, n: int, max_tokens: int) -> dict:
	    """Run a hand-rolled rollout loop and score each completion with spec_rl.

	    Loads problems via ``spec_rl.load_problems(n)``, requests one greedy
	    completion per problem (chat transport when the endpoint offers it,
	    otherwise /v1/completions), trims the output at the stop sequences, and
	    scores it with ``spec_rl.fraction_passing``.

	    Args:
	        base_url: Endpoint root, with or without a trailing slash.
	        model: Model name sent with every request.
	        n: Number of problems requested from ``spec_rl.load_problems``.
	        max_tokens: Generation cap per rollout.

	    Returns:
	        A dict with ``driver`` ("manual"), ``transport`` ("chat" or
	        "completions"), ``mean_reward`` (0.0 when there are no problems) and
	        ``per_example`` (index, task_id, score, completion per problem).
	    """
	    problems = spec_rl.load_problems(n)
	    total = len(problems)
	    use_chat = _endpoint_has_chat(base_url)
	    transport = "chat" if use_chat else "completions"
	    print(f"[eval_local] manual loop: {total} examples via /v1/{transport} "
	          f"at {base_url} (model={model})")

	    per_example = []
	    for i, prob in enumerate(problems):
	        if use_chat:
	            raw = complete_chat(base_url, model, prob["prompt"], max_tokens)
	        else:
	            # Stub path: it ignores the prompt and returns a canned body, so we
	            # send the bare code prompt the same way humaneval_subset.py does.
	            raw = complete_text(base_url, model, prob["prompt"], max_tokens)
	        completion = _trim_at_stop(raw)

	        # Reward: rebuild the problem from its own fields (never trust the
	        # model to echo it) and hand it to the shared spec_rl scorer.
	        problem = {
	            "prompt": prob["prompt"],
	            "test": prob["test"],
	            "entry_point": prob["entry_point"],
	        }
	        score = spec_rl.fraction_passing(problem, completion)
	        per_example.append({
	            "index": i,
	            "task_id": prob["task_id"],
	            "score": score,
	            "completion": completion,
	        })
	        print(f" [{i+1}/{total}] {prob['task_id']}: "
	              f"reward={score:.3f}")

	    mean = sum(p["score"] for p in per_example) / len(per_example) if per_example else 0.0
	    return {
	        "driver": "manual",
	        "transport": transport,
	        "mean_reward": mean,
	        "per_example": per_example,
	    }


	def main(argv: list[str] | None = None) -> int:
	    """Parse arguments, run the eval, print and write the JSON summary.

	    Tries the verifiers driver first (unless --force-manual) and falls back to
	    the manual rollout loop when it returns None.

	    Args:
	        argv: Argument list to parse; None means ``sys.argv[1:]``.

	    Returns:
	        0 on success, 1 when the endpoint could not be reached. Invalid
	        arguments exit through argparse (status 2).
	    """
	    p = argparse.ArgumentParser(
	        description="Run the spec_rl RL-eval loop offline against the local stub "
	                    "(or any OpenAI-compatible endpoint) and compute the reward."
	    )
	    p.add_argument("--base-url", default="http://localhost:8000",
	                   help="OpenAI-compatible endpoint (stub :8000 / dflash stub :8001 / vLLM).")
	    p.add_argument("--model", default="laguna")
	    p.add_argument("--n", type=int, default=5, help="Number of HumanEval problems (rollouts).")
	    p.add_argument("--max-tokens", type=int, default=512)
	    p.add_argument("--out", default=str(DEFAULT_OUT),
	                   help="Where to write the small JSON summary.")
	    p.add_argument("--force-manual", action="store_true",
	                   help="Skip the verifiers path; always use the manual rollout loop.")
	    args = p.parse_args(argv)
	    if args.n < 1:
	        # A zero/negative count would yield an empty (or oddly sliced) eval that
	        # still reports mean_reward 0.0; reject it up front.
	        p.error("--n must be a positive integer")

	    try:
	        result = None
	        if not args.force_manual:
	            result = try_verifiers(args.base_url, args.model, args.n)
	        if result is None:
	            result = manual_rollouts(args.base_url, args.model, args.n, args.max_tokens)
	    except OSError as e:
	        # urllib's URLError/HTTPError and socket timeouts are all OSError
	        # subclasses; report them in one line instead of a traceback.
	        print(f"[eval_local] request to {args.base_url} failed "
	              f"({type(e).__name__}: {e}); is the endpoint running?",
	              file=sys.stderr)
	        return 1

	    summary = {
	        "base_url": args.base_url,
	        "model": args.model,
	        "n": len(result["per_example"]),
	        "driver": result["driver"],
	        # The verifiers driver does not report a transport; it always uses chat.
	        "transport": result.get("transport", "chat"),
	        "mean_reward": result["mean_reward"],
	        "scores": [p["score"] for p in result["per_example"]],
	        "per_example": [
	            {"task_id": p["task_id"], "score": p["score"]}
	            for p in result["per_example"]
	        ],
	    }

	    # Console output omits the per-example list; the file keeps it.
	    print(json.dumps(
	        {k: v for k, v in summary.items() if k != "per_example"}, indent=2
	    ))
	    print(f"[eval_local] driver={summary['driver']} "
	          f"mean_reward={summary['mean_reward']:.3f} n={summary['n']}")

	    out_path = Path(args.out)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    out_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
	    print(f"[eval_local] wrote {out_path}")
	    return 0


	# Script entry point: propagate main()'s return value as the exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())