lean-laguna / evals /humaneval_subset.py

Lean Laguna: Laguna XS.2 + DFlash — lossless single-GPU speedup + cheaper RL rollouts

8612587 about 9 hours ago

5.71 kB

	#!/usr/bin/env python3
	"""
	humaneval_subset.py — a 20-30 problem pass@1 check against an OpenAI-compatible
	endpoint. Purpose on the GPU host: PROVE the DFlash run produces the SAME quality
	as the baseline (and ideally the same greedy text), so "lossless" isn't just a
	claim — it's a measured parity check.

	Two modes:
	1. Quality: run pass@1 on a HumanEval subset and print the score.
	2. Parity: run greedy on both endpoints and report whether the returned completion texts are identical.

	This loads HumanEval via `datasets` (repo id "openai/openai_humaneval" unless the HUMANEVAL_DATASET environment variable overrides it). On the Mac you can dry-run
	the harness against a tiny local server; the real numbers come from Laguna on PI.

	SAFETY: this executes model-generated code to grade pass@1. Run ONLY in the
	disposable isolated sandbox / container, never on your laptop with real data.
	A --no-exec flag skips execution and just dumps completions for manual review.

	Usage:
	python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \
	--n 25 --out results/humaneval_dflash.json
	# parity check:
	python evals/humaneval_subset.py --parity \
	--base-url http://localhost:8000 --base-url-b http://localhost:8001 \
	--model laguna --n 25
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import signal
	import urllib.request
	from contextlib import contextmanager


	def load_problems(n: int):
	    """Return the first ``n`` rows of the HumanEval ``test`` split as a list.

	    The dataset repo id comes from the ``HUMANEVAL_DATASET`` environment
	    variable and falls back to ``"openai/openai_humaneval"``. The original
	    author's note says the namespaced id is needed because newer ``datasets``
	    releases reject the bare legacy name ``"openai_humaneval"``; the override
	    exists for images that pin a different ``datasets`` version or a mirror.

	    ``n`` is capped at the number of rows in the split.
	    """
	    # Function-local import: the module itself can be imported without
	    # `datasets` installed; only this call needs it.
	    from datasets import load_dataset

	    repo_id = os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
	    split = load_dataset(repo_id, split="test")
	    limit = min(n, len(split))
	    return [split[idx] for idx in range(limit)]


	def complete(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
	    """POST ``prompt`` to ``<base_url>/v1/completions`` and return the generated text.

	    Args:
	        base_url: Server root; trailing slashes are stripped before the
	            ``/v1/completions`` path is appended.
	        model: Value sent as the ``model`` field of the request.
	        prompt: Value sent as the ``prompt`` field of the request.
	        max_tokens: Value sent as the ``max_tokens`` field of the request.

	    Returns:
	        The ``text`` field of the first entry in the response's ``choices``.

	    Raises:
	        Whatever ``urllib.request.urlopen`` raises on connection/HTTP errors
	        (the request uses a 600 second timeout), and ``KeyError`` /
	        ``IndexError`` if the response JSON lacks ``choices[0]["text"]``.
	    """
	    endpoint = f"{base_url.rstrip('/')}/v1/completions"
	    body = json.dumps(
	        {
	            "model": model,
	            "prompt": prompt,
	            "max_tokens": max_tokens,
	            # Temperature 0 asks for greedy decoding, so that two runs are
	            # meant to be deterministic and directly comparable.
	            "temperature": 0.0,
	            # Cut generation when the model starts a new top-level construct.
	            "stop": ["\nclass ", "\ndef ", "\n#", "\nif __name__"],
	        }
	    ).encode()
	    request = urllib.request.Request(
	        endpoint,
	        data=body,
	        headers={"Content-Type": "application/json"},
	    )
	    with urllib.request.urlopen(request, timeout=600) as response:
	        raw = response.read()
	    reply = json.loads(raw.decode())
	    return reply["choices"][0]["text"]


	@contextmanager
	def time_limit(seconds: float):
	    """Raise ``TimeoutError`` inside the ``with`` body once ``seconds`` elapse.

	    The limit is enforced with the ``SIGALRM`` real-time interval timer, so it
	    is only usable where the ``signal`` module provides ``SIGALRM`` and
	    ``setitimer`` and where ``signal.signal`` may be called (per the Python
	    docs: the main thread of the main interpreter).

	    Args:
	        seconds: Wall-clock budget. Fractional values are accepted; ``0``
	            disables the timer, i.e. no limit is applied.

	    Raises:
	        TimeoutError: raised from the signal handler, in the frame that is
	            executing when the timer fires.

	    On exit the timer is cancelled and the ``SIGALRM`` handler that was active
	    before entry is put back, so using this context manager does not leave a
	    stale handler installed for the rest of the process.
	    """

	    def handler(signum, frame):
	        raise TimeoutError("timed out")

	    previous = signal.signal(signal.SIGALRM, handler)
	    try:
	        # Arm inside the try so a failure to arm still restores the handler.
	        signal.setitimer(signal.ITIMER_REAL, seconds)
	        yield
	    finally:
	        # Cancel first so a pending alarm cannot fire after the handler swap.
	        signal.setitimer(signal.ITIMER_REAL, 0)
	        # `previous` is None when the old handler was not installed from
	        # Python; signal.signal() cannot re-install that, so leave it alone.
	        if previous is not None:
	            signal.signal(signal.SIGALRM, previous)


	def passes(problem: dict, completion: str) -> bool:
	    """Execute one HumanEval problem with ``completion`` and report success.

	    The program that is run is ``problem["prompt"] + completion``, followed by
	    ``problem["test"]`` and a call ``check(<entry_point>)``. It is executed
	    with ``exec`` in a fresh namespace under an 8 second ``time_limit``.

	    SECURITY: this runs model-generated code in the current process. It must
	    only be used inside a disposable sandbox.

	    Args:
	        problem: Mapping with the keys ``prompt``, ``test`` and ``entry_point``.
	        completion: Model output appended to the prompt.

	    Returns:
	        ``True`` if the program ran to completion without raising, ``False``
	        on any failure, including a timeout or an attempt to exit the process.

	    Raises:
	        KeyboardInterrupt: propagated so the operator can still abort a run.
	    """
	    program = (
	        problem["prompt"] + completion + "\n" + problem["test"]
	        + f"\ncheck({problem['entry_point']})\n"
	    )
	    try:
	        with time_limit(8):
	            ns: dict = {}
	            exec(program, ns)  # noqa: S102 — sandbox only
	        return True
	    except KeyboardInterrupt:
	        raise
	    except BaseException:
	        # Generated code may call sys.exit(); SystemExit is not a subclass of
	        # Exception, so a narrower clause would let it terminate the whole
	        # evaluation instead of counting as a failed problem.
	        return False


	def run_quality(args) -> None:
	    """Generate a completion per problem, grade it, and report pass@1.

	    Reads ``args.n``, ``args.base_url``, ``args.model``, ``args.max_tokens``,
	    ``args.no_exec`` and ``args.out``. One progress line is printed per
	    problem, then a JSON summary. When ``args.no_exec`` is set nothing is
	    executed: every problem is recorded with ``passed`` false and the reported
	    score is therefore 0.0. If ``args.out`` is set, the full report (summary
	    plus per-problem completions) is written there as JSON, creating the
	    parent directory if needed.
	    """
	    problems = load_problems(args.n)
	    total = len(problems)
	    skip_exec = args.no_exec

	    records = []
	    passed_count = 0
	    for position, problem in enumerate(problems, start=1):
	        text = complete(args.base_url, args.model, problem["prompt"], args.max_tokens)
	        if skip_exec:
	            passed = False
	        else:
	            passed = passes(problem, text)
	        passed_count += int(passed)
	        records.append({"task_id": problem["task_id"], "passed": passed, "completion": text})

	        if passed:
	            label = "PASS"
	        elif skip_exec:
	            label = "?"
	        else:
	            label = "fail"
	        print(f" [{position}/{total}] {problem['task_id']}: {label}")

	    score = passed_count / total if total else 0.0
	    summary = {
	        "model": args.model,
	        "base_url": args.base_url,
	        "n": total,
	        "pass_at_1": score,
	        "no_exec": skip_exec,
	    }
	    print(json.dumps(summary, indent=2))

	    if not args.out:
	        return

	    # The on-disk report is the summary followed by the per-problem records.
	    report = {**summary, "results": records}
	    parent_dir = os.path.dirname(args.out) or "."
	    os.makedirs(parent_dir, exist_ok=True)
	    with open(args.out, "w") as handle:
	        json.dump(report, handle, indent=2)
	    print(f"[humaneval] wrote {args.out} pass@1={score:.3f}")


	def run_parity(args) -> None:
	    """Compare greedy completions from endpoint A (baseline) and B (DFlash).

	    For each problem the same prompt is sent to ``args.base_url`` and
	    ``args.base_url_b`` and the two returned completion strings are compared
	    for exact equality. The comparison is on the returned text, not on token
	    ids. One progress line is printed per problem, followed by a JSON summary
	    with the keys ``parity_pairs``, ``identical``, ``mismatches`` and
	    ``lossless``.

	    ``lossless`` is true only when at least one pair was compared and none of
	    them differed; an empty problem list proves nothing and is reported as
	    not lossless. The function only reports: it does not raise or change the
	    exit status on a mismatch.
	    """
	    problems = load_problems(args.n)
	    mismatches = 0
	    for i, prob in enumerate(problems):
	        a = complete(args.base_url, args.model, prob["prompt"], args.max_tokens)
	        b = complete(args.base_url_b, args.model, prob["prompt"], args.max_tokens)
	        same = a == b
	        mismatches += int(not same)
	        print(f" [{i+1}/{len(problems)}] {prob['task_id']}: {'IDENTICAL' if same else 'MISMATCH'}")
	    n = len(problems)
	    print(json.dumps({"parity_pairs": n, "identical": n - mismatches,
	                      "mismatches": mismatches,
	                      # Guard against a vacuous "true" when nothing was compared.
	                      "lossless": n > 0 and mismatches == 0}, indent=2))


	def main() -> None:
	    """Parse the command line and run either the parity or the quality check.

	    ``--parity`` selects ``run_parity``; otherwise ``run_quality`` is run.
	    """
	    parser = argparse.ArgumentParser(
	        description="HumanEval subset pass@1 + baseline/DFlash greedy parity check."
	    )
	    # (flag, add_argument keyword arguments), registered in this order so the
	    # generated --help output keeps the same layout.
	    option_specs = (
	        ("--base-url", {"default": "http://localhost:8000"}),
	        ("--base-url-b", {"default": "http://localhost:8001",
	                          "help": "DFlash endpoint for --parity."}),
	        ("--model", {"default": "laguna"}),
	        ("--n", {"type": int, "default": 25}),
	        ("--max-tokens", {"type": int, "default": 512}),
	        ("--no-exec", {"action": "store_true",
	                       "help": "Skip code execution; dump completions only."}),
	        ("--parity", {"action": "store_true",
	                      "help": "Compare two endpoints' greedy outputs."}),
	        ("--out", {"default": None}),
	    )
	    for flag, options in option_specs:
	        parser.add_argument(flag, **options)
	    args = parser.parse_args()

	    runner = run_parity if args.parity else run_quality
	    runner(args)


	# Script entry point: run the CLI only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()