lean-laguna / evals /humaneval_subset.py
art87able's picture
Lean Laguna: Laguna XS.2 + DFlash — lossless single-GPU speedup + cheaper RL rollouts
8612587
#!/usr/bin/env python3
"""
humaneval_subset.py — a 20-30 problem pass@1 check against an OpenAI-compatible
endpoint. Purpose on the GPU host: PROVE the DFlash run produces the SAME quality
as the baseline (and ideally the same greedy text), so "lossless" isn't just a
claim — it's a measured parity check.
Two modes:
1. Quality: run pass@1 on a HumanEval subset and print the score.
2. Parity: run greedy on both endpoints and report whether the returned completion texts are identical.
This loads HumanEval via `datasets` (openai/openai_humaneval by default; override with the HUMANEVAL_DATASET env var). On the Mac you can dry-run
the harness against a tiny local server; the real numbers come from Laguna on PI.
SAFETY: this executes model-generated code to grade pass@1. Run ONLY in the
disposable isolated sandbox / container, never on your laptop with real data.
A --no-exec flag skips execution and just dumps completions for manual review.
Usage:
python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \
--n 25 --out results/humaneval_dflash.json
# parity check:
python evals/humaneval_subset.py --parity \
--base-url http://localhost:8000 --base-url-b http://localhost:8001 \
--model laguna --n 25
"""
from __future__ import annotations
import argparse
import json
import os
import signal
import urllib.request
from contextlib import contextmanager
def load_problems(n: int):
    """Return the first ``n`` rows of the HumanEval ``test`` split as a list.

    The dataset id is read from the ``HUMANEVAL_DATASET`` environment variable
    and falls back to ``openai/openai_humaneval``. ``n`` is clamped to the
    size of the split, so requesting more problems than exist returns them all.
    """
    # Function-scope imports: `datasets` is a third-party package that is only
    # needed by this loader.
    import os
    from datasets import load_dataset

    # datasets >= 3 requires a namespaced repo id; the bare "openai_humaneval"
    # legacy name now raises. The env var lets a GPU image that pins a
    # different datasets version / mirror point somewhere else.
    repo_id = os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
    split = load_dataset(repo_id, split="test")
    limit = min(n, len(split))
    return [split[idx] for idx in range(limit)]
def complete(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """POST ``prompt`` to ``<base_url>/v1/completions`` and return the text.

    The request uses temperature 0.0 and a fixed list of stop strings. The
    return value is the ``text`` field of the first entry in the response's
    ``choices`` list. Errors from ``urllib`` or from JSON decoding are not
    caught here and propagate to the caller.
    """
    endpoint = f"{base_url.rstrip('/')}/v1/completions"
    body = json.dumps(
        {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            # greedy => deterministic => lossless-comparable
            "temperature": 0.0,
            "stop": ["\nclass ", "\ndef ", "\n#", "\nif __name__"],
        }
    ).encode()
    request = urllib.request.Request(
        endpoint,
        data=body,
        headers={"Content-Type": "application/json"},
    )
    # 600 s timeout on the HTTP call.
    with urllib.request.urlopen(request, timeout=600) as response:
        raw = response.read()
    reply = json.loads(raw.decode())
    return reply["choices"][0]["text"]
@contextmanager
def time_limit(seconds: int):
    """Context manager that raises ``TimeoutError`` after ``seconds`` seconds.

    Installs a temporary ``SIGALRM`` handler and arms ``signal.alarm``. On
    exit the pending alarm is cancelled and the handler that was installed
    before entry is put back, so using this helper does not permanently
    replace a handler owned by the surrounding process.

    Args:
        seconds: Whole number of seconds before the alarm fires.

    Raises:
        TimeoutError: Inside the ``with`` body if the alarm fires first.
    """

    def handler(signum, frame):
        raise TimeoutError("timed out")

    # signal.signal returns whatever handler was active before this call.
    previous = signal.signal(signal.SIGALRM, handler)
    try:
        # Armed inside the try so the cleanup below runs even if arming fails.
        signal.alarm(seconds)
        yield
    finally:
        signal.alarm(0)
        # `previous` is None when the old handler was not installed from
        # Python; signal.signal() rejects None, so leave ours in place then.
        if previous is not None:
            signal.signal(signal.SIGALRM, previous)
def passes(problem: dict, completion: str) -> bool:
    """Execute one HumanEval problem with ``completion`` and report success.

    The program that is run is the problem's ``prompt``, then ``completion``,
    then the problem's ``test`` source, then a call ``check(<entry_point>)``.
    It is executed with ``exec`` in a fresh namespace under an 8-second limit.

    SAFETY: this runs model-generated code; use only inside the sandbox.

    Args:
        problem: Mapping with ``prompt``, ``test`` and ``entry_point`` keys.
        completion: Generated text appended after the prompt.

    Returns:
        True if the program ran to completion without raising; False if it
        raised any ``Exception`` (including the timeout) or ``SystemExit``.
    """
    program = (
        problem["prompt"]
        + completion
        + "\n"
        + problem["test"]
        + f"\ncheck({problem['entry_point']})\n"
    )
    try:
        with time_limit(8):
            ns: dict = {}
            exec(program, ns)  # noqa: S102 — sandbox only
        return True
    except (Exception, SystemExit):
        # SystemExit is not an Exception subclass: generated code that calls
        # exit()/sys.exit() must count as a failed problem rather than end the
        # whole evaluation run. KeyboardInterrupt is deliberately not caught
        # so Ctrl-C still stops the harness.
        return False
def run_quality(args) -> None:
    """Run the pass@1 check and print (and optionally save) the results.

    For each loaded problem a completion is requested from ``args.base_url``.
    Unless ``args.no_exec`` is set, the completion is graded with ``passes``;
    with ``args.no_exec`` every problem is recorded as not passed, so the
    reported score is 0.0. A summary is printed as JSON, and when ``args.out``
    is set the summary plus per-problem results are written there, creating
    the parent directory if needed.
    """
    problems = load_problems(args.n)
    total = len(problems)
    records = []
    passed_count = 0

    for position, problem in enumerate(problems, start=1):
        completion = complete(
            args.base_url, args.model, problem["prompt"], args.max_tokens
        )
        if args.no_exec:
            # Nothing was executed, so the outcome is unknown; shown as "?".
            passed = False
            label = "?"
        else:
            passed = passes(problem, completion)
            label = "PASS" if passed else "fail"
        if passed:
            passed_count += 1
        records.append(
            {"task_id": problem["task_id"], "passed": passed, "completion": completion}
        )
        print(f" [{position}/{total}] {problem['task_id']}: {label}")

    # Guard against division by zero when no problems were loaded.
    score = passed_count / total if problems else 0.0
    summary = {
        "model": args.model,
        "base_url": args.base_url,
        "n": total,
        "pass_at_1": score,
        "no_exec": args.no_exec,
    }
    print(json.dumps(summary, indent=2))

    if not args.out:
        return

    report = {**summary, "results": records}
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
    with open(args.out, "w") as handle:
        handle.write(json.dumps(report, indent=2))
    print(f"[humaneval] wrote {args.out} pass@1={score:.3f}")
def run_parity(args) -> None:
    """Compare greedy completions from endpoint A and endpoint B.

    Each loaded problem's prompt is sent to ``args.base_url`` (A) and
    ``args.base_url_b`` (B) with the same model and token budget, and the two
    returned strings are compared for equality. A per-problem line and a JSON
    summary are printed; ``lossless`` in the summary is true when there were
    no mismatches. The function only reports: it returns None either way.
    """
    problems = load_problems(args.n)
    total = len(problems)
    identical = 0

    for position, problem in enumerate(problems, start=1):
        prompt = problem["prompt"]
        text_a = complete(args.base_url, args.model, prompt, args.max_tokens)
        text_b = complete(args.base_url_b, args.model, prompt, args.max_tokens)
        if text_a == text_b:
            identical += 1
            verdict = "IDENTICAL"
        else:
            verdict = "MISMATCH"
        print(f" [{position}/{total}] {problem['task_id']}: {verdict}")

    mismatches = total - identical
    summary = {
        "parity_pairs": total,
        "identical": identical,
        "mismatches": mismatches,
        "lossless": mismatches == 0,
    }
    print(json.dumps(summary, indent=2))
def main() -> None:
    """Parse command-line flags and run either the parity or the quality mode."""
    parser = argparse.ArgumentParser(
        description="HumanEval subset pass@1 + baseline/DFlash greedy parity check."
    )
    parser.add_argument("--base-url", default="http://localhost:8000")
    parser.add_argument(
        "--base-url-b",
        default="http://localhost:8001",
        help="DFlash endpoint for --parity.",
    )
    parser.add_argument("--model", default="laguna")
    parser.add_argument("--n", type=int, default=25)
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument(
        "--no-exec",
        action="store_true",
        help="Skip code execution; dump completions only.",
    )
    parser.add_argument(
        "--parity",
        action="store_true",
        help="Compare two endpoints' greedy outputs.",
    )
    parser.add_argument("--out", default=None)
    args = parser.parse_args()

    # --parity selects the two-endpoint comparison; otherwise run pass@1.
    runner = run_parity if args.parity else run_quality
    runner(args)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()