#!/usr/bin/env python3
"""
humaneval_subset.py — a 20-30 problem pass@1 check against an OpenAI-compatible endpoint.

Purpose on the GPU host: PROVE the DFlash run produces the SAME quality as the
baseline (and ideally the same greedy text), so "lossless" isn't just a claim —
it's a measured parity check. Two modes:

  1. Quality: run pass@1 on a HumanEval subset and print the score.
  2. Parity:  run greedy on both endpoints and assert outputs are token-identical.

This loads HumanEval via `datasets` (openai_humaneval). On the Mac you can
dry-run the harness against a tiny local server; the real numbers come from
Laguna on PI.

SAFETY: this executes model-generated code to grade pass@1. Run ONLY in the
disposable isolated sandbox / container, never on your laptop with real data.
A --no-exec flag skips execution and just dumps completions for manual review.

Usage:
  python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \
      --n 25 --out results/humaneval_dflash.json
  # parity check:
  python evals/humaneval_subset.py --parity \
      --base-url http://localhost:8000 --base-url-b http://localhost:8001 \
      --model laguna --n 25
"""
from __future__ import annotations

import argparse
import json
import os
import signal
import urllib.request
from contextlib import contextmanager


def load_problems(n: int):
    """Return the first ``n`` problems of the HumanEval ``test`` split.

    Args:
        n: Maximum number of problems to return; clamped to the dataset size.

    Returns:
        A list of dataset rows, taken in dataset order starting at index 0.
    """
    # datasets >= 3 requires a namespaced repo id; the bare "openai_humaneval"
    # legacy name now raises. Override with HUMANEVAL_DATASET if the GPU image
    # pins a different datasets version / mirror.
    # Imported lazily so the module itself loads without `datasets` installed.
    from datasets import load_dataset

    dataset_id = os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
    ds = load_dataset(dataset_id, split="test")
    n = min(n, len(ds))
    return [ds[i] for i in range(n)]


def complete(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request one greedy completion from ``<base_url>/v1/completions``.

    Args:
        base_url: Endpoint root; a trailing slash is tolerated.
        model: Model name sent in the request payload.
        prompt: Prompt text to complete.
        max_tokens: Generation budget sent in the request payload.

    Returns:
        The ``text`` field of the first choice in the JSON response.

    Raises:
        Whatever ``urllib.request.urlopen`` / ``json.loads`` raise on a failed
        request or malformed response; nothing is caught here.
    """
    url = base_url.rstrip("/") + "/v1/completions"
    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,  # greedy => deterministic => lossless-comparable
        "stop": ["\nclass ", "\ndef ", "\n#", "\nif __name__"],
    }
    data = json.dumps(payload).encode()
    req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req, timeout=600) as r:
        obj = json.loads(r.read().decode())
    return obj["choices"][0]["text"]


@contextmanager
def time_limit(seconds: int):
    """Raise ``TimeoutError`` inside the ``with`` body after ``seconds`` seconds.

    Implemented with ``SIGALRM``, so it relies on ``signal.alarm`` being
    available and on being entered from the main thread. The SIGALRM handler
    that was active on entry is put back on exit.
    """
    def handler(signum, frame):
        raise TimeoutError("timed out")

    previous = signal.signal(signal.SIGALRM, handler)
    try:
        signal.alarm(seconds)
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm before restoring the handler
        # signal.signal() returns None when the old handler was not installed
        # from Python; there is nothing we can restore in that case.
        if previous is not None:
            signal.signal(signal.SIGALRM, previous)


def passes(problem: dict, completion: str) -> bool:
    """Execute prompt + completion + tests and report whether they ran cleanly.

    The program is the problem's ``prompt``, the model ``completion``, the
    problem's ``test`` code and a final ``check(<entry_point>)`` call, executed
    with ``exec`` under an 8 second time limit.

    Args:
        problem: Row providing the ``prompt``, ``test`` and ``entry_point`` keys.
        completion: Model-generated text appended to the prompt.

    Returns:
        True if the program finished without raising, False otherwise
        (including on timeout).
    """
    program = (
        problem["prompt"] + completion + "\n" + problem["test"]
        + f"\ncheck({problem['entry_point']})\n"
    )
    try:
        with time_limit(8):
            ns: dict = {}
            exec(program, ns)  # noqa: S102 — sandbox only
        return True
    except (Exception, SystemExit):
        # SystemExit is not an Exception subclass: generated code that calls
        # sys.exit()/exit() must count as a failed problem, not terminate the
        # harness. KeyboardInterrupt is deliberately left uncaught.
        return False


def run_quality(args) -> None:
    """Run pass@1 over the problem subset, print a summary, optionally save it.

    With ``args.no_exec`` set nothing is executed: every problem is recorded as
    not passed and the reported ``pass_at_1`` is 0.0 (the ``no_exec`` field in
    the output marks such runs).
    """
    problems = load_problems(args.n)
    total = len(problems)
    results = []
    n_pass = 0
    for i, prob in enumerate(problems):
        comp = complete(args.base_url, args.model, prob["prompt"], args.max_tokens)
        ok = False if args.no_exec else passes(prob, comp)
        n_pass += int(ok)
        results.append({"task_id": prob["task_id"], "passed": ok, "completion": comp})
        if ok:
            status = "PASS"
        else:
            status = "?" if args.no_exec else "fail"
        print(f" [{i+1}/{total}] {prob['task_id']}: {status}")

    score = n_pass / total if problems else 0.0
    out = {"model": args.model, "base_url": args.base_url, "n": total,
           "pass_at_1": score, "no_exec": args.no_exec, "results": results}
    # Console summary omits the bulky per-problem results.
    print(json.dumps({k: v for k, v in out.items() if k != "results"}, indent=2))
    if args.out:
        os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
        with open(args.out, "w", encoding="utf-8") as f:
            json.dump(out, f, indent=2)
        print(f"[humaneval] wrote {args.out} pass@1={score:.3f}")


def run_parity(args) -> None:
    """Greedy outputs from baseline (A) and DFlash (B) must be token-identical.

    Each prompt is sent to ``args.base_url`` and ``args.base_url_b``; the two
    returned completion strings are compared for exact equality and a JSON
    summary is printed. The result is only printed, not raised or returned.
    """
    problems = load_problems(args.n)
    n = len(problems)
    mismatches = 0
    for i, prob in enumerate(problems):
        a = complete(args.base_url, args.model, prob["prompt"], args.max_tokens)
        b = complete(args.base_url_b, args.model, prob["prompt"], args.max_tokens)
        same = a == b
        mismatches += int(not same)
        print(f" [{i+1}/{n}] {prob['task_id']}: {'IDENTICAL' if same else 'MISMATCH'}")
    print(json.dumps({"parity_pairs": n, "identical": n - mismatches,
                      "mismatches": mismatches, "lossless": mismatches == 0}, indent=2))


def main() -> None:
    """Parse command-line arguments and run the parity or quality mode."""
    p = argparse.ArgumentParser(description="HumanEval subset pass@1 + baseline/DFlash greedy parity check.")
    p.add_argument("--base-url", default="http://localhost:8000")
    p.add_argument("--base-url-b", default="http://localhost:8001", help="DFlash endpoint for --parity.")
    p.add_argument("--model", default="laguna")
    p.add_argument("--n", type=int, default=25)
    p.add_argument("--max-tokens", type=int, default=512)
    p.add_argument("--no-exec", action="store_true", help="Skip code execution; dump completions only.")
    p.add_argument("--parity", action="store_true", help="Compare two endpoints' greedy outputs.")
    p.add_argument("--out", default=None)
    args = p.parse_args()
    if args.parity:
        run_parity(args)
    else:
        run_quality(args)


if __name__ == "__main__":
    main()