| |
| """ |
| humaneval_subset.py — a 20-30 problem pass@1 check against an OpenAI-compatible |
| endpoint. Purpose on the GPU host: PROVE the DFlash run produces the SAME quality |
| as the baseline (and ideally the same greedy text), so "lossless" isn't just a |
| claim — it's a measured parity check. |
| |
| Two modes: |
| 1. Quality: run pass@1 on a HumanEval subset and print the score. |
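| 2. Parity: run greedy decoding on both endpoints and report whether the returned completion texts are identical (exact text equality is used as the stand-in for token-identical output). |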
| |
| This loads HumanEval via `datasets` (openai_humaneval). On the Mac you can dry-run |
| the harness against a tiny local server; the real numbers come from Laguna on PI. |
| |
| SAFETY: this executes model-generated code to grade pass@1. Run ONLY in the |
| disposable isolated sandbox / container, never on your laptop with real data. |
| A --no-exec flag skips execution and just dumps completions for manual review. |
| |
| Usage: |
| python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \ |
| --n 25 --out results/humaneval_dflash.json |
| # parity check: |
| python evals/humaneval_subset.py --parity \ |
| --base-url http://localhost:8000 --base-url-b http://localhost:8001 \ |
| --model laguna --n 25 |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import signal |
| import urllib.request |
| from contextlib import contextmanager |
|
|
|
|
def load_problems(n: int):
    """Return the first ``n`` records of the HumanEval ``test`` split.

    The dataset id defaults to ``openai/openai_humaneval`` and can be
    overridden with the ``HUMANEVAL_DATASET`` environment variable. When
    ``n`` is larger than the split, every record in the split is returned.

    Args:
        n: Maximum number of problems to load.

    Returns:
        A list of dataset records, in dataset order.
    """
    # Imported here rather than at file level, so merely importing this
    # module does not require the third-party ``datasets`` package.
    from datasets import load_dataset

    source = os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
    test_split = load_dataset(source, split="test")
    limit = min(n, len(test_split))
    return [test_split[idx] for idx in range(limit)]
|
|
|
|
def complete(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request a single greedy completion and return its text.

    Sends a JSON POST to ``<base_url>/v1/completions`` with temperature 0.0
    and a fixed list of stop sequences, then returns the ``text`` field of
    the first entry in the response's ``choices`` list.

    Args:
        base_url: Server root; a trailing slash is tolerated.
        model: Model name placed in the request payload.
        prompt: Prompt text to complete.
        max_tokens: Generation cap placed in the request payload.

    Returns:
        The completion text of the first choice.

    Raises:
        KeyError, IndexError: If the response JSON has no ``choices[0]["text"]``.
        Errors raised by ``urllib`` (connection failure, HTTP error status,
        the 600 second timeout) are not caught here.
    """
    endpoint = f"{base_url.rstrip('/')}/v1/completions"
    body = json.dumps(
        {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": 0.0,
            "stop": ["\nclass ", "\ndef ", "\n#", "\nif __name__"],
        }
    ).encode()
    request = urllib.request.Request(
        endpoint,
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=600) as response:
        raw = response.read()
    reply = json.loads(raw.decode())
    first_choice = reply["choices"][0]
    return first_choice["text"]
|
|
|
|
@contextmanager
def time_limit(seconds: int):
    """Raise ``TimeoutError`` inside the ``with`` body after ``seconds`` seconds.

    Built on ``SIGALRM``, so it is usable only on platforms that provide that
    signal and only from the main thread (``signal.signal`` refuses to run
    elsewhere). On exit the pending alarm is cancelled and the SIGALRM
    handler that was active on entry is put back, so this context manager
    does not leave its own handler installed for the rest of the process.

    Args:
        seconds: Whole seconds until the alarm fires. ``signal.alarm(0)``
            schedules nothing, so 0 means "no limit".

    Raises:
        TimeoutError: Raised in the ``with`` body when the alarm fires.
    """

    def _on_alarm(signum, frame):
        raise TimeoutError("timed out")

    previous = signal.signal(signal.SIGALRM, _on_alarm)
    try:
        # Armed inside the try so the handler is restored even if arming fails.
        signal.alarm(seconds)
        yield
    finally:
        signal.alarm(0)
        # ``previous`` is None when the earlier handler was not installed from
        # Python; signal.signal() cannot re-install that, so leave it alone.
        if previous is not None:
            signal.signal(signal.SIGALRM, previous)
|
|
|
|
def passes(problem: dict, completion: str) -> bool:
    """Return True if the completed function passes the problem's tests.

    The program that gets executed is ``problem["prompt"]`` followed by
    ``completion``, then ``problem["test"]``, then a final
    ``check(<entry_point>)`` call. It runs via ``exec`` in a fresh namespace
    under an 8 second time limit.

    SECURITY: this executes model-generated code with ``exec`` inside the
    current process. Only call it in a disposable, isolated environment.

    Args:
        problem: Record providing the ``prompt``, ``test`` and
            ``entry_point`` keys.
        completion: Model output appended to the prompt.

    Returns:
        True when the program runs to completion without raising; False when
        it raises, times out, or tries to exit the interpreter.

    Raises:
        KeyError: If ``problem`` lacks one of the required keys.
    """
    program = (
        problem["prompt"]
        + completion
        + "\n"
        + problem["test"]
        + f"\ncheck({problem['entry_point']})\n"
    )
    namespace: dict = {}
    try:
        with time_limit(8):
            exec(program, namespace)
    except (Exception, SystemExit):
        # SystemExit is not an Exception subclass: generated code calling
        # exit()/sys.exit() would otherwise terminate the whole evaluation
        # run instead of counting as one failed problem. KeyboardInterrupt is
        # deliberately left to propagate so Ctrl-C still stops the harness.
        return False
    return True
|
|
|
|
def run_quality(args) -> None:
    """Score pass@1 on the first ``args.n`` problems and report the result.

    For each problem one completion is requested from ``args.base_url``.
    Unless ``args.no_exec`` is set, the completion is graded with
    ``passes``; with ``args.no_exec`` nothing is executed, every problem is
    recorded as not passed and shown as ``?`` in the progress output. A
    summary (without the per-problem results) is printed as JSON, and the
    full report is written to ``args.out`` when that option is given.
    """
    problems = load_problems(args.n)
    total = len(problems)
    records = []
    passed_count = 0

    for position, problem in enumerate(problems, start=1):
        completion = complete(args.base_url, args.model, problem["prompt"], args.max_tokens)
        if args.no_exec:
            passed = False
            label = "?"
        else:
            passed = passes(problem, completion)
            label = "PASS" if passed else "fail"
        passed_count += int(passed)
        records.append(
            {"task_id": problem["task_id"], "passed": passed, "completion": completion}
        )
        print(f" [{position}/{total}] {problem['task_id']}: {label}")

    # Guard against an empty problem list to avoid dividing by zero.
    score = passed_count / total if problems else 0.0

    summary = {
        "model": args.model,
        "base_url": args.base_url,
        "n": total,
        "pass_at_1": score,
        "no_exec": args.no_exec,
    }
    print(json.dumps(summary, indent=2))

    if not args.out:
        return

    # The file report is the summary plus the per-problem records, last.
    report = {**summary, "results": records}
    parent_dir = os.path.dirname(args.out) or "."
    os.makedirs(parent_dir, exist_ok=True)
    with open(args.out, "w") as handle:
        json.dump(report, handle, indent=2)
    print(f"[humaneval] wrote {args.out} pass@1={score:.3f}")
|
|
|
|
def run_parity(args) -> None:
    """Compare greedy completions from the baseline (A) and DFlash (B) endpoints.

    Each of the first ``args.n`` problems is sent to ``args.base_url`` and to
    ``args.base_url_b`` with the same model name and token cap. The two
    returned completion strings are compared for exact equality; the
    endpoints return text, so string equality is what stands in for
    token-identical output here. A per-problem verdict and a JSON summary
    are printed; ``lossless`` is true only when no pair differs.
    """
    problems = load_problems(args.n)
    total = len(problems)
    differing = 0

    for position, problem in enumerate(problems, start=1):
        prompt = problem["prompt"]
        baseline_text = complete(args.base_url, args.model, prompt, args.max_tokens)
        dflash_text = complete(args.base_url_b, args.model, prompt, args.max_tokens)
        if baseline_text == dflash_text:
            verdict = "IDENTICAL"
        else:
            verdict = "MISMATCH"
            differing += 1
        print(f" [{position}/{total}] {problem['task_id']}: {verdict}")

    summary = {
        "parity_pairs": total,
        "identical": total - differing,
        "mismatches": differing,
        "lossless": differing == 0,
    }
    print(json.dumps(summary, indent=2))
|
|
|
|
def main() -> None:
    """Parse the command line and run either the parity or the quality mode."""
    parser = argparse.ArgumentParser(
        description="HumanEval subset pass@1 + baseline/DFlash greedy parity check."
    )
    # Declared as a table and registered in order, so --help lists the options
    # in the same sequence as they appear here.
    option_specs = (
        ("--base-url", {"default": "http://localhost:8000"}),
        ("--base-url-b", {"default": "http://localhost:8001",
                          "help": "DFlash endpoint for --parity."}),
        ("--model", {"default": "laguna"}),
        ("--n", {"type": int, "default": 25}),
        ("--max-tokens", {"type": int, "default": 512}),
        ("--no-exec", {"action": "store_true",
                       "help": "Skip code execution; dump completions only."}),
        ("--parity", {"action": "store_true",
                      "help": "Compare two endpoints' greedy outputs."}),
        ("--out", {"default": None}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    runner = run_parity if args.parity else run_quality
    runner(args)
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script; importing it as a module does not start an evaluation.
if __name__ == "__main__":
    main()
|
|