#!/usr/bin/env python3
"""
humaneval_subset.py — a 20-30 problem pass@1 check against an OpenAI-compatible
endpoint. Purpose on the GPU host: PROVE the DFlash run produces the SAME quality
as the baseline (and ideally the same greedy text), so "lossless" isn't just a
claim — it's a measured parity check.
Two modes:
1. Quality: run pass@1 on a HumanEval subset and print the score.
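2. Parity: run greedy on both endpoints and report whether their completion
   texts are identical (plain string comparison; the result is printed as a
   JSON summary and does not change the exit code).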
This loads HumanEval via `datasets` (openai_humaneval). On the Mac you can dry-run
the harness against a tiny local server; the real numbers come from Laguna on PI.
SAFETY: this executes model-generated code to grade pass@1. Run ONLY in the
disposable isolated sandbox / container, never on your laptop with real data.
A --no-exec flag skips execution and just dumps completions for manual review.
Usage:
    python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \
        --n 25 --out results/humaneval_dflash.json

    # parity check:
    python evals/humaneval_subset.py --parity \
        --base-url http://localhost:8000 --base-url-b http://localhost:8001 \
        --model laguna --n 25
"""
from __future__ import annotations
import argparse
import json
import os
import signal
import urllib.request
from contextlib import contextmanager
def load_problems(n: int):
    """Return the first *n* rows of the HumanEval ``test`` split.

    The dataset id is read from the ``HUMANEVAL_DATASET`` environment
    variable and falls back to ``"openai/openai_humaneval"``.

    Args:
        n: Maximum number of problems to return. It is capped at the size of
            the split; a value of zero or less yields an empty list.

    Returns:
        A list with one entry per problem, in dataset order. The rest of this
        file reads the ``"prompt"``, ``"test"``, ``"entry_point"`` and
        ``"task_id"`` fields of each entry.
    """
    # Imported lazily so the module can be imported without `datasets`.
    from datasets import load_dataset

    # datasets >= 3 requires a namespaced repo id; the bare "openai_humaneval"
    # legacy name now raises. Override with HUMANEVAL_DATASET if the GPU image
    # pins a different datasets version / mirror.
    repo_id = os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
    test_split = load_dataset(repo_id, split="test")

    limit = min(n, len(test_split))
    selected = []
    for index in range(limit):
        selected.append(test_split[index])
    return selected
def complete(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request one greedy completion from an OpenAI-compatible endpoint.

    Sends a JSON POST to ``<base_url>/v1/completions`` and returns the text of
    the first choice in the response.

    Args:
        base_url: Server root; any trailing ``/`` is stripped before the
            ``/v1/completions`` path is appended.
        model: Value sent as the ``model`` field.
        prompt: Value sent as the ``prompt`` field.
        max_tokens: Value sent as the ``max_tokens`` field.

    Returns:
        ``response["choices"][0]["text"]`` from the decoded JSON body.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on transport/HTTP failure,
        and ``KeyError``/``IndexError`` if the body lacks the expected fields.
    """
    endpoint = f"{base_url.rstrip('/')}/v1/completions"
    body = json.dumps(
        {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            # greedy => deterministic => lossless-comparable
            "temperature": 0.0,
            "stop": ["\nclass ", "\ndef ", "\n#", "\nif __name__"],
        }
    ).encode()
    request = urllib.request.Request(
        endpoint,
        data=body,
        headers={"Content-Type": "application/json"},
    )
    # Generous 600 s timeout: a single request is allowed to take minutes.
    with urllib.request.urlopen(request, timeout=600) as response:
        raw = response.read()
    reply = json.loads(raw.decode())
    return reply["choices"][0]["text"]
@contextmanager
def time_limit(seconds: int):
    """Raise ``TimeoutError`` in the enclosed block after *seconds* seconds.

    Implemented with ``SIGALRM``: a handler that raises ``TimeoutError`` is
    installed, an alarm is armed for the duration of the ``with`` body, and on
    exit the alarm is cancelled and the handler that was active before entry
    is put back, so the timeout handler does not leak into the rest of the
    process.

    Args:
        seconds: Whole seconds passed to ``signal.alarm``. Per the ``signal``
            documentation a value of 0 schedules no alarm, i.e. no limit.

    Raises:
        TimeoutError: Inside the ``with`` body when the alarm fires.

    Note:
        Relies on ``signal.signal``/``signal.alarm``, which the standard
        library documents as main-thread-only and Unix-only respectively.
        Any alarm already pending on entry is replaced and is not re-armed
        on exit.
    """

    def _on_alarm(signum, frame):
        raise TimeoutError("timed out")

    # signal.signal returns the handler it replaced; keep it for restoration.
    previous_handler = signal.signal(signal.SIGALRM, _on_alarm)
    try:
        # Armed inside the try so a failure here still restores the handler.
        signal.alarm(seconds)
        yield
    finally:
        # Cancel first so the alarm cannot fire while handlers are swapped.
        signal.alarm(0)
        # None means the old handler was not installed from Python and cannot
        # be re-installed through signal.signal; leave things as they are.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
def passes(problem: dict, completion: str) -> bool:
    """Execute a completion against its HumanEval test and report success.

    The program that is run is the problem prompt, the model completion, the
    problem's test code and a final ``check(<entry_point>)`` call, joined in
    that order. It is executed in a fresh namespace under an 8-second
    ``time_limit``.

    SECURITY: this ``exec``s model-generated code inside the current process.
    Only call it in a disposable sandbox.

    Args:
        problem: Mapping providing the ``"prompt"``, ``"test"`` and
            ``"entry_point"`` fields.
        completion: Model output appended directly after the prompt.

    Returns:
        ``True`` if the program ran to completion without raising; ``False``
        if it raised any ``Exception`` (failed assertion, syntax error, the
        ``TimeoutError`` from ``time_limit``, ...) or tried to exit the
        interpreter.

    Raises:
        KeyError: If *problem* lacks one of the required fields.
    """
    program = (
        problem["prompt"]
        + completion
        + "\n"
        + problem["test"]
        + f"\ncheck({problem['entry_point']})\n"
    )
    try:
        with time_limit(8):
            namespace: dict = {}
            exec(program, namespace)  # noqa: S102 — sandbox only
    except (Exception, SystemExit):
        # SystemExit derives from BaseException, not Exception: without it a
        # completion calling sys.exit()/exit() would terminate the whole
        # evaluation run instead of being graded as a failure.
        # KeyboardInterrupt is deliberately left alone so Ctrl-C still works.
        return False
    return True
def run_quality(args) -> None:
    """Run the pass@1 quality mode and print (optionally save) the results.

    For each loaded problem a completion is requested from ``args.base_url``
    and, unless ``args.no_exec`` is set, graded with ``passes``. A progress
    line is printed per problem, then a JSON summary. When ``args.out`` is
    set, the summary plus the per-problem results are written there as JSON.

    Args:
        args: Parsed CLI namespace; reads ``n``, ``base_url``, ``model``,
            ``max_tokens``, ``no_exec`` and ``out``.
    """
    problems = load_problems(args.n)
    total = len(problems)

    records = []
    for position, problem in enumerate(problems, start=1):
        completion = complete(args.base_url, args.model, problem["prompt"], args.max_tokens)
        if args.no_exec:
            # Nothing is executed, so the task is recorded as not passed and
            # shown with a "?" marker.
            passed = False
            label = "?"
        else:
            passed = passes(problem, completion)
            label = "PASS" if passed else "fail"
        records.append({"task_id": problem["task_id"], "passed": passed, "completion": completion})
        print(f" [{position}/{total}] {problem['task_id']}: {label}")

    passed_count = sum(int(record["passed"]) for record in records)
    # Guard the division for an empty problem list.
    score = passed_count / total if problems else 0.0

    summary = {
        "model": args.model,
        "base_url": args.base_url,
        "n": total,
        "pass_at_1": score,
        "no_exec": args.no_exec,
    }
    # The console only gets the summary; completions go to the file.
    print(json.dumps(summary, indent=2))

    if not args.out:
        return

    report = {**summary, "results": records}
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
    with open(args.out, "w") as handle:
        json.dump(report, handle, indent=2)
    print(f"[humaneval] wrote {args.out} pass@1={score:.3f}")
def run_parity(args) -> None:
    """Compare greedy completions from baseline (A) and DFlash (B) endpoints.

    Every loaded problem's prompt is sent to ``args.base_url`` and to
    ``args.base_url_b`` with the same model and token budget, and the two
    returned completion strings are compared for exact equality. A progress
    line is printed per problem, followed by a JSON summary.

    The summary's ``"lossless"`` flag is true only when at least one pair was
    compared and none differed; an empty run (for example ``--n 0``) proves
    nothing and is therefore reported as not lossless.

    Args:
        args: Parsed CLI namespace; reads ``n``, ``base_url``, ``base_url_b``,
            ``model`` and ``max_tokens``.
    """
    problems = load_problems(args.n)
    total = len(problems)

    mismatches = 0
    for position, problem in enumerate(problems, start=1):
        prompt = problem["prompt"]
        baseline = complete(args.base_url, args.model, prompt, args.max_tokens)
        candidate = complete(args.base_url_b, args.model, prompt, args.max_tokens)
        identical = baseline == candidate
        if not identical:
            mismatches += 1
        print(f" [{position}/{total}] {problem['task_id']}: {'IDENTICAL' if identical else 'MISMATCH'}")

    summary = {
        "parity_pairs": total,
        "identical": total - mismatches,
        "mismatches": mismatches,
        # Zero compared pairs must not be reported as a successful parity check.
        "lossless": total > 0 and mismatches == 0,
    }
    print(json.dumps(summary, indent=2))
def main() -> None:
    """Parse the command line and run either the parity or the quality mode.

    ``--parity`` selects ``run_parity``; otherwise ``run_quality`` is run. The
    parsed namespace is handed to the selected function unchanged.
    """
    parser = argparse.ArgumentParser(
        description="HumanEval subset pass@1 + baseline/DFlash greedy parity check."
    )

    # (flag, add_argument keyword options), registered in this order.
    flag_specs = (
        ("--base-url", {"default": "http://localhost:8000"}),
        ("--base-url-b", {"default": "http://localhost:8001", "help": "DFlash endpoint for --parity."}),
        ("--model", {"default": "laguna"}),
        ("--n", {"type": int, "default": 25}),
        ("--max-tokens", {"type": int, "default": 512}),
        ("--no-exec", {"action": "store_true", "help": "Skip code execution; dump completions only."}),
        ("--parity", {"action": "store_true", "help": "Compare two endpoints' greedy outputs."}),
        ("--out", {"default": None}),
    )
    for flag, options in flag_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    runner = run_parity if args.parity else run_quality
    runner(args)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|