File size: 5,712 Bytes
#!/usr/bin/env python3
"""
humaneval_subset.py — a 20-30 problem pass@1 check against an OpenAI-compatible
endpoint. Purpose on the GPU host: PROVE the DFlash run produces the SAME quality
as the baseline (and ideally the same greedy text), so "lossless" isn't just a
claim — it's a measured parity check.

Two modes:
  1. Quality: run pass@1 on a HumanEval subset and print the score.
  2. Parity:  run greedy on both endpoints and report whether the returned texts are identical.

This loads HumanEval via `datasets` (`openai/openai_humaneval` by default; set
HUMANEVAL_DATASET to override). On the Mac you can dry-run
the harness against a tiny local server; the real numbers come from Laguna on PI.

SAFETY: this executes model-generated code to grade pass@1. Run ONLY in the
disposable isolated sandbox / container, never on your laptop with real data.
A --no-exec flag skips execution and just dumps completions for manual review.

Usage:
  python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \
      --n 25 --out results/humaneval_dflash.json
  # parity check:
  python evals/humaneval_subset.py --parity \
      --base-url http://localhost:8000 --base-url-b http://localhost:8001 \
      --model laguna --n 25
"""
from __future__ import annotations

import argparse
import json
import os
import signal
import urllib.request
from contextlib import contextmanager


def load_problems(n: int):
    """Return the first ``n`` rows of the HumanEval "test" split as a list.

    The dataset id comes from the ``HUMANEVAL_DATASET`` environment variable
    and falls back to ``openai/openai_humaneval``. datasets >= 3 requires a
    namespaced repo id (the bare legacy "openai_humaneval" name now raises),
    so the override is there for GPU images that pin a different datasets
    version or mirror.

    ``n`` is capped at the number of rows available in the split.
    """
    # Imported here rather than at module top, so merely importing this file
    # does not require `datasets` to be installed.
    from datasets import load_dataset

    repo_id = os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
    test_split = load_dataset(repo_id, split="test")
    return [test_split[idx] for idx in range(min(n, len(test_split)))]


def complete(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """POST ``prompt`` to ``<base_url>/v1/completions`` and return the first choice's text.

    Args:
        base_url: Endpoint root; a trailing slash is stripped before the path is appended.
        model: Value sent as the ``model`` field of the request.
        prompt: Text to complete.
        max_tokens: Value sent as the ``max_tokens`` field of the request.

    Returns:
        ``choices[0]["text"]`` from the decoded JSON response.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on transport/HTTP failure, and
        ``KeyError``/``IndexError`` if the response lacks the expected fields.
    """
    # Generation is cut at the first line that starts a new top-level construct.
    stop_sequences = ["\nclass ", "\ndef ", "\n#", "\nif __name__"]
    body = json.dumps({
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        # greedy => deterministic => lossless-comparable
        "temperature": 0.0,
        "stop": stop_sequences,
    }).encode()
    request = urllib.request.Request(
        f"{base_url.rstrip('/')}/v1/completions",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=600) as response:
        raw = response.read()
    first_choice = json.loads(raw.decode())["choices"][0]
    return first_choice["text"]


@contextmanager
def time_limit(seconds: float):
    """Raise ``TimeoutError`` inside the ``with`` body if it runs longer than ``seconds``.

    Implemented with ``SIGALRM`` and the real-time interval timer, so it is only
    usable on Unix and from the main thread (``signal.signal`` refuses to install
    handlers elsewhere). Any alarm/timer that was already pending is replaced.

    Args:
        seconds: Time budget. Integers and fractional values are both accepted;
            ``0`` arms no timer at all.

    Raises:
        TimeoutError: From within the body, when the budget is exhausted.
    """
    def handler(signum, frame):
        raise TimeoutError("timed out")

    # Remember whatever handler was installed so it can be put back on exit;
    # otherwise this context manager permanently hijacks SIGALRM for the process.
    previous = signal.signal(signal.SIGALRM, handler)
    try:
        signal.setitimer(signal.ITIMER_REAL, seconds)
        yield
    finally:
        # Disarm first so the timer cannot fire while handlers are being swapped.
        signal.setitimer(signal.ITIMER_REAL, 0)
        # getsignal/signal report None for handlers not installed from Python;
        # those cannot be re-installed through signal.signal, so leave them be.
        if previous is not None:
            signal.signal(signal.SIGALRM, previous)


def passes(problem: dict, completion: str) -> bool:
    """Execute prompt + completion + tests and report whether they ran without error.

    The program is assembled from ``problem["prompt"]``, the model's
    ``completion``, ``problem["test"]`` and a final ``check(<entry_point>)``
    call, then executed in a fresh namespace under an 8-second time limit.

    SECURITY: this runs model-generated code via ``exec``. It must only be used
    inside a disposable sandbox/container.

    Args:
        problem: Dataset row; the keys ``prompt``, ``test`` and ``entry_point`` are read.
        completion: Model output appended directly after the prompt.

    Returns:
        ``True`` if the whole program ran to completion; ``False`` if it raised
        anything (syntax error, failed assertion, timeout, ...) or tried to exit
        the interpreter.

    Raises:
        KeyError: If ``problem`` lacks one of the required keys (raised before
            any code is executed).
    """
    program = (
        problem["prompt"] + completion + "\n" + problem["test"]
        + f"\ncheck({problem['entry_point']})\n"
    )
    try:
        with time_limit(8):
            ns: dict = {}
            exec(program, ns)  # noqa: S102 — sandbox only
    except (Exception, SystemExit):
        # SystemExit derives from BaseException, not Exception: without naming it
        # here, a completion that calls exit()/sys.exit() would terminate the whole
        # evaluation run instead of counting as one failed problem.
        # KeyboardInterrupt is deliberately left alone so Ctrl-C still stops the run.
        return False
    return True


def run_quality(args) -> None:
    """Score pass@1 over the first ``args.n`` problems and print/write a report.

    For every problem a completion is requested from ``args.base_url``. Unless
    ``args.no_exec`` is set, the completion is executed against the problem's
    tests; with ``args.no_exec`` nothing is executed, the problem is recorded
    as not passed and the progress line shows ``?``.

    A JSON summary (everything except the per-problem results) is printed to
    stdout. When ``args.out`` is set, the full report including per-problem
    completions is written there, creating the parent directory if needed.
    """
    problems = load_problems(args.n)
    total = len(problems)
    records = []
    passed_count = 0

    for position, problem in enumerate(problems, start=1):
        completion = complete(args.base_url, args.model, problem["prompt"], args.max_tokens)
        if args.no_exec:
            passed = False
            label = "?"
        else:
            passed = passes(problem, completion)
            label = "PASS" if passed else "fail"
        passed_count += int(passed)
        records.append({"task_id": problem["task_id"], "passed": passed, "completion": completion})
        print(f"  [{position}/{total}] {problem['task_id']}: {label}")

    # Guard the division: an empty problem list scores 0.0 rather than raising.
    score = passed_count / total if problems else 0.0
    summary = {
        "model": args.model,
        "base_url": args.base_url,
        "n": total,
        "pass_at_1": score,
        "no_exec": args.no_exec,
    }
    print(json.dumps(summary, indent=2))

    if not args.out:
        return
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
    with open(args.out, "w") as f:
        json.dump({**summary, "results": records}, f, indent=2)
    print(f"[humaneval] wrote {args.out}  pass@1={score:.3f}")


def run_parity(args) -> None:
    """Compare greedy completions from endpoint A (``base_url``) and B (``base_url_b``).

    Each of the first ``args.n`` prompts is sent to both endpoints with the same
    model name and token budget, and the two returned text strings are compared
    for exact equality. A per-problem IDENTICAL/MISMATCH line is printed,
    followed by a JSON summary whose ``lossless`` field is true only when no
    pair differed.
    """
    problems = load_problems(args.n)
    total = len(problems)
    identical = 0

    for position, problem in enumerate(problems, start=1):
        prompt = problem["prompt"]
        baseline = complete(args.base_url, args.model, prompt, args.max_tokens)
        candidate = complete(args.base_url_b, args.model, prompt, args.max_tokens)
        if baseline == candidate:
            identical += 1
            verdict = "IDENTICAL"
        else:
            verdict = "MISMATCH"
        print(f"  [{position}/{total}] {problem['task_id']}: {verdict}")

    mismatches = total - identical
    report = {
        "parity_pairs": total,
        "identical": identical,
        "mismatches": mismatches,
        "lossless": mismatches == 0,
    }
    print(json.dumps(report, indent=2))


def main() -> None:
    """Parse the command line and run the parity check or the pass@1 evaluation.

    ``--parity`` selects the two-endpoint comparison; otherwise the quality
    (pass@1) run is performed. All parsed options are handed to the selected
    runner unchanged.
    """
    parser = argparse.ArgumentParser(
        description="HumanEval subset pass@1 + baseline/DFlash greedy parity check.",
    )
    # (flag, add_argument keyword arguments), registered in this order.
    flag_specs = (
        ("--base-url", {"default": "http://localhost:8000"}),
        ("--base-url-b", {"default": "http://localhost:8001",
                          "help": "DFlash endpoint for --parity."}),
        ("--model", {"default": "laguna"}),
        ("--n", {"type": int, "default": 25}),
        ("--max-tokens", {"type": int, "default": 512}),
        ("--no-exec", {"action": "store_true",
                       "help": "Skip code execution; dump completions only."}),
        ("--parity", {"action": "store_true",
                      "help": "Compare two endpoints' greedy outputs."}),
        ("--out", {"default": None}),
    )
    for flag, options in flag_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    runner = run_parity if args.parity else run_quality
    runner(args)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()