File size: 5,712 Bytes
8612587
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
"""
humaneval_subset.py — a 20-30 problem pass@1 check against an OpenAI-compatible
endpoint. Purpose on the GPU host: PROVE the DFlash run produces the SAME quality
as the baseline (and ideally the same greedy text), so "lossless" isn't just a
claim — it's a measured parity check.

Two modes:
  1. Quality: run pass@1 on a HumanEval subset and print the score.
  2. Parity:  run greedy on both endpoints and report whether the returned texts are identical.

This loads HumanEval via `datasets` (openai_humaneval). On the Mac you can dry-run
the harness against a tiny local server; the real numbers come from Laguna on PI.

SAFETY: this executes model-generated code to grade pass@1. Run ONLY in the
disposable isolated sandbox / container, never on your laptop with real data.
A --no-exec flag skips execution and just dumps completions for manual review.

Usage:
  python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \
      --n 25 --out results/humaneval_dflash.json
  # parity check:
  python evals/humaneval_subset.py --parity \
      --base-url http://localhost:8000 --base-url-b http://localhost:8001 \
      --model laguna --n 25
"""
from __future__ import annotations

import argparse
import json
import os
import signal
import urllib.request
from contextlib import contextmanager


def load_problems(n: int):
    """Return the first ``n`` rows of the HumanEval ``test`` split as a list.

    The dataset id comes from the ``HUMANEVAL_DATASET`` environment variable,
    falling back to ``openai/openai_humaneval``. ``n`` is capped at the number
    of rows in the split.
    """
    # datasets >= 3 requires a namespaced repo id; the bare "openai_humaneval"
    # legacy name now raises. HUMANEVAL_DATASET is the override for GPU images
    # that pin a different datasets version / mirror.
    # Imported lazily so the module can be loaded without `datasets` installed.
    from datasets import load_dataset

    repo_id = os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
    split = load_dataset(repo_id, split="test")
    count = min(n, len(split))
    selected = []
    for idx in range(count):
        selected.append(split[idx])
    return selected


def complete(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request one greedy completion and return the text of the first choice.

    Sends a JSON body to ``<base_url>/v1/completions`` (trailing slashes on
    ``base_url`` are stripped first) and reads ``choices[0].text`` from the
    JSON reply. Nothing is caught here: errors from the request, from JSON
    decoding, or from a reply without that field propagate to the caller.
    """
    endpoint = f"{base_url.rstrip('/')}/v1/completions"
    body = json.dumps(
        {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            # greedy => deterministic => lossless-comparable
            "temperature": 0.0,
            "stop": ["\nclass ", "\ndef ", "\n#", "\nif __name__"],
        }
    ).encode()
    request = urllib.request.Request(
        endpoint,
        data=body,
        headers={"Content-Type": "application/json"},
    )
    # The 600 s timeout is passed straight to urlopen.
    with urllib.request.urlopen(request, timeout=600) as response:
        raw = response.read()
    reply = json.loads(raw.decode())
    first_choice = reply["choices"][0]
    return first_choice["text"]


@contextmanager
def time_limit(seconds: float):
    """Raise ``TimeoutError`` inside the ``with`` body if it runs too long.

    Args:
        seconds: Wall-clock budget for the body. Whole or fractional seconds
            are accepted; ``0`` arms no timer at all.

    Raises:
        TimeoutError: raised from the SIGALRM handler when the timer expires
            while the body is still running.

    On exit (normal or exceptional) the timer is cancelled and the SIGALRM
    handler that was installed before entry is put back, so using this context
    manager does not leave its own handler behind.

    NOTE: relies on ``signal.SIGALRM`` / ``signal.setitimer``, which the Python
    docs describe as Unix-only and usable from the main thread only.
    """
    def handler(signum, frame):
        raise TimeoutError("timed out")

    # signal.signal returns the handler it replaces; keep it for restoration.
    previous = signal.signal(signal.SIGALRM, handler)
    # ITIMER_REAL delivers SIGALRM like alarm() but accepts fractional seconds.
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        # Cancel first so a late alarm can never hit the restored handler.
        signal.setitimer(signal.ITIMER_REAL, 0)
        # `previous` is None when the old handler was not installed from
        # Python; signal.signal() rejects None, so leave ours in that case.
        if previous is not None:
            signal.signal(signal.SIGALRM, previous)


def passes(problem: dict, completion: str) -> bool:
    """Execute a completion against its problem's tests and report success.

    The program that is run is ``problem["prompt"] + completion``, then
    ``problem["test"]``, then a call ``check(<entry_point>)``. It is executed
    with ``exec`` in a fresh namespace under an 8-second ``time_limit``.

    Args:
        problem: mapping providing the ``prompt``, ``test`` and ``entry_point``
            keys read below.
        completion: model-generated text appended to the prompt.

    Returns:
        ``True`` if the program ran to completion without raising; ``False``
        if it raised any ``Exception`` (including the ``TimeoutError`` from
        ``time_limit`` and ``SyntaxError`` from unparsable code) or tried to
        terminate the interpreter via ``SystemExit``.

    SECURITY: this runs untrusted, model-generated code in-process. Only call
    it inside a disposable sandbox.
    """
    program = problem["prompt"] + completion + "\n" + problem["test"] + \
        f"\ncheck({problem['entry_point']})\n"
    try:
        with time_limit(8):
            ns: dict = {}
            exec(program, ns)  # noqa: S102 — sandbox only
    except (Exception, SystemExit):
        # SystemExit derives from BaseException, not Exception. Without
        # catching it, a completion that calls exit()/sys.exit() would end the
        # whole evaluation run instead of counting as one failed problem.
        # KeyboardInterrupt is deliberately left uncaught so Ctrl-C still works.
        return False
    return True


def run_quality(args) -> None:
    """Score pass@1 over the loaded problems and report the result.

    For each problem a completion is requested from ``args.base_url``. Unless
    ``args.no_exec`` is set, the completion is graded with ``passes``. With
    ``args.no_exec`` every problem is recorded as not passed (shown as ``?``),
    so the reported score is 0.0. A summary is printed as JSON, and when
    ``args.out`` is given the full report (summary plus per-problem results)
    is written there, creating the parent directory if needed.
    """
    problems = load_problems(args.n)
    total = len(problems)
    records = []
    passed_count = 0
    for position, problem in enumerate(problems, start=1):
        completion = complete(args.base_url, args.model, problem["prompt"], args.max_tokens)
        if args.no_exec:
            passed = False
            label = "?"
        else:
            passed = passes(problem, completion)
            label = "PASS" if passed else "fail"
        passed_count += int(passed)
        records.append({"task_id": problem["task_id"], "passed": passed, "completion": completion})
        print(f"  [{position}/{total}] {problem['task_id']}: {label}")

    # Guard against dividing by zero when no problems were loaded.
    score = passed_count / total if total else 0.0
    summary = {
        "model": args.model,
        "base_url": args.base_url,
        "n": total,
        "pass_at_1": score,
        "no_exec": args.no_exec,
    }
    print(json.dumps(summary, indent=2))

    if not args.out:
        return
    report = {**summary, "results": records}
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
    with open(args.out, "w") as handle:
        json.dump(report, handle, indent=2)
    print(f"[humaneval] wrote {args.out}  pass@1={score:.3f}")


def run_parity(args) -> None:
    """Compare greedy completions from endpoint A and endpoint B per problem.

    Each prompt is sent to ``args.base_url`` (A) and ``args.base_url_b`` (B)
    with the same model and token budget, and the two returned strings are
    compared with ``==``. One line is printed per problem, followed by a JSON
    summary whose ``lossless`` field is true only when no pair differed.
    The outcome is only printed: nothing is returned or written to a file.
    """
    problems = load_problems(args.n)
    total = len(problems)
    identical = 0
    for position, problem in enumerate(problems, start=1):
        text_a = complete(args.base_url, args.model, problem["prompt"], args.max_tokens)
        text_b = complete(args.base_url_b, args.model, problem["prompt"], args.max_tokens)
        verdict = "IDENTICAL" if text_a == text_b else "MISMATCH"
        if verdict == "IDENTICAL":
            identical += 1
        print(f"  [{position}/{total}] {problem['task_id']}: {verdict}")

    mismatches = total - identical
    summary = {
        "parity_pairs": total,
        "identical": identical,
        "mismatches": mismatches,
        "lossless": mismatches == 0,
    }
    print(json.dumps(summary, indent=2))


def main() -> None:
    """Parse command-line options and run either the parity or quality mode.

    ``--parity`` selects ``run_parity``; otherwise ``run_quality`` runs. The
    parsed namespace is passed to the selected function unchanged.
    """
    parser = argparse.ArgumentParser(
        description="HumanEval subset pass@1 + baseline/DFlash greedy parity check."
    )
    add = parser.add_argument
    add("--base-url", default="http://localhost:8000")
    add("--base-url-b", default="http://localhost:8001", help="DFlash endpoint for --parity.")
    add("--model", default="laguna")
    add("--n", type=int, default=25)
    add("--max-tokens", type=int, default=512)
    add("--no-exec", action="store_true", help="Skip code execution; dump completions only.")
    add("--parity", action="store_true", help="Compare two endpoints' greedy outputs.")
    add("--out", default=None)
    options = parser.parse_args()

    # Only the selected branch is evaluated, so just one mode is looked up.
    mode = run_parity if options.parity else run_quality
    mode(options)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()