File size: 3,492 Bytes
b89e6d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""Phase 2: functional correctness eval (pass@k) on HumanEval / MBPP.

Unlike CodeBLEU (similarity), this measures whether generated code actually RUNS
and PASSES unit tests - the claim that carries a capstone defense.

generate_fn(intent) -> code is injected, so this is decoupled from the model and
unit-testable with a mock.
"""
from __future__ import annotations

import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Callable

import numpy as np

sys.path.append(str(Path(__file__).resolve().parents[2]))
from src.eval.sandbox import run_code  # noqa: E402


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Codex paper).

    Computes ``1 - C(n - c, k) / C(n, k)`` as a running product, which avoids
    forming large binomial coefficients.

    Args:
        n: Number of samples generated for the problem.
        c: Number of those samples that passed.
        k: Size of the draw being estimated.

    Returns:
        Estimated probability that at least one of ``k`` drawn samples passes.
        ``0.0`` whenever no sample passed, ``1.0`` whenever fewer than ``k``
        samples failed.
    """
    if c <= 0:
        # With no passing sample no draw can succeed. Without this guard the
        # `n - c < k` shortcut below would report 1.0 whenever k > n.
        return 0.0
    if n - c < k:
        # Fewer than k failures exist, so every k-draw contains a pass.
        return 1.0
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


# ---- per-benchmark program builders ------------------------------------
def humaneval_program(problem, candidate_code: str) -> str:
    """Build the runnable HumanEval script for one candidate solution.

    The script is the candidate source, then the problem's ``test`` source,
    then a ``check(<entry_point>)`` call, each separated by a blank line.
    """
    # The candidate is expected to contain the complete entry-point function,
    # so it is emitted verbatim ahead of the test harness.
    test_source = problem["test"]
    entry_point = problem["entry_point"]
    template = "{}\n\n{}\n\ncheck({})\n"
    return template.format(candidate_code, test_source, entry_point)


def mbpp_program(problem, candidate_code: str) -> str:
    """Build the runnable MBPP script for one candidate solution.

    The script is the candidate source, a blank line, the optional
    ``test_setup_code``, and then every entry of ``test_list`` on its own line.
    """
    setup_code = problem.get("test_setup_code", "")
    if not setup_code:
        # A missing, None or empty setup contributes an empty line only.
        setup_code = ""
    assertions = "\n".join(problem.get("test_list", []))
    return "{}\n\n{}\n{}\n".format(candidate_code, setup_code, assertions)


# Benchmark registry, keyed by the name passed to evaluate().
#   hf_id      - candidate dataset IDs, tried in order: newer `datasets`
#                versions require the namespaced form, older ones or mirrors
#                may still use the bare id.
#   split      - dataset split to load.
#   intent_col - column whose value is handed to generate_fn.
#   program    - builder turning (problem, candidate_code) into a script.
_BENCH = {
    "humaneval": {
        "hf_id": ("openai/openai_humaneval", "openai_humaneval"),
        "split": "test",
        "intent_col": "prompt",
        "program": humaneval_program,
    },
    "mbpp": {
        "hf_id": ("google-research-datasets/mbpp", "mbpp"),
        "split": "test",
        "intent_col": "text",
        "program": mbpp_program,
    },
}


@dataclass
class EvalResult:
    """Aggregate scores of one ``evaluate`` run over a benchmark."""
    benchmark: str  # benchmark key the run was evaluated on
    n_problems: int  # number of problems actually scored
    n_samples: int  # candidates generated per problem
    pass_at_1: float  # mean per-problem pass@1, rounded to 4 decimals
    pass_at_k: float  # mean per-problem pass@k, rounded to 4 decimals
    k: int  # the k used for pass_at_k


def evaluate(
    generate_fn: Callable[[str], str],
    benchmark: str = "humaneval",
    limit: int | None = 20,
    n_samples: int = 1,
    k: int = 1,
    timeout: float = 8.0,
) -> EvalResult:
    """Run a pass@k evaluation of ``generate_fn`` on one benchmark.

    For every problem, ``generate_fn`` is called ``n_samples`` times with the
    problem's intent text; each candidate is wrapped into a test program and
    executed through ``run_code``. Use ``n_samples > 1`` together with a
    sampling ``generate_fn`` for ``k > 1``.

    Args:
        generate_fn: Maps an intent string to candidate source code.
        benchmark: Key into ``_BENCH`` (an unknown key raises ``KeyError``).
        limit: Evaluate only the first ``limit`` problems; ``None`` or ``0``
            evaluates the whole split.
        n_samples: Candidates generated per problem; must be at least 1.
        k: The k of pass@k; must satisfy ``1 <= k <= n_samples``.
        timeout: Forwarded to ``run_code`` as its ``timeout`` argument.

    Returns:
        An ``EvalResult`` with mean pass@1 and pass@k rounded to 4 decimals
        (both ``0.0`` when no problem was evaluated).

    Raises:
        ValueError: If ``n_samples`` or ``k`` is out of range.
        RuntimeError: If none of the benchmark's dataset IDs could be loaded;
            the last loading error is attached as ``__cause__``.
    """
    # Fail fast, before importing `datasets` or downloading anything: pass@k
    # is only defined when at least k samples are drawn per problem.
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must satisfy 1 <= k <= n_samples, got k={k}, n_samples={n_samples}"
        )

    from datasets import load_dataset

    spec = _BENCH[benchmark]
    hf_ids = spec["hf_id"]
    ds = None
    last_error = None
    for hf_id in hf_ids:
        try:
            # NOTE(review): trust_remote_code=True opts in to running code
            # shipped with the dataset repository - confirm it is still needed.
            ds = load_dataset(hf_id, split=spec["split"], trust_remote_code=True)
        except Exception as err:  # noqa: BLE001
            # Keep the failure so the final error can explain what went wrong.
            last_error = err
        else:
            break
    if ds is None:
        raise RuntimeError(
            f"Could not load benchmark '{benchmark}' from any of {hf_ids}. "
            "Check your HuggingFace token and dataset availability."
        ) from last_error
    if limit:
        ds = ds.select(range(min(limit, len(ds))))

    build_program = spec["program"]
    intent_col = spec["intent_col"]
    p1_scores, pk_scores = [], []
    for problem in ds:
        intent = problem[intent_col]
        correct = 0
        for _ in range(n_samples):
            code = generate_fn(intent)
            program = build_program(problem, code)
            # A sample counts as correct when the run result's `.ok` is truthy.
            if run_code(program, timeout=timeout).ok:
                correct += 1
        p1_scores.append(pass_at_k(n_samples, correct, 1))
        pk_scores.append(pass_at_k(n_samples, correct, k))

    # np.mean([]) would yield NaN plus a RuntimeWarning; report 0.0 instead.
    mean_p1 = float(np.mean(p1_scores)) if p1_scores else 0.0
    mean_pk = float(np.mean(pk_scores)) if pk_scores else 0.0
    return EvalResult(
        benchmark=benchmark, n_problems=len(ds), n_samples=n_samples,
        pass_at_1=round(mean_p1, 4),
        pass_at_k=round(mean_pk, 4), k=k,
    )