code-gen-assistant / src /eval /functional_eval.py
Rushabh147's picture
Initial deploy to HF Spaces (clean history, LFS for all binaries)
b89e6d6
Raw
History Blame Contribute Delete
3.49 kB
"""Phase 2: functional correctness eval (pass@k) on HumanEval / MBPP.
Unlike CodeBLEU (similarity), this measures whether generated code actually RUNS
and PASSES unit tests - the claim that carries a capstone defense.
generate_fn(intent) -> code is injected, so this is decoupled from the model and
unit-testable with a mock.
"""
from __future__ import annotations
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Callable
import numpy as np
sys.path.append(str(Path(__file__).resolve().parents[2]))
from src.eval.sandbox import run_code # noqa: E402
def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Codex paper).

    Computes ``1 - C(n - c, k) / C(n, k)``: the probability that at least one
    of ``k`` samples drawn without replacement from ``n`` generated samples
    (``c`` of which are correct) passes.

    Args:
        n: Total number of samples generated for the problem.
        c: Number of those samples that passed.
        k: Size of the draw being estimated; must satisfy ``1 <= k <= n``.

    Returns:
        The pass@k estimate in ``[0.0, 1.0]``.

    Raises:
        ValueError: If ``c`` is outside ``[0, n]`` or ``k`` is outside
            ``[1, n]``. The estimator is undefined for ``k > n``.
    """
    if not 0 <= c <= n:
        raise ValueError(f"c must satisfy 0 <= c <= n, got c={c}, n={n}")
    # Without this check, k > n makes `n - c < k` true for every c, so a
    # problem with zero passing samples would be scored as 1.0.
    if not 1 <= k <= n:
        raise ValueError(f"k must satisfy 1 <= k <= n, got k={k}, n={n}")

    # Fewer than k failing samples: every draw of size k contains a pass.
    if n - c < k:
        return 1.0
    # Product form of C(n - c, k) / C(n, k); avoids large binomials.
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
# ---- per-benchmark program builders ------------------------------------
def humaneval_program(problem, candidate_code: str) -> str:
    """Assemble the script that checks one HumanEval candidate.

    The script is the candidate code, then the problem's ``test`` source, then
    a call to ``check`` on the problem's ``entry_point``. The candidate is
    expected to define the complete entry-point function itself.
    """
    test_source = problem["test"]
    invocation = f"check({problem['entry_point']})"
    return f"{candidate_code}\n\n{test_source}\n\n{invocation}\n"
def mbpp_program(problem, candidate_code: str) -> str:
    """Assemble the script that checks one MBPP candidate.

    The script is the candidate code, then the optional ``test_setup_code``,
    then every entry of ``test_list`` on its own line.
    """
    setup_code = problem.get("test_setup_code", "")
    if not setup_code:
        # A missing, None or empty setup contributes an empty line.
        setup_code = ""
    test_lines = problem.get("test_list", [])
    joined_tests = "\n".join(test_lines)
    return f"{candidate_code}\n\n{setup_code}\n{joined_tests}\n"
# Registry of supported benchmarks. Each entry carries:
#   hf_id      - candidate hub ids, tried in order (newer `datasets` versions
#                require the namespaced form; older ones or mirrors may still
#                use the bare id)
#   split      - dataset split to evaluate on
#   intent_col - column whose value is passed to generate_fn
#   program    - builder turning (problem, candidate_code) into a runnable script
_BENCH = {
    "humaneval": dict(
        hf_id=("openai/openai_humaneval", "openai_humaneval"),
        split="test",
        intent_col="prompt",
        program=humaneval_program,
    ),
    "mbpp": dict(
        hf_id=("google-research-datasets/mbpp", "mbpp"),
        split="test",
        intent_col="text",
        program=mbpp_program,
    ),
}
@dataclass
class EvalResult:
    """Aggregate scores produced by one `evaluate` run."""

    benchmark: str  # benchmark key the run was evaluated on
    n_problems: int  # number of problems that were scored
    n_samples: int  # generations requested per problem
    pass_at_1: float  # mean per-problem pass@1 (evaluate rounds to 4 places)
    pass_at_k: float  # mean per-problem pass@k (evaluate rounds to 4 places)
    k: int  # the k used for pass_at_k
def _load_benchmark_dataset(benchmark: str, spec: dict):
    """Load the benchmark split, trying each candidate hub id in order."""
    # Function-scope import: `datasets` is only needed once an eval runs.
    from datasets import load_dataset

    hf_ids = spec["hf_id"]
    last_error: Exception | None = None
    for hf_id in hf_ids:
        try:
            # NOTE(review): trust_remote_code=True allows the dataset repo to
            # execute its loading script on this machine - confirm it is still
            # needed for these ids with the installed `datasets` version.
            return load_dataset(hf_id, split=spec["split"], trust_remote_code=True)
        except Exception as exc:  # noqa: BLE001 - any failure means "try the next id"
            last_error = exc
    # Chain the last failure so the real cause (auth, network, bad id) is visible.
    raise RuntimeError(
        f"Could not load benchmark '{benchmark}' from any of {hf_ids}. "
        "Check your HuggingFace token and dataset availability."
    ) from last_error


def evaluate(
    generate_fn: Callable[[str], str],
    benchmark: str = "humaneval",
    limit: int | None = 20,
    n_samples: int = 1,
    k: int = 1,
    timeout: float = 8.0,
) -> EvalResult:
    """Run pass@k. Use n_samples>1 + a sampling generate_fn for k>1.

    Args:
        generate_fn: Maps a problem's intent text (the benchmark's intent
            column) to candidate code. Called ``n_samples`` times per problem.
        benchmark: Key into the benchmark registry (``"humaneval"`` or ``"mbpp"``).
        limit: Evaluate only the first ``limit`` problems; ``None`` or ``0``
            evaluates the whole split.
        n_samples: Number of candidates generated per problem.
        k: The k of pass@k; must satisfy ``1 <= k <= n_samples``.
        timeout: Timeout forwarded to ``run_code`` for each candidate program.

    Returns:
        An ``EvalResult`` with mean pass@1 and pass@k rounded to 4 decimals.

    Raises:
        ValueError: If ``n_samples < 1`` or ``k`` is outside ``[1, n_samples]``.
        KeyError: If ``benchmark`` is not a registered benchmark.
        RuntimeError: If the dataset cannot be loaded from any candidate id.
    """
    # Validate before any loading or generation: with k > n_samples the
    # estimator would otherwise report pass@k = 1.0 for every problem.
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must satisfy 1 <= k <= n_samples, got k={k}, n_samples={n_samples}"
        )

    spec = _BENCH[benchmark]
    ds = _load_benchmark_dataset(benchmark, spec)
    if limit:
        ds = ds.select(range(min(limit, len(ds))))

    intent_col = spec["intent_col"]
    build_program = spec["program"]
    p1_scores, pk_scores = [], []
    for problem in ds:
        intent = problem[intent_col]
        correct = 0
        for _ in range(n_samples):
            code = generate_fn(intent)
            program = build_program(problem, code)
            # A sample counts as correct when the sandbox run reports ok.
            if run_code(program, timeout=timeout).ok:
                correct += 1
        p1_scores.append(pass_at_k(n_samples, correct, 1))
        pk_scores.append(pass_at_k(n_samples, correct, k))

    return EvalResult(
        benchmark=benchmark,
        n_problems=len(ds),
        n_samples=n_samples,
        pass_at_1=round(float(np.mean(p1_scores)), 4),
        pass_at_k=round(float(np.mean(pk_scores)), 4),
        k=k,
    )