"""Phase 2: functional correctness eval (pass@k) on HumanEval / MBPP. Unlike CodeBLEU (similarity), this measures whether generated code actually RUNS and PASSES unit tests - the claim that carries a capstone defense. generate_fn(intent) -> code is injected, so this is decoupled from the model and unit-testable with a mock. """ from __future__ import annotations import sys from dataclasses import dataclass from pathlib import Path from typing import Callable import numpy as np sys.path.append(str(Path(__file__).resolve().parents[2])) from src.eval.sandbox import run_code # noqa: E402 def pass_at_k(n: int, c: int, k: int) -> float: """Unbiased pass@k estimator (Codex paper). n samples, c correct.""" if n - c < k: return 1.0 return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1))) # ---- per-benchmark program builders ------------------------------------ def humaneval_program(problem, candidate_code: str) -> str: # candidate_code already defines the full function (entry_point). return f"{candidate_code}\n\n{problem['test']}\n\ncheck({problem['entry_point']})\n" def mbpp_program(problem, candidate_code: str) -> str: setup = problem.get("test_setup_code", "") or "" tests = "\n".join(problem.get("test_list", [])) return f"{candidate_code}\n\n{setup}\n{tests}\n" # hf_id is a tuple of candidate IDs tried in order; newer datasets versions # require the namespaced form, older ones or mirrors may still use the bare id. 
_BENCH = {
    "humaneval": {
        "hf_id": ("openai/openai_humaneval", "openai_humaneval"),
        "split": "test",
        "intent_col": "prompt",
        "program": humaneval_program,
    },
    "mbpp": {
        "hf_id": ("google-research-datasets/mbpp", "mbpp"),
        "split": "test",
        "intent_col": "text",
        "program": mbpp_program,
    },
}


@dataclass
class EvalResult:
    """Aggregate scores of one ``evaluate`` run.

    Attributes:
        benchmark: Key of the benchmark that was evaluated.
        n_problems: Number of problems scored.
        n_samples: Generations requested per problem.
        pass_at_1: Mean pass@1 over the problems, rounded to 4 decimals.
        pass_at_k: Mean pass@k over the problems, rounded to 4 decimals.
        k: The k used for ``pass_at_k``.
    """

    benchmark: str
    n_problems: int
    n_samples: int
    pass_at_1: float
    pass_at_k: float
    k: int


def _load_benchmark_split(benchmark: str, spec: dict):
    """Load the benchmark split, trying each candidate dataset id in order."""
    # Imported lazily so the module can be imported without `datasets` installed.
    from datasets import load_dataset

    hf_ids = spec["hf_id"]
    last_error = None
    for hf_id in hf_ids:
        try:
            # NOTE(review): trust_remote_code=True allows a dataset repo's
            # loading script to run locally - confirm it is still needed and
            # accepted by the installed `datasets` version.
            return load_dataset(hf_id, split=spec["split"], trust_remote_code=True)
        except Exception as err:  # noqa: BLE001
            # Best effort: remember why this id failed and try the next one.
            last_error = err
    raise RuntimeError(
        f"Could not load benchmark '{benchmark}' from any of {hf_ids}. "
        "Check your HuggingFace token and dataset availability."
    ) from last_error


def evaluate(
    generate_fn: Callable[[str], str],
    benchmark: str = "humaneval",
    limit: int | None = 20,
    n_samples: int = 1,
    k: int = 1,
    timeout: float = 8.0,
) -> EvalResult:
    """Run pass@k. Use n_samples>1 + a sampling generate_fn for k>1.

    Args:
        generate_fn: Maps a problem intent (the benchmark's prompt/text
            column) to candidate source code.
        benchmark: Key into ``_BENCH`` ("humaneval" or "mbpp").
        limit: Evaluate only the first ``limit`` problems; ``None`` or 0
            evaluates the whole split.
        n_samples: Number of generations per problem; must be >= 1.
        k: The k for pass@k; must satisfy ``1 <= k <= n_samples``.
        timeout: Timeout forwarded to ``run_code`` for each program.

    Returns:
        An ``EvalResult`` with mean pass@1 and pass@k over the problems.

    Raises:
        ValueError: If ``n_samples`` or ``k`` is out of range.
        KeyError: If ``benchmark`` is not a known benchmark key.
        RuntimeError: If the dataset cannot be loaded from any candidate id;
            the last loading error is attached as ``__cause__``.
    """
    # pass@k is undefined outside these ranges; without the checks the
    # estimator would quietly report 1.0 for every problem.
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must satisfy 1 <= k <= n_samples, got k={k}, n_samples={n_samples}"
        )

    spec = _BENCH[benchmark]
    ds = _load_benchmark_split(benchmark, spec)
    if limit:
        ds = ds.select(range(min(limit, len(ds))))

    intent_col = spec["intent_col"]
    build_program = spec["program"]
    p1_scores, pk_scores = [], []
    for problem in ds:
        intent = problem[intent_col]
        correct = 0
        for _ in range(n_samples):
            program = build_program(problem, generate_fn(intent))
            if run_code(program, timeout=timeout).ok:
                correct += 1
        p1_scores.append(pass_at_k(n_samples, correct, 1))
        pk_scores.append(pass_at_k(n_samples, correct, k))

    return EvalResult(
        benchmark=benchmark,
        n_problems=len(ds),
        n_samples=n_samples,
        pass_at_1=round(float(np.mean(p1_scores)), 4),
        pass_at_k=round(float(np.mean(pk_scores)), 4),
        k=k,
    )