Spaces:
Sleeping
Sleeping
| """Phase 2: functional correctness eval (pass@k) on HumanEval / MBPP. | |
| Unlike CodeBLEU (similarity), this measures whether generated code actually RUNS | |
| and PASSES unit tests - the claim that carries a capstone defense. | |
| generate_fn(intent) -> code is injected, so this is decoupled from the model and | |
| unit-testable with a mock. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Callable | |
| import numpy as np | |
| sys.path.append(str(Path(__file__).resolve().parents[2])) | |
| from src.eval.sandbox import run_code # noqa: E402 | |
def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Codex paper). n samples, c correct.

    Evaluates ``1 - C(n - c, k) / C(n, k)`` as a running product so the
    binomial coefficients are never formed explicitly.

    Args:
        n: Number of samples generated for the problem.
        c: Number of those samples that passed.
        k: The ``k`` in pass@k.

    Returns:
        The estimated probability that at least one of ``k`` samples drawn
        from the ``n`` generated ones passes.

    Raises:
        ValueError: If ``k`` is outside ``1..n`` or ``c`` is outside ``0..n``.
            The estimator is undefined there; without this check any
            ``k > n`` would be reported as 1.0 even with zero passing samples.
    """
    if not 1 <= k <= n:
        raise ValueError(f"pass@k requires 1 <= k <= n, got k={k}, n={n}")
    if not 0 <= c <= n:
        raise ValueError(f"pass@k requires 0 <= c <= n, got c={c}, n={n}")
    # Fewer than k failures exist, so every size-k subset contains a pass.
    if n - c < k:
        return 1.0
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
| # ---- per-benchmark program builders ------------------------------------ | |
def humaneval_program(problem, candidate_code: str) -> str:
    """Assemble the script that checks one HumanEval candidate.

    Layout: the candidate source, then the problem's ``test`` code, then a
    ``check(<entry_point>)`` call. The candidate is expected to already
    define the complete entry-point function.
    """
    layout = "{}\n\n{}\n\ncheck({})\n"
    return layout.format(candidate_code, problem["test"], problem["entry_point"])
def mbpp_program(problem, candidate_code: str) -> str:
    """Assemble the script that checks one MBPP candidate.

    Layout: the candidate source, then the optional ``test_setup_code``,
    then every entry of ``test_list`` on its own line.
    """
    prelude = problem.get("test_setup_code", "")
    if not prelude:
        # A missing, None or empty setup all collapse to an empty string.
        prelude = ""
    assertions = "\n".join(problem.get("test_list", []))
    return "{}\n\n{}\n{}\n".format(candidate_code, prelude, assertions)
# Benchmark registry. Each "hf_id" is a tuple of candidate dataset IDs tried
# in order: newer versions of the datasets library require the namespaced
# form, while older ones or mirrors may still use the bare id.
_BENCH = {
    "humaneval": {
        "hf_id": ("openai/openai_humaneval", "openai_humaneval"),
        "split": "test",
        "intent_col": "prompt",  # column handed to generate_fn
        "program": humaneval_program,
    },
    "mbpp": {
        "hf_id": ("google-research-datasets/mbpp", "mbpp"),
        "split": "test",
        "intent_col": "text",  # column handed to generate_fn
        "program": mbpp_program,
    },
}
@dataclass
class EvalResult:
    """Aggregate pass@k scores for one benchmark run.

    Declared as a dataclass so it can be constructed with keyword arguments;
    without the decorator the annotated fields generate no ``__init__``.
    """

    benchmark: str  # benchmark name the run was scored on, e.g. "humaneval"
    n_problems: int  # number of problems evaluated
    n_samples: int  # candidates generated per problem
    pass_at_1: float  # mean pass@1 over the problems
    pass_at_k: float  # mean pass@k over the problems
    k: int  # the k used for pass_at_k
def evaluate(
    generate_fn: Callable[[str], str],
    benchmark: str = "humaneval",
    limit: int | None = 20,
    n_samples: int = 1,
    k: int = 1,
    timeout: float = 8.0,
) -> EvalResult:
    """Run pass@k. Use n_samples>1 + a sampling generate_fn for k>1.

    For each problem, ``generate_fn`` is called ``n_samples`` times with the
    benchmark's intent column; each candidate is wrapped into a test program
    and executed through ``run_code``. Per-problem pass@1 and pass@k are
    averaged and rounded to 4 decimals.

    Args:
        generate_fn: Maps an intent string to candidate source code.
        benchmark: Key into ``_BENCH`` ("humaneval" or "mbpp").
        limit: Evaluate only the first ``limit`` problems; ``None`` or 0
            evaluates the whole split.
        n_samples: Candidates generated per problem.
        k: The ``k`` in pass@k; must satisfy ``1 <= k <= n_samples``.
        timeout: Forwarded to ``run_code`` for each program (presumably
            seconds - confirm against the sandbox).

    Returns:
        An ``EvalResult`` with the averaged scores.

    Raises:
        ValueError: If ``n_samples < 1`` or ``k`` is outside ``1..n_samples``.
        KeyError: If ``benchmark`` is not a key of ``_BENCH``.
        RuntimeError: If the dataset cannot be loaded under any candidate ID;
            the last load error is attached as the cause.
    """
    # The unbiased estimator is only defined for k <= n_samples. Check before
    # any download or generation so a bad configuration fails fast instead of
    # producing a meaningless score.
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must satisfy 1 <= k <= n_samples, got k={k}, n_samples={n_samples}"
        )

    from datasets import load_dataset

    spec = _BENCH[benchmark]
    hf_ids = spec["hf_id"]
    ds = None
    last_err = None
    for hf_id in hf_ids:
        try:
            # NOTE(review): trust_remote_code=True lets a dataset repo's
            # loading script run locally - confirm it is still required.
            ds = load_dataset(hf_id, split=spec["split"], trust_remote_code=True)
            break
        except Exception as err:  # noqa: BLE001
            # Remember why this ID failed, then fall through to the next one.
            last_err = err
    if ds is None:
        raise RuntimeError(
            f"Could not load benchmark '{benchmark}' from any of {hf_ids}. "
            "Check your HuggingFace token and dataset availability."
        ) from last_err

    if limit:
        ds = ds.select(range(min(limit, len(ds))))

    p1_scores, pk_scores = [], []
    for problem in ds:
        intent = problem[spec["intent_col"]]
        correct = 0
        for _ in range(n_samples):
            code = generate_fn(intent)
            program = spec["program"](problem, code)
            if run_code(program, timeout=timeout).ok:
                correct += 1
        p1_scores.append(pass_at_k(n_samples, correct, 1))
        pk_scores.append(pass_at_k(n_samples, correct, k))

    return EvalResult(
        benchmark=benchmark, n_problems=len(ds), n_samples=n_samples,
        pass_at_1=round(float(np.mean(p1_scores)), 4),
        pass_at_k=round(float(np.mean(pk_scores)), 4), k=k,
    )