Spaces:

Rushabh147
/

code-gen-assistant

Sleeping

App Files Files Community

code-gen-assistant / src /eval /functional_eval.py

Rushabh147

Initial deploy to HF Spaces (clean history, LFS for all binaries)

b89e6d6 5 days ago

Raw

History Blame Contribute Delete

3.49 kB

	"""Phase 2: functional correctness eval (pass@k) on HumanEval / MBPP.

	Unlike CodeBLEU (similarity), this measures whether generated code actually RUNS
	and PASSES unit tests - the claim that carries a capstone defense.

	generate_fn(intent) -> code is injected, so this is decoupled from the model and
	unit-testable with a mock.
	"""
	from __future__ import annotations

	import sys
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Callable

	import numpy as np

	# Append the directory two levels above this file's folder to sys.path so the
	# absolute `src.` import below can resolve even when that directory is not
	# already importable. The import must follow the path tweak, hence the E402
	# suppression.
	sys.path.append(str(Path(__file__).resolve().parents[2]))
	from src.eval.sandbox import run_code # noqa: E402


	def pass_at_k(n: int, c: int, k: int) -> float:
	    """Unbiased pass@k estimator (Codex paper).

	    Computes ``1 - C(n - c, k) / C(n, k)`` as a numerically stable product.

	    Args:
	        n: Number of samples generated for the problem.
	        c: Number of those samples that passed the tests.
	        k: The ``k`` in pass@k.

	    Returns:
	        Estimated probability that at least one of ``k`` samples passes,
	        in ``[0.0, 1.0]``.
	    """
	    if c <= 0:
	        # With no passing sample the estimate is 0 for every k. Without this
	        # guard the shortcut below would report 1.0 whenever k > n (or n == 0).
	        return 0.0
	    if n - c < k:
	        # Fewer than k failing samples: every size-k subset contains a pass.
	        return 1.0
	    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


	# ---- per-benchmark program builders ------------------------------------
	def humaneval_program(problem, candidate_code: str) -> str:
	    """Build the runnable HumanEval script for one candidate solution.

	    The script is the candidate code, then the problem's ``test`` source, then
	    a call to ``check`` on the problem's ``entry_point``. The candidate is
	    expected to define the whole entry-point function itself.
	    """
	    test_source = f"{problem['test']}"
	    check_call = f"check({problem['entry_point']})"
	    sections = (f"{candidate_code}", test_source, check_call)
	    return "\n\n".join(sections) + "\n"


	def mbpp_program(problem, candidate_code: str) -> str:
	    """Build the runnable MBPP script for one candidate solution.

	    The script is the candidate code, then the problem's optional
	    ``test_setup_code``, then each entry of ``test_list`` on its own line.
	    A missing or ``None`` value for either field is treated as empty.
	    """
	    setup = problem.get("test_setup_code", "") or ""
	    # Guard against an explicit None the same way as the setup code above,
	    # otherwise "\n".join would raise TypeError.
	    tests = "\n".join(problem.get("test_list") or [])
	    return f"{candidate_code}\n\n{setup}\n{tests}\n"


	# Registry of supported benchmarks, keyed by benchmark name.
	#   hf_id      - candidate dataset ids, tried in order. Newer `datasets`
	#                versions require the namespaced form; older ones or mirrors
	#                may still use the bare id.
	#   split      - dataset split to load.
	#   intent_col - column whose value is passed to generate_fn.
	#   program    - builder that turns (problem, candidate_code) into a script.
	_BENCH = {
	    "humaneval": {
	        "hf_id": ("openai/openai_humaneval", "openai_humaneval"),
	        "split": "test",
	        "intent_col": "prompt",
	        "program": humaneval_program,
	    },
	    "mbpp": {
	        "hf_id": ("google-research-datasets/mbpp", "mbpp"),
	        "split": "test",
	        "intent_col": "text",
	        "program": mbpp_program,
	    },
	}


	@dataclass
	class EvalResult:
	    """Summary of one pass@k evaluation run, as returned by ``evaluate``."""

	    # Name of the benchmark that was evaluated.
	    benchmark: str
	    # Number of problems that were scored.
	    n_problems: int
	    # Number of candidates generated per problem.
	    n_samples: int
	    # Mean pass@1 over the scored problems.
	    pass_at_1: float
	    # Mean pass@k over the scored problems.
	    pass_at_k: float
	    # The k used for ``pass_at_k``.
	    k: int


	def evaluate(
	    generate_fn: Callable[[str], str],
	    benchmark: str = "humaneval",
	    limit: int | None = 20,
	    n_samples: int = 1,
	    k: int = 1,
	    timeout: float = 8.0,
	) -> EvalResult:
	    """Run a functional-correctness (pass@k) evaluation on one benchmark.

	    For every problem, ``generate_fn`` is called ``n_samples`` times with the
	    problem's intent text. Each candidate is wrapped into a test program and
	    executed with ``run_code``; a candidate counts as correct when the result's
	    ``ok`` attribute is truthy.

	    Args:
	        generate_fn: Maps an intent string to candidate source code. Use a
	            sampling generator together with ``n_samples > 1`` for ``k > 1``.
	        benchmark: Key into ``_BENCH`` ("humaneval" or "mbpp").
	        limit: Maximum number of problems to evaluate; ``None`` or ``0`` means
	            the whole split.
	        n_samples: Candidates generated per problem (must be >= 1).
	        k: The ``k`` in pass@k (must satisfy ``1 <= k <= n_samples``).
	        timeout: Forwarded to ``run_code`` as its ``timeout`` argument.

	    Returns:
	        An ``EvalResult`` with mean pass@1 and pass@k rounded to 4 decimals
	        (both 0.0 when no problem was evaluated).

	    Raises:
	        ValueError: If ``n_samples`` or ``k`` is out of range.
	        KeyError: If ``benchmark`` is not a key of ``_BENCH``.
	        RuntimeError: If the dataset cannot be loaded from any candidate id.
	    """
	    # The estimator is only defined for 1 <= k <= n_samples. Outside that
	    # range the scores would be meaningless, so fail loudly up front.
	    if n_samples < 1:
	        raise ValueError(f"n_samples must be >= 1, got {n_samples}")
	    if not 1 <= k <= n_samples:
	        raise ValueError(
	            f"k must satisfy 1 <= k <= n_samples ({n_samples}), got {k}"
	        )

	    # Imported lazily so the module can be imported without `datasets`.
	    from datasets import load_dataset

	    spec = _BENCH[benchmark]
	    hf_ids = spec["hf_id"]
	    ds = None
	    last_error = None
	    for hf_id in hf_ids:
	        try:
	            # NOTE(review): trust_remote_code=True permits executing dataset
	            # loading code from the Hub - confirm this is still required.
	            ds = load_dataset(hf_id, split=spec["split"], trust_remote_code=True)
	        except Exception as err:  # noqa: BLE001
	            # Best effort: remember why this id failed and try the next one.
	            last_error = err
	        else:
	            break
	    if ds is None:
	        raise RuntimeError(
	            f"Could not load benchmark '{benchmark}' from any of {hf_ids}. "
	            "Check your HuggingFace token and dataset availability."
	        ) from last_error
	    if limit:
	        ds = ds.select(range(min(limit, len(ds))))

	    intent_col = spec["intent_col"]
	    build_program = spec["program"]

	    p1_scores, pk_scores = [], []
	    for problem in ds:
	        intent = problem[intent_col]
	        correct = 0
	        for _ in range(n_samples):
	            code = generate_fn(intent)
	            program = build_program(problem, code)
	            if run_code(program, timeout=timeout).ok:
	                correct += 1
	        p1_scores.append(pass_at_k(n_samples, correct, 1))
	        pk_scores.append(pass_at_k(n_samples, correct, k))

	    # np.mean([]) yields NaN plus a warning; report 0.0 for an empty run.
	    mean_p1 = float(np.mean(p1_scores)) if p1_scores else 0.0
	    mean_pk = float(np.mean(pk_scores)) if pk_scores else 0.0

	    return EvalResult(
	        benchmark=benchmark,
	        n_problems=len(ds),
	        n_samples=n_samples,
	        pass_at_1=round(mean_p1, 4),
	        pass_at_k=round(mean_pk, 4),
	        k=k,
	    )