"""pass@k metric — the standard code-generation evaluation statistic.

We use the unbiased estimator from the Codex paper (Chen et al. 2021):

    pass@k = E_task[ 1 - C(n - c, k) / C(n, k) ]

where `n` is the number of samples drawn per task and `c` is how many of
them passed. The naive estimator `c/n > 0` is biased; the formula above
gives the exact probability that at least one of `k` randomly chosen
samples (without replacement) from `n` passes.

Every code-LLM paper uses this form — reporting a different number breaks
comparability.
"""

from __future__ import annotations

from collections.abc import Iterable, Sequence

import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k for a single task.

    Args:
        n: Total samples drawn for this task.
        c: Number of samples that passed all tests.
        k: The k in pass@k.

    Returns:
        Probability (in [0, 1]) that at least one of k randomly drawn
        samples from the n drawn passes. Returns 0.0 when any of `n`, `c`
        or `k` is non-positive, and 1.0 when fewer than `k` samples failed
        (which includes `c > n` and, for `c > 0`, `k > n`).
    """
    # Degenerate inputs: nothing drawn, nothing requested, or nothing passed.
    if k <= 0 or n <= 0 or c <= 0:
        return 0.0
    # Fewer than k failures exist, so any k-subset must contain a pass.
    if n - c < k:
        return 1.0
    # Stable form of C(n - c, k) / C(n, k): prod_{i=n-c+1..n} (1 - k/i).
    # Avoids the huge intermediate binomial coefficients.
    denominators = np.arange(n - c + 1, n + 1)
    return float(1.0 - np.prod(1.0 - k / denominators))


def compute_pass_at_k(
    per_task_results: Iterable[Sequence[bool]],
    ks: Sequence[int] = (1, 5, 10),
) -> dict[int, float]:
    """Average pass@k across tasks.

    Args:
        per_task_results: For each task, a list of per-sample booleans
            (True = passed all tests, False otherwise). Consumed exactly
            once, so a one-shot iterable (e.g. a generator) is accepted.
        ks: Which pass@k values to compute. `k` larger than n for a task
            is skipped for that task (we don't silently coerce).

    Returns:
        Mapping from k to mean pass@k across tasks that had >= k samples.
        If no task had at least k samples, the value for that k is 0.0.
    """
    # (n, c) per task does not depend on k: tally once up front instead of
    # re-counting (and re-iterating the input) for every requested k.
    tallies = [
        (len(samples), sum(1 for passed in samples if passed))
        for samples in per_task_results
    ]

    out: dict[int, float] = {}
    for k in ks:
        # Tasks with fewer than k samples cannot define pass@k; skip them.
        scores = [pass_at_k(n=n, c=c, k=k) for n, c in tallies if n >= k]
        out[k] = float(np.mean(scores)) if scores else 0.0
    return out