Spaces:
Sleeping
Sleeping
"""pass@k metric — the standard code-generation evaluation statistic.

We use the unbiased estimator from the Codex paper (Chen et al. 2021):

    pass@k = E_task[ 1 - C(n - c, k) / C(n, k) ]

where `n` is the number of samples drawn per task and `c` is how many of them
passed. Naive estimators — the indicator `c > 0`, or the plug-in
`1 - (1 - c/n)^k` — are biased; the formula above gives the exact probability
that at least one of `k` randomly chosen samples (without replacement) from
the `n` drawn passes. Every code-LLM paper uses this form — reporting a
different number breaks comparability.
"""
| from __future__ import annotations | |
| from collections.abc import Sequence | |
| import numpy as np | |
def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k for a single task.

    Evaluates ``1 - C(n - c, k) / C(n, k)`` in the product form
    ``1 - prod_{i=n-c+1..n} (1 - k / i)``, which never materialises the
    binomial coefficients themselves.

    Args:
        n: Total samples drawn for this task.
        c: Number of samples that passed all tests.
        k: The k in pass@k.

    Returns:
        Probability (in [0, 1]) that at least one of k randomly drawn
        samples from the n drawn passes. Returns 0.0 when ``k <= 0``,
        ``n <= 0`` or ``c <= 0``, and 1.0 when fewer than ``k`` samples
        failed (``n - c < k``).
    """
    # Degenerate inputs: nothing drawn, nothing requested, or nothing passed.
    if k <= 0 or n <= 0 or c <= 0:
        return 0.0
    # Fewer than k failures exist, so no k-subset can consist only of failures.
    if n - c < k:
        return 1.0
    # Stable form: prod_{i=n-c+1..n} (1 - k/i) equals C(n - c, k) / C(n, k).
    all_fail_factors = 1.0 - k / np.arange(n - c + 1, n + 1)
    return float(1.0 - np.prod(all_fail_factors))
def compute_pass_at_k(
    per_task_results: Sequence[Sequence[bool]],
    ks: Sequence[int] = (1, 5, 10),
) -> dict[int, float]:
    """Average pass@k across tasks.

    Args:
        per_task_results: For each task, a list of per-sample booleans
            (True = passed all tests, False otherwise).
        ks: Which pass@k values to compute. `k` larger than n for a task is
            skipped for that task (we don't silently coerce).

    Returns:
        Mapping from k to mean pass@k across tasks that had >= k samples.
        A k for which no task had enough samples maps to 0.0.
    """
    # Sample count and pass count per task do not depend on k, so compute
    # them once here rather than once per requested k.
    task_counts = [
        (len(samples), sum(1 for passed in samples if passed))
        for samples in per_task_results
    ]

    out: dict[int, float] = {}
    for k in ks:
        # Tasks with fewer than k samples are excluded from this k's mean.
        scores = [pass_at_k(n=n, c=c, k=k) for n, c in task_counts if n >= k]
        out[k] = float(np.mean(scores)) if scores else 0.0
    return out