dmaheshwar22's picture
deploy: replace template with real demo
0dd7c80 verified
"""pass@k metric — the standard code-generation evaluation statistic.
We use the unbiased estimator from the Codex paper (Chen et al. 2021):
pass@k = E_task[ 1 - C(n - c, k) / C(n, k) ]
where `n` is the number of samples drawn per task and `c` is how many of them
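passed. Naive alternatives are not equivalent: the plug-in estimate
`1 - (1 - c/n)^k` is biased, and merely checking whether any of the `n`
samples passed (`c > 0`) measures pass@n rather than pass@k. The formula above
gives the exact probability that at least one of `k` randomly chosen samples
(without replacement) from `n` passes. Every code-LLM paper uses this form —
reporting a different number breaks comparability.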
"""
from __future__ import annotations
from collections.abc import Sequence
import numpy as np
def pass_at_k(n: int, c: int, k: int) -> float:
    """Return the unbiased pass@k estimate for one task.

    Computes ``1 - C(n - c, k) / C(n, k)`` as a running product rather than
    via binomial coefficients, which avoids huge intermediate integers.

    Args:
        n: Total samples drawn for this task.
        c: Number of samples that passed all tests.
        k: The k in pass@k.

    Returns:
        Probability (in [0, 1]) that at least one of k samples drawn without
        replacement from the n available passes. Returns 0.0 when ``n``,
        ``k`` or ``c`` is not positive, and 1.0 when fewer than ``k`` samples
        failed (any draw of k must then include a passing sample).
    """
    # Nothing drawn, nothing requested, or nothing passed: no chance of a hit.
    if n <= 0 or k <= 0 or c <= 0:
        return 0.0

    failed = n - c
    # Too few failures to fill a k-subset, so every subset contains a pass.
    if failed < k:
        return 1.0

    # C(n - c, k) / C(n, k) == prod_{i = n-c+1 .. n} (1 - k / i)
    denominators = np.arange(failed + 1, n + 1)
    all_fail_probability = np.prod(1.0 - k / denominators)
    return float(1.0 - all_fail_probability)
def compute_pass_at_k(
    per_task_results: Sequence[Sequence[bool]],
    ks: Sequence[int] = (1, 5, 10),
) -> dict[int, float]:
    """Average pass@k across tasks.

    Args:
        per_task_results: For each task, a list of per-sample booleans
            (True = passed all tests, False otherwise).
        ks: Which pass@k values to compute. A task with fewer than `k`
            samples is skipped for that `k` (we don't silently coerce).

    Returns:
        Mapping from k to mean pass@k across tasks that had >= k samples.
        If no task has enough samples for a given k, that k maps to 0.0.
    """
    # The sample count and pass count of a task do not depend on k, so tally
    # them once up front instead of re-counting every task for every k. This
    # also means the task collection is only traversed a single time.
    tallies = [
        (len(samples), sum(1 for passed in samples if passed))
        for samples in per_task_results
    ]

    out: dict[int, float] = {}
    for k in ks:
        # Only tasks with at least k samples contribute to this k's mean.
        scores = [pass_at_k(n=n, c=c, k=k) for n, c in tallies if n >= k]
        out[k] = float(np.mean(scores)) if scores else 0.0
    return out