File size: 2,159 Bytes
0dd7c80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""pass@k metric — the standard code-generation evaluation statistic.

We use the unbiased estimator from the Codex paper (Chen et al. 2021):

    pass@k = E_task[ 1 - C(n - c, k) / C(n, k) ]

where `n` is the number of samples drawn per task and `c` is how many of them
passed. The naive plug-in estimator `1 - (1 - c/n)^k` is biased; the formula
above gives the exact probability that at least one of `k` samples chosen at
random (without replacement) from the `n` drawn passes. Every code-LLM paper
uses this form — reporting a different number breaks comparability.
"""

from __future__ import annotations

from collections.abc import Sequence

import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """Estimate pass@k for one task with the unbiased Codex estimator.

    Computes ``1 - C(n - c, k) / C(n, k)`` without forming the binomial
    coefficients explicitly.

    Args:
        n: How many samples were generated for the task.
        c: How many of those samples passed every test.
        k: Size of the hypothetical draw (the k in pass@k).

    Returns:
        A float in [0, 1]: the chance that a draw of k samples (without
        replacement) out of the n generated contains at least one pass.
        Non-positive ``n``, ``k`` or ``c`` yields 0.0; fewer than ``k``
        failing samples yields 1.0.
    """
    # Nothing drawn, nothing requested, or nothing passed: no chance of a hit.
    if n <= 0 or k <= 0 or c <= 0:
        return 0.0

    failures = n - c
    # With fewer than k failures, any k-subset must include a passing sample.
    if failures < k:
        return 1.0

    # C(n - c, k) / C(n, k) == prod_{i = n-c+1 .. n} (1 - k / i); the product
    # form avoids the huge intermediate values of the binomials.
    denominators = np.arange(failures + 1, n + 1)
    all_fail_probability = np.prod(1.0 - k / denominators)
    return float(1.0 - all_fail_probability)


def compute_pass_at_k(
    per_task_results: Sequence[Sequence[bool]],
    ks: Sequence[int] = (1, 5, 10),
) -> dict[int, float]:
    """Average pass@k across tasks.

    Args:
        per_task_results: For each task, a list of per-sample booleans
            (True = passed all tests, False otherwise).
        ks: Which pass@k values to compute. A task with fewer than `k`
            samples is skipped for that `k` (we don't silently coerce).

    Returns:
        Mapping from k to mean pass@k across tasks that had >= k samples.
        A `k` for which no task had enough samples maps to 0.0.
    """
    # Tally (n, c) once per task: the counts do not depend on k, so there is
    # no reason to recount for every k. A single pass over the tasks also
    # keeps a one-shot iterable usable for every requested k.
    tallies = [
        (len(samples), sum(1 for passed in samples if passed))
        for samples in per_task_results
    ]

    out: dict[int, float] = {}
    for k in ks:
        # Only tasks with at least k samples contribute to this k's mean.
        scores = [pass_at_k(n=n, c=c, k=k) for n, c in tallies if n >= k]
        out[k] = float(np.mean(scores)) if scores else 0.0
    return out