Spaces:

dmaheshwar22
/

verifiable-rl-coder

Sleeping

App Files Files Community

verifiable-rl-coder / src /verifiable_rl_coder /evaluation /pass_at_k.py

dmaheshwar22

deploy: replace template with real demo

0dd7c80 verified about 1 month ago

raw

history blame contribute delete

2.16 kB

	"""pass@k metric — the standard code-generation evaluation statistic.

	We use the unbiased estimator from the Codex paper (Chen et al. 2021):

	pass@k = E_task[ 1 - C(n - c, k) / C(n, k) ]

	where `n` is the number of samples drawn per task and `c` is how many of them
	passed. The naive plug-in estimator `1 - (1 - c/n)^k` is biased; the formula
	above gives the exact probability that at least one of `k` randomly chosen
	samples (without replacement) from the `n` drawn passes. Every code-LLM paper
	uses this form — reporting a different number breaks comparability.
	"""

	from __future__ import annotations

	from collections.abc import Sequence

	import numpy as np


	def pass_at_k(n: int, c: int, k: int) -> float:
	    """Compute the unbiased pass@k estimate for one task.

	    Evaluates ``1 - C(n - c, k) / C(n, k)`` without forming the binomial
	    coefficients, by expanding the ratio into a running product.

	    Args:
	        n: Number of samples generated for the task.
	        c: How many of those samples passed every test.
	        k: Size of the hypothetical draw (the k in pass@k).

	    Returns:
	        A float in [0, 1]: the chance that a draw of k samples, taken
	        without replacement from the n generated, contains at least one
	        passing sample. Returns 0.0 when ``k`` or ``n`` is non-positive or
	        when nothing passed, and 1.0 when fewer than ``k`` samples failed.
	    """
	    # Degenerate inputs and "nothing passed" all yield a zero score.
	    if k <= 0 or n <= 0 or c <= 0:
	        return 0.0

	    failures = n - c
	    # With fewer than k failing samples, any draw of k must include a pass.
	    if failures < k:
	        return 1.0

	    # C(n - c, k) / C(n, k) == prod_{i = n-c+1 .. n} (1 - k / i); the product
	    # form avoids huge intermediate binomial coefficients.
	    denominators = np.arange(failures + 1, n + 1)
	    all_fail_probability = np.prod(1.0 - k / denominators)
	    return float(1.0 - all_fail_probability)


	def compute_pass_at_k(
	    per_task_results: Sequence[Sequence[bool]],
	    ks: Sequence[int] = (1, 5, 10),
	) -> dict[int, float]:
	    """Compute the mean pass@k over a collection of tasks.

	    Args:
	        per_task_results: One entry per task; each entry holds one boolean
	            per generated sample (truthy = the sample passed all tests).
	        ks: The k values to report. A task with fewer than ``k`` samples
	            does not contribute to that k's average (it is skipped, not
	            clamped to the available sample count).

	    Returns:
	        A dict keyed by k whose value is the average per-task pass@k over
	        the tasks that had at least k samples. If no task qualifies for a
	        given k, that k maps to 0.0.
	    """
	    averages: dict[int, float] = {}
	    for k in ks:
	        # Score only the tasks that drew enough samples for this k.
	        eligible_scores = [
	            pass_at_k(n=len(task), c=sum(bool(flag) for flag in task), k=k)
	            for task in per_task_results
	            if len(task) >= k
	        ]
	        if eligible_scores:
	            averages[k] = float(np.mean(eligible_scores))
	        else:
	            averages[k] = 0.0
	    return averages