"""Shared types and helpers for benchmark loaders.
Every benchmark (HumanEval+, MBPP+, SWE-bench-Lite) exposes its problems as a
list of `Task` so downstream code (proposer, sandbox, eval) treats them
uniformly. Test files are reconstructed per benchmark by `to_sandbox_inputs`.
"""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class Task:
"""A single coding problem.
Fields:
task_id: Stable identifier (e.g. "HumanEval/0", "Mbpp/2").
prompt: The user-facing description. For HumanEval this is a function
signature + docstring; for MBPP it's a natural-language description.
canonical_solution: The reference implementation (used for sanity
checks and SFT rejection sampling, not for scoring model output).
test: Test source code. Format is benchmark-specific:
- HumanEval+: a full `def check(candidate)` function.
- MBPP+: a block of `assert entry_point(...) == ...` lines.
The sandbox executor wraps these into a runnable test file.
entry_point: Name of the function the model must implement.
"""
task_id: str
prompt: str
canonical_solution: str
test: str
entry_point: str
# Optional helpers (imports, classes, helper functions) extracted from
# canonical_solution. For MBPP tasks where tests reference custom types
# like `Pair`, these definitions are prepended to the model's solution
# at sandbox-execution time so tests can run regardless of whether the
# model copied them into its output.
helpers: str = ""
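

# Illustrative `Task` with a non-empty `helpers` field (all values below are
# invented for demonstration, not taken from the real MBPP+ data): the test
# references a custom `Pair` class, so its definition is carried in `helpers`
# and prepended to the model's solution at sandbox time.
#
#   Task(
#       task_id="Mbpp/999",
#       prompt="Write a function first(p) returning the first element of a Pair.",
#       canonical_solution="def first(p):\n    return p.a\n",
#       test="assert first(Pair(1, 2)) == 1\n",
#       entry_point="first",
#       helpers="class Pair:\n    def __init__(self, a, b):\n        self.a, self.b = a, b\n",
#   )
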

def to_sandbox_inputs(
    task: Task,
    completion: str,
    benchmark: str,
) -> tuple[str, str]:
    """Build the `(solution.py, test_solution.py)` sources for the sandbox runner.

    HumanEval+ stores tests as a full `def check(candidate)` function; MBPP+
    stores raw `assert entry_point(...)` lines. This helper produces a uniform
    `test_main()` test for the sandbox regardless of benchmark shape.

    For HumanEval+, the completion is expected to include the full function
    definition. If it doesn't (the model returned only the body), we prepend
    the task prompt to reconstruct a valid module.
    """
    if benchmark == "humaneval_plus":
        solution = (
            completion
            if f"def {task.entry_point}(" in completion
            else task.prompt + completion
        )
        tests = (
            f"from solution import {task.entry_point}\n"
            f"{task.test}\n"
            "def test_main() -> None:\n"
            f"    check({task.entry_point})\n"
        )
        return solution, tests

    if benchmark == "mbpp_plus":
        # Prepend helpers (e.g. `class Pair`) so test assertions can reference
        # custom types/imports without depending on whether the model copied
        # them into its output.
        solution = f"{task.helpers}\n\n{completion}" if task.helpers else completion
        # `from solution import *` so the helpers (which are at solution.py's
        # module level) are visible to the test assertions.
        lines = [line for line in task.test.splitlines() if line.strip()]
        indented = "\n    ".join(lines)
        tests = (
            "from solution import *  # noqa: F401, F403\n"
            "def test_main() -> None:\n"
            f"    {indented}\n"
        )
        return solution, tests

    raise ValueError(f"unknown benchmark: {benchmark!r}")
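

if __name__ == "__main__":
    # Quick manual smoke test (not part of the library API). Both tasks below
    # are invented for illustration; real tasks come from the benchmark
    # loaders, not from this file.
    _he_task = Task(
        task_id="Demo/he-0",
        prompt='def add(a: int, b: int) -> int:\n    """Return a + b."""\n',
        canonical_solution="    return a + b\n",
        test="def check(candidate):\n    assert candidate(1, 2) == 3\n",
        entry_point="add",
    )
    # The completion holds only the function body, so the prompt is prepended.
    _sol, _tests = to_sandbox_inputs(_he_task, "    return a + b\n", "humaneval_plus")
    print(_sol)
    print(_tests)

    _mbpp_task = Task(
        task_id="Demo/mbpp-0",
        prompt="Write a function add(a, b) that returns the sum of a and b.",
        canonical_solution="def add(a, b):\n    return a + b\n",
        test="assert add(1, 2) == 3\nassert add(0, 0) == 0\n",
        entry_point="add",
    )
    _sol, _tests = to_sandbox_inputs(
        _mbpp_task, "def add(a, b):\n    return a + b\n", "mbpp_plus"
    )
    print(_sol)
    print(_tests)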