"""Shared types and helpers for benchmark loaders. Every benchmark (HumanEval+, MBPP+, SWE-bench-Lite) exposes its problems as a list of `Task` so downstream code (proposer, sandbox, eval) treats them uniformly. Test files are reconstructed per benchmark by `to_sandbox_inputs`. """ from __future__ import annotations from dataclasses import dataclass @dataclass(frozen=True) class Task: """A single coding problem. Fields: task_id: Stable identifier (e.g. "HumanEval/0", "Mbpp/2"). prompt: The user-facing description. For HumanEval this is a function signature + docstring; for MBPP it's a natural-language description. canonical_solution: The reference implementation (used for sanity checks and SFT rejection sampling, not for scoring model output). test: Test source code. Format is benchmark-specific: - HumanEval+: a full `def check(candidate)` function. - MBPP+: a block of `assert entry_point(...) == ...` lines. The sandbox executor wraps these into a runnable test file. entry_point: Name of the function the model must implement. """ task_id: str prompt: str canonical_solution: str test: str entry_point: str # Optional helpers (imports, classes, helper functions) extracted from # canonical_solution. For MBPP tasks where tests reference custom types # like `Pair`, these definitions are prepended to the model's solution # at sandbox-execution time so tests can run regardless of whether the # model copied them into its output. helpers: str = "" def to_sandbox_inputs( task: Task, completion: str, benchmark: str, ) -> tuple[str, str]: """Build `(solution.py, test_solution.py)` source for the sandbox runner. HumanEval+ stores tests as a full `def check(candidate)` function; MBPP+ stores raw `assert entry_point(...)` lines. This helper produces a uniform `test_main()` test for the sandbox regardless of benchmark shape. For HumanEval+, the completion is expected to include the full function definition. If it doesn't (model returned only the body), we prepend the task prompt to reconstruct a valid module. """ if benchmark == "humaneval_plus": solution = ( completion if f"def {task.entry_point}(" in completion else task.prompt + completion ) tests = ( f"from solution import {task.entry_point}\n" f"{task.test}\n" "def test_main() -> None:\n" f" check({task.entry_point})\n" ) return solution, tests if benchmark == "mbpp_plus": # Prepend helpers (e.g. `class Pair`) so test assertions can reference # custom types/imports without depending on whether the model copied # them into its output. solution = f"{task.helpers}\n\n{completion}" if task.helpers else completion # `from solution import *` so the helpers (which are at solution.py's # module level) are visible to the test assertions. lines = [line for line in task.test.splitlines() if line.strip()] indented = "\n ".join(lines) tests = ( "from solution import * # noqa: F401, F403\n" "def test_main() -> None:\n" f" {indented}\n" ) return solution, tests raise ValueError(f"unknown benchmark: {benchmark!r}")