| """Shared types and helpers for benchmark loaders. |
| |
| Every benchmark (HumanEval+, MBPP+, SWE-bench-Lite) exposes its problems as a |
| list of `Task` so downstream code (proposer, sandbox, eval) treats them |
| uniformly. Test files are reconstructed per benchmark by `to_sandbox_inputs`. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
|
|
|
|
| @dataclass(frozen=True) |
| class Task: |
| """A single coding problem. |
| |
| Fields: |
| task_id: Stable identifier (e.g. "HumanEval/0", "Mbpp/2"). |
| prompt: The user-facing description. For HumanEval this is a function |
| signature + docstring; for MBPP it's a natural-language description. |
| canonical_solution: The reference implementation (used for sanity |
| checks and SFT rejection sampling, not for scoring model output). |
| test: Test source code. Format is benchmark-specific: |
| - HumanEval+: a full `def check(candidate)` function. |
| - MBPP+: a block of `assert entry_point(...) == ...` lines. |
| The sandbox executor wraps these into a runnable test file. |
| entry_point: Name of the function the model must implement. |
| """ |
|
|
| task_id: str |
| prompt: str |
| canonical_solution: str |
| test: str |
| entry_point: str |
| |
| |
| |
| |
| |
| helpers: str = "" |
|
|
|
|
| def to_sandbox_inputs( |
| task: Task, |
| completion: str, |
| benchmark: str, |
| ) -> tuple[str, str]: |
| """Build `(solution.py, test_solution.py)` source for the sandbox runner. |
| |
| HumanEval+ stores tests as a full `def check(candidate)` function; MBPP+ |
| stores raw `assert entry_point(...)` lines. This helper produces a uniform |
| `test_main()` test for the sandbox regardless of benchmark shape. |
| |
| For HumanEval+, the completion is expected to include the full function |
| definition. If it doesn't (model returned only the body), we prepend the |
| task prompt to reconstruct a valid module. |
| """ |
    if benchmark == "humaneval_plus":
        # A completion without the `def` line is treated as a bare body; the
        # prompt (signature + docstring) is prepended to form a valid module.
        solution = (
            completion
            if f"def {task.entry_point}(" in completion
            else task.prompt + completion
        )
        tests = (
            f"from solution import {task.entry_point}\n"
            f"{task.test}\n"
            "def test_main() -> None:\n"
            f"    check({task.entry_point})\n"
        )
        return solution, tests

    if benchmark == "mbpp_plus":
        solution = f"{task.helpers}\n\n{completion}" if task.helpers else completion
        # Indent the raw assert lines into a single pytest-discoverable test.
        lines = [line for line in task.test.splitlines() if line.strip()]
        indented = "\n    ".join(lines)
        tests = (
            "from solution import *  # noqa: F401, F403\n"
            "def test_main() -> None:\n"
            f"    {indented}\n"
        )
        return solution, tests

    raise ValueError(f"unknown benchmark: {benchmark!r}")
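

# Minimal usage sketch (illustrative only): the toy task below is made up for
# demonstration and is not a real benchmark problem; real callers obtain Task
# objects from the benchmark loaders rather than constructing them by hand.
if __name__ == "__main__":
    demo = Task(
        task_id="Demo/0",
        prompt='def add(a: int, b: int) -> int:\n    """Return a + b."""\n',
        canonical_solution="    return a + b\n",
        test="def check(candidate):\n    assert candidate(1, 2) == 3\n",
        entry_point="add",
    )
    # Body-only completion: to_sandbox_inputs prepends the prompt so that
    # solution.py actually defines `add`.
    solution_src, test_src = to_sandbox_inputs(demo, "    return a + b\n", "humaneval_plus")
    print("# --- solution.py ---\n" + solution_src)
    print("# --- test_solution.py ---\n" + test_src)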