"""Shared types and helpers for benchmark loaders.
Every benchmark (HumanEval+, MBPP+, SWE-bench-Lite) exposes its problems as a
list of `Task` so downstream code (proposer, sandbox, eval) treats them
uniformly. Test files are reconstructed per benchmark by `to_sandbox_inputs`.
"""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class Task:
"""A single coding problem.
Fields:
task_id: Stable identifier (e.g. "HumanEval/0", "Mbpp/2").
prompt: The user-facing description. For HumanEval this is a function
signature + docstring; for MBPP it's a natural-language description.
canonical_solution: The reference implementation (used for sanity
checks and SFT rejection sampling, not for scoring model output).
test: Test source code. Format is benchmark-specific:
- HumanEval+: a full `def check(candidate)` function.
- MBPP+: a block of `assert entry_point(...) == ...` lines.
The sandbox executor wraps these into a runnable test file.
entry_point: Name of the function the model must implement.
"""
task_id: str
prompt: str
canonical_solution: str
test: str
entry_point: str
# Optional helpers (imports, classes, helper functions) extracted from
# canonical_solution. For MBPP tasks where tests reference custom types
# like `Pair`, these definitions are prepended to the model's solution
# at sandbox-execution time so tests can run regardless of whether the
# model copied them into its output.
helpers: str = ""
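

# Illustrative `Task` with a non-empty `helpers` field (all values below are
# invented for demonstration, not taken from the real MBPP+ data): the test
# references a custom `Pair` class, so its definition is carried in `helpers`
# and prepended to the model's solution at sandbox time.
#
#   Task(
#       task_id="Mbpp/999",
#       prompt="Write a function first(p) returning the first element of a Pair.",
#       canonical_solution="def first(p):\n    return p.a\n",
#       test="assert first(Pair(1, 2)) == 1\n",
#       entry_point="first",
#       helpers="class Pair:\n    def __init__(self, a, b):\n        self.a, self.b = a, b\n",
#   )
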

def to_sandbox_inputs(
    task: Task,
    completion: str,
    benchmark: str,
) -> tuple[str, str]:
    """Build the `(solution.py, test_solution.py)` sources for the sandbox runner.

    HumanEval+ stores tests as a full `def check(candidate)` function; MBPP+
    stores raw `assert entry_point(...)` lines. This helper produces a uniform
    `test_main()` test for the sandbox regardless of benchmark shape.

    For HumanEval+, the completion is expected to include the full function
    definition. If it doesn't (the model returned only the body), we prepend
    the task prompt to reconstruct a valid module.
    """
    if benchmark == "humaneval_plus":
        solution = (
            completion
            if f"def {task.entry_point}(" in completion
            else task.prompt + completion
        )
        tests = (
            f"from solution import {task.entry_point}\n"
            f"{task.test}\n"
            "def test_main() -> None:\n"
            f"    check({task.entry_point})\n"
        )
        return solution, tests

    if benchmark == "mbpp_plus":
        # Prepend helpers (e.g. `class Pair`) so test assertions can reference
        # custom types/imports without depending on whether the model copied
        # them into its output.
        solution = f"{task.helpers}\n\n{completion}" if task.helpers else completion
        # `from solution import *` so the helpers (which are at solution.py's
        # module level) are visible to the test assertions.
        lines = [line for line in task.test.splitlines() if line.strip()]
        indented = "\n    ".join(lines)
        tests = (
            "from solution import *  # noqa: F401, F403\n"
            "def test_main() -> None:\n"
            f"    {indented}\n"
        )
        return solution, tests

    raise ValueError(f"unknown benchmark: {benchmark!r}")
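

if __name__ == "__main__":
    # Quick manual smoke test (not part of the library API). Both tasks below
    # are invented for illustration; real tasks come from the benchmark
    # loaders, not from this file.
    _he_task = Task(
        task_id="Demo/he-0",
        prompt='def add(a: int, b: int) -> int:\n    """Return a + b."""\n',
        canonical_solution="    return a + b\n",
        test="def check(candidate):\n    assert candidate(1, 2) == 3\n",
        entry_point="add",
    )
    # The completion holds only the function body, so the prompt is prepended.
    _sol, _tests = to_sandbox_inputs(_he_task, "    return a + b\n", "humaneval_plus")
    print(_sol)
    print(_tests)

    _mbpp_task = Task(
        task_id="Demo/mbpp-0",
        prompt="Write a function add(a, b) that returns the sum of a and b.",
        canonical_solution="def add(a, b):\n    return a + b\n",
        test="assert add(1, 2) == 3\nassert add(0, 0) == 0\n",
        entry_point="add",
    )
    _sol, _tests = to_sandbox_inputs(
        _mbpp_task, "def add(a, b):\n    return a + b\n", "mbpp_plus"
    )
    print(_sol)
    print(_tests)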