Spaces:

dmaheshwar22
/

verifiable-rl-coder

Running

File size: 3,475 Bytes

0dd7c80

"""Shared types and helpers for benchmark loaders.

Every benchmark (HumanEval+, MBPP+, SWE-bench-Lite) exposes its problems as a
list of `Task` so downstream code (proposer, sandbox, eval) treats them
uniformly. Test files are reconstructed per benchmark by `to_sandbox_inputs`.
"""

from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class Task:
    """A single coding problem, in the shape shared by every benchmark.

    Instances are immutable (`frozen=True`): assigning to a field after
    construction raises `dataclasses.FrozenInstanceError`.

    Fields:
        task_id: Stable identifier (e.g. "HumanEval/0", "Mbpp/2").
        prompt: The user-facing description. For HumanEval this is a function
            signature + docstring; for MBPP it's a natural-language description.
        canonical_solution: The reference implementation (used for sanity
            checks and SFT rejection sampling, not for scoring model output).
        test: Test source code. Format is benchmark-specific:
            - HumanEval+: a full `def check(candidate)` function.
            - MBPP+: a block of `assert entry_point(...) == ...` lines.
            The sandbox executor wraps these into a runnable test file.
        entry_point: Name of the function the model must implement.
        helpers: Optional support code (imports, classes, helper functions)
            extracted from `canonical_solution`. Defaults to the empty
            string, meaning "no helpers".
    """

    # Required fields. Their order is part of the positional constructor
    # signature, so it must not be changed.
    task_id: str
    prompt: str
    canonical_solution: str
    test: str
    entry_point: str
    # Optional helpers (imports, classes, helper functions) extracted from
    # canonical_solution. For MBPP tasks where tests reference custom types
    # like `Pair`, these definitions are prepended to the model's solution
    # at sandbox-execution time so tests can run regardless of whether the
    # model copied them into its output.
    helpers: str = ""


def to_sandbox_inputs(
    task: Task,
    completion: str,
    benchmark: str,
) -> tuple[str, str]:
    """Build `(solution.py, test_solution.py)` source for the sandbox runner.

    HumanEval+ stores tests as a full `def check(candidate)` function; MBPP+
    stores raw `assert entry_point(...)` lines. This helper produces a uniform
    `test_main()` test for the sandbox regardless of benchmark shape.

    For HumanEval+, the completion is expected to include the full function
    definition. If it doesn't (model returned only the body), we prepend the
    task prompt to reconstruct a valid module.
    """
    if benchmark == "humaneval_plus":
        solution = (
            completion
            if f"def {task.entry_point}(" in completion
            else task.prompt + completion
        )
        tests = (
            f"from solution import {task.entry_point}\n"
            f"{task.test}\n"
            "def test_main() -> None:\n"
            f"    check({task.entry_point})\n"
        )
        return solution, tests

    if benchmark == "mbpp_plus":
        # Prepend helpers (e.g. `class Pair`) so test assertions can reference
        # custom types/imports without depending on whether the model copied
        # them into its output.
        solution = f"{task.helpers}\n\n{completion}" if task.helpers else completion
        # `from solution import *` so the helpers (which are at solution.py's
        # module level) are visible to the test assertions.
        lines = [line for line in task.test.splitlines() if line.strip()]
        indented = "\n    ".join(lines)
        tests = (
            "from solution import *  # noqa: F401, F403\n"
            "def test_main() -> None:\n"
            f"    {indented}\n"
        )
        return solution, tests

    raise ValueError(f"unknown benchmark: {benchmark!r}")