Spaces:

dmaheshwar22
/

verifiable-rl-coder

Running

File size: 1,261 Bytes

0dd7c80

"""HumanEval+ loader (EvalPlus-augmented HumanEval, 164 tasks).

HumanEval+ uses the same prompts as the original HumanEval benchmark but with
~80x more tests per problem, catching solutions that pass the thin original
suite by luck. This is the honest way to report HumanEval numbers in 2025-26.

The first call downloads the dataset to the evalplus cache (a few MB).
Subsequent calls are instant.
"""

from __future__ import annotations

from typing import Any, cast

from evalplus.data import get_human_eval_plus  # type: ignore[import-untyped]

from .base import Task


def load_humaneval_plus() -> list[Task]:
    """Load every HumanEval+ problem as a ``Task``, ordered by task number.

    The problems come from ``evalplus.data.get_human_eval_plus()``, which is
    treated here as a mapping of task id to a record containing the
    ``prompt``, ``canonical_solution``, ``test`` and ``entry_point`` fields.

    Returns:
        All 164 HumanEval+ tasks, sorted ascending by the integer that
        follows the last ``/`` in each task id.

    Raises:
        KeyError: If a record is missing one of the four fields read here.
        ValueError: If a task id does not end in an integer.
    """
    problems = cast("dict[str, dict[str, Any]]", get_human_eval_plus())

    def _task_number(task: Task) -> int:
        # Compare ids numerically: a plain string sort would put
        # "HumanEval/10" ahead of "HumanEval/9".
        return int(task.task_id.rpartition("/")[2])

    unordered = [
        Task(
            task_id=name,
            prompt=record["prompt"],
            canonical_solution=record["canonical_solution"],
            test=record["test"],
            entry_point=record["entry_point"],
        )
        for name, record in problems.items()
    ]
    # The source is a dict, so impose the deterministic numeric order here.
    return sorted(unordered, key=_task_number)