File size: 1,261 Bytes
"""HumanEval+ loader (EvalPlus-augmented HumanEval, 164 tasks).
HumanEval+ uses the same prompts as the original HumanEval benchmark but with
~80x more tests per problem, catching solutions that pass the thin original
suite by luck. This is the honest way to report HumanEval numbers in 2025-26.
First call downloads the dataset to the evalplus cache (~a few MB). Subsequent
calls are instant.
"""
from __future__ import annotations
from typing import Any, cast
from evalplus.data import get_human_eval_plus # type: ignore[import-untyped]
from .base import Task
def load_humaneval_plus() -> list[Task]:
    """Load the HumanEval+ problems as ``Task`` objects, ordered by numeric id.

    The raw problems come from ``get_human_eval_plus()``, which is treated
    here as a mapping of task id to a dict of problem fields. Each entry is
    converted to a ``Task`` carrying its ``prompt``, ``canonical_solution``,
    ``test`` and ``entry_point``.

    Returns:
        A list of tasks sorted by the integer that follows the last ``/`` in
        each task id.

    Raises:
        KeyError: If a raw entry lacks one of the fields read above.
        ValueError: If a task id does not end in an integer after the ``/``.
    """
    problems = cast("dict[str, dict[str, Any]]", get_human_eval_plus())

    def numeric_suffix(task: Task) -> int:
        # "HumanEval/10" -> 10. Comparing integers places it after
        # "HumanEval/9"; comparing the raw strings would place it before.
        return int(task.task_id.rsplit("/", 1)[-1])

    unordered: list[Task] = []
    for problem_id, fields in problems.items():
        unordered.append(
            Task(
                task_id=problem_id,
                prompt=fields["prompt"],
                canonical_solution=fields["canonical_solution"],
                test=fields["test"],
                entry_point=fields["entry_point"],
            )
        )

    # The source mapping's iteration order is not relied upon; the result is
    # always ordered by the numeric part of the task id.
    return sorted(unordered, key=numeric_suffix)
|