| """HumanEval+ loader (EvalPlus-augmented HumanEval, 164 tasks). | |
| HumanEval+ uses the same prompts as the original HumanEval benchmark but with | |
| ~80x more tests per problem, catching solutions that pass the thin original | |
| suite by luck. This is the honest way to report HumanEval numbers in 2025-26. | |
| The first call downloads the dataset to the evalplus cache (a few MB). Subsequent | |
| calls are instant. | |
| """ | |
| from __future__ import annotations | |
| from typing import Any, cast | |
| from evalplus.data import get_human_eval_plus # type: ignore[import-untyped] | |
| from .base import Task | |
def load_humaneval_plus() -> list[Task]:
    """Load the HumanEval+ benchmark as a list of ``Task`` objects.

    Each entry of the mapping returned by ``get_human_eval_plus()`` becomes
    one ``Task`` built from its ``prompt``, ``canonical_solution``, ``test``
    and ``entry_point`` fields, with the mapping key used as ``task_id``.

    Returns:
        The tasks in ascending order of the integer that follows the last
        ``/`` in each task id. The benchmark is expected to contain 164
        tasks; that count is not verified here.

    Raises:
        KeyError: If an entry lacks one of the four fields read above.
        ValueError: If a task id does not end in an integer.
    """

    def numeric_suffix(task: Task) -> int:
        # Compare ids as numbers so "HumanEval/10" lands after "HumanEval/9";
        # plain string comparison would put it right after "HumanEval/1".
        return int(task.task_id.rsplit("/", 1)[-1])

    problems = cast("dict[str, dict[str, Any]]", get_human_eval_plus())
    unordered = (
        Task(
            task_id=name,
            prompt=fields["prompt"],
            canonical_solution=fields["canonical_solution"],
            test=fields["test"],
            entry_point=fields["entry_point"],
        )
        for name, fields in problems.items()
    )
    # The source is a dict, so its iteration order is not the numeric order;
    # sorted() is stable and hands back a fresh list.
    return sorted(unordered, key=numeric_suffix)