"""HumanEval+ loader (EvalPlus-augmented HumanEval, 164 tasks). HumanEval+ uses the same prompts as the original HumanEval benchmark but with ~80x more tests per problem, catching solutions that pass the thin original suite by luck. This is the honest way to report HumanEval numbers in 2025-26. First call downloads the dataset to the evalplus cache (~a few MB). Subsequent calls are instant. """ from __future__ import annotations from typing import Any, cast from evalplus.data import get_human_eval_plus # type: ignore[import-untyped] from .base import Task def load_humaneval_plus() -> list[Task]: """Return all 164 HumanEval+ tasks in deterministic task_id order.""" raw = cast("dict[str, dict[str, Any]]", get_human_eval_plus()) tasks = [ Task( task_id=task_id, prompt=item["prompt"], canonical_solution=item["canonical_solution"], test=item["test"], entry_point=item["entry_point"], ) for task_id, item in raw.items() ] # EvalPlus returns a dict — sort by the numeric suffix so "HumanEval/10" # comes after "HumanEval/9" rather than in lexicographic order. tasks.sort(key=lambda t: int(t.task_id.split("/")[-1])) return tasks