dmaheshwar22's picture
deploy: replace template with real demo
0dd7c80 verified
"""HumanEval+ loader (EvalPlus-augmented HumanEval, 164 tasks).
HumanEval+ uses the same prompts as the original HumanEval benchmark but with
~80x more tests per problem, catching solutions that pass the thin original
suite by luck. This is the honest way to report HumanEval numbers in 2025-26.
The first call downloads the dataset (a few MB) to the evalplus cache.
Subsequent calls are instant.
"""
from __future__ import annotations
from typing import Any, cast
from evalplus.data import get_human_eval_plus # type: ignore[import-untyped]
from .base import Task
def load_humaneval_plus() -> list[Task]:
    """Load the HumanEval+ problems as ``Task`` objects, ordered by numeric id.

    The problems are fetched through ``evalplus.data.get_human_eval_plus``,
    which yields a mapping from task id to a record of problem fields. Each
    record is converted into a ``Task`` carrying its ``prompt``,
    ``canonical_solution``, ``test`` and ``entry_point``.

    Returns:
        A new list of tasks sorted by the integer that follows the last
        ``/`` in each task id.
    """
    problems = cast("dict[str, dict[str, Any]]", get_human_eval_plus())

    def numeric_suffix(task: Task) -> int:
        # Compare ids by their trailing number so that "HumanEval/10" lands
        # after "HumanEval/9" instead of following plain string ordering.
        return int(task.task_id.rsplit("/", 1)[-1])

    loaded: list[Task] = []
    for problem_id, fields in problems.items():
        loaded.append(
            Task(
                task_id=problem_id,
                prompt=fields["prompt"],
                canonical_solution=fields["canonical_solution"],
                test=fields["test"],
                entry_point=fields["entry_point"],
            )
        )

    # The mapping's own iteration order is not relied upon; ordering is
    # imposed explicitly here.
    return sorted(loaded, key=numeric_suffix)