Spaces:

dmaheshwar22
/

verifiable-rl-coder

Running

deploy: replace template with real demo

0dd7c80 verified 23 days ago

1.26 kB

	"""HumanEval+ loader (EvalPlus-augmented HumanEval, 164 tasks).

	HumanEval+ uses the same prompts as the original HumanEval benchmark but with
	~80x more tests per problem, catching solutions that pass the thin original
	suite by luck. This is the honest way to report HumanEval numbers in 2025-26.

	First call downloads the dataset to the evalplus cache (a few MB). Subsequent
	calls are instant.
	"""

	from __future__ import annotations

	from typing import Any, cast

	from evalplus.data import get_human_eval_plus # type: ignore[import-untyped]

	from .base import Task


	def load_humaneval_plus() -> list[Task]:
	    """Load the HumanEval+ problems as ``Task`` objects, ordered by task number.

	    Every entry of the mapping returned by ``get_human_eval_plus()`` becomes
	    one ``Task`` built from its ``prompt``, ``canonical_solution``, ``test``
	    and ``entry_point`` fields, with the mapping key used as ``task_id``.

	    Returns:
	        The tasks sorted ascending by the integer that follows the last
	        ``/`` in each ``task_id``.
	    """

	    def _task_number(task: Task) -> int:
	        # Compare numerically so "HumanEval/10" lands after "HumanEval/9"
	        # rather than where plain string ordering would place it.
	        return int(task.task_id.rpartition("/")[2])

	    problems = cast("dict[str, dict[str, Any]]", get_human_eval_plus())

	    loaded: list[Task] = []
	    for problem_id, fields in problems.items():
	        loaded.append(
	            Task(
	                task_id=problem_id,
	                prompt=fields["prompt"],
	                canonical_solution=fields["canonical_solution"],
	                test=fields["test"],
	                entry_point=fields["entry_point"],
	            )
	        )

	    # The source mapping carries no ordering guarantee we rely on, so the
	    # list is ordered explicitly once every task has been built.
	    loaded.sort(key=_task_number)
	    return loaded