# NOTE(review): the three lines below were Hugging Face web-UI residue
# (uploader avatar caption, commit message, commit hash) pasted into the
# file; as bare text they made the module a SyntaxError. Preserved as
# comments for provenance.
# dmaheshwar22's picture
# deploy: replace template with real demo
# 0dd7c80 verified
"""MBPP+ loader (EvalPlus-augmented MBPP, ~378 sanitized tasks).
MBPP+ applies the same test-augmentation treatment as HumanEval+: the original
MBPP has very thin tests (usually 3 assertions per task); MBPP+ expands each
problem to ~100 tests and filters out problems whose ground truth is
ambiguous or buggy.
Same API as `load_humaneval_plus` — returns a deterministically ordered list
of `Task`.
"""
from __future__ import annotations
from typing import Any, cast
from evalplus.data import get_mbpp_plus # type: ignore[import-untyped]
from .base import Task
def _extract_mbpp_helpers(canonical_solution: str, entry_point: str) -> str:
"""Same helper-extraction logic as mbpp_train (kept independent to
avoid an import cycle between mbpp_plus and mbpp_train)."""
import re
pattern = re.compile(
rf"^def\s+{re.escape(entry_point)}\s*\(",
re.MULTILINE,
)
match = pattern.search(canonical_solution)
if not match:
return ""
raw = canonical_solution[: match.start()].rstrip()
return raw.replace("\r\n", "\n").replace("\r", "\n").expandtabs(4)
def load_mbpp_plus() -> list[Task]:
    """Load every MBPP+ task, sorted deterministically by numeric task id.

    Unlike HumanEval+ (which wraps tests in a `def check(candidate)` under
    the `test` key), MBPP+ stores raw `assert ...` lines under `assertion`,
    referencing the entry point by name; that block is copied verbatim into
    `Task.test`. Any preamble (imports, classes) ahead of the entry-point
    definition in `canonical_solution` is pulled into `Task.helpers` so the
    sandbox executor can prepend it at run time.
    """
    dataset = cast("dict[str, dict[str, Any]]", get_mbpp_plus())
    tasks = [
        Task(
            task_id=tid,
            prompt=record["prompt"],
            canonical_solution=record["canonical_solution"],
            test=record["assertion"],
            entry_point=record["entry_point"],
            helpers=_extract_mbpp_helpers(
                record["canonical_solution"], record["entry_point"]
            ),
        )
        for tid, record in dataset.items()
    ]
    # Task ids look like "Mbpp/<n>"; sort on the numeric suffix so ordering
    # is stable regardless of dict iteration order.
    tasks.sort(key=lambda task: int(task.task_id.rsplit("/", 1)[-1]))
    return tasks