# File size: 4,505 Bytes (revision acf77ab)
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(frozen=True)
class Task:
    """Immutable description of a single task offered to the agent.

    Instances are registered in the module-level ``TASKS`` tuple and looked
    up by ``task_level`` through ``get_task``.
    """

    # Unique identifier of the task (e.g. "greet_single_file").
    task_id: str
    # Difficulty label used for lookup; this module uses "easy", "medium"
    # and "hard".
    task_level: str
    # Natural-language instructions describing what has to be implemented.
    brief: str
    # Starting workspace: file name -> initial file content.
    initial_files: dict[str, str]
    # NOTE(review): presumably the score a submission must reach for the task
    # to count as solved -- confirm against the grader.
    target_score: float
    # NOTE(review): presumably the maximum number of steps/attempts the agent
    # may spend on the task -- confirm against the environment.
    max_budget: int
    # Names of the checks associated with the task (e.g. "ruff", "mypy").
    tools: tuple[str, ...]
    # Hidden correctness tests injected by the environment during grading:
    # test file name -> test source. The agent never sees these. They are
    # written into the sandbox temp dir alongside the agent's submitted files
    # so pytest runs them automatically. This prevents "clean garbage"
    # exploits where syntactically valid but semantically wrong code scores
    # perfectly.
    #
    # A default factory is used so that a task without hidden tests gets a
    # real (empty) dict; a tuple default would break `.items()` in consumers.
    hidden_tests: dict[str, str] = field(default_factory=dict)
# -- Hidden test suites (agent never sees these) ----------------------------
_HIDDEN_EASY = {
"test_hidden_greet.py": (
"from __future__ import annotations\n"
"from main import greet\n\n"
"def test_greet_alice() -> None:\n"
' assert greet("Alice") == "Hello, Alice!"\n\n'
"def test_greet_bob() -> None:\n"
' assert greet("Bob") == "Hello, Bob!"\n\n'
"def test_greet_empty() -> None:\n"
' assert greet("") == "Hello, !"\n'
),
}
_HIDDEN_MEDIUM = {
"test_hidden_greet.py": (
"from __future__ import annotations\n"
"import pytest\n"
"from main import greet\n\n"
"def test_greet_alice() -> None:\n"
' assert greet("Alice") == "Hello, Alice!"\n\n'
"def test_greet_none_raises() -> None:\n"
" with pytest.raises(ValueError):\n"
" greet(None) # type: ignore[arg-type]\n\n"
"def test_greet_returns_str() -> None:\n"
' assert isinstance(greet("X"), str)\n'
),
}
_HIDDEN_HARD = {
"test_hidden_core.py": (
"from __future__ import annotations\n"
"import pytest\n"
"from core import greet\n\n"
"def test_greet_alice() -> None:\n"
' assert greet("Alice") == "Hello, Alice!"\n\n'
"def test_greet_bob() -> None:\n"
' assert greet("Bob") == "Hello, Bob!"\n\n'
"def test_greet_returns_str() -> None:\n"
' assert isinstance(greet("X"), str)\n\n'
"def test_greet_empty() -> None:\n"
' assert greet("") == "Hello, !"\n'
),
}
# Every task in this module lists the same four tool names.
_COMMON_TOOLS: tuple[str, ...] = ("ruff", "imports", "mypy", "pytest")

# Registry of all tasks, one per difficulty level, in the order
# easy -> medium -> hard. `get_task` scans this tuple by `task_level`.
TASKS: tuple[Task, ...] = (
    # easy: implement `greet` in a single file, starting from a stub.
    Task(
        task_id="greet_single_file",
        task_level="easy",
        brief=(
            'Implement `greet(name)` in `main.py` so that `greet("Alice")` '
            'returns `"Hello, Alice!"`. Use type hints. '
            "Keep the module under 15 lines."
        ),
        initial_files={
            "main.py": (
                "def greet(name):\n"
                " pass\n"
            ),
        },
        tools=_COMMON_TOOLS,
        max_budget=4,
        target_score=0.90,
        hidden_tests=_HIDDEN_EASY,
    ),
    # medium: add None handling to a working `greet` and write tests for it.
    Task(
        task_id="greet_with_tests",
        task_level="medium",
        brief=(
            "Extend `main.py` so that `greet(None)` raises `ValueError`, and add a "
            "`test_main.py` with pytest assertions. "
            "Keep `ruff` and `mypy --strict` clean."
        ),
        initial_files={
            "main.py": (
                "from __future__ import annotations\n"
                "\n"
                "\n"
                "def greet(name: str) -> str:\n"
                ' return f"Hello, {name}!"\n'
            ),
            "test_main.py": "",
        },
        tools=_COMMON_TOOLS,
        max_budget=6,
        target_score=0.80,
        hidden_tests=_HIDDEN_MEDIUM,
    ),
    # hard: three-file layout; `core.py` and `test_core.py` start out empty.
    Task(
        task_id="multi_file_module",
        task_level="hard",
        brief=(
            "Split into three files: `main.py` (entry), "
            "`core.py` (the greet function), `test_core.py` (tests). "
            "Every function must be type-hinted. All tests pass. "
            "`mypy --strict` clean."
        ),
        initial_files={
            "main.py": (
                "from __future__ import annotations\n"
                "\n"
                "from core import greet\n"
                "\n"
                "\n"
                'if __name__ == "__main__":\n'
                ' print(greet("World"))\n'
            ),
            "core.py": "",
            "test_core.py": "",
        },
        tools=_COMMON_TOOLS,
        max_budget=10,
        target_score=0.70,
        hidden_tests=_HIDDEN_HARD,
    ),
)
def get_task(task_level: str) -> Task:
    """Return the first task in ``TASKS`` registered under *task_level*.

    Args:
        task_level: Difficulty label to look up; compared for exact equality
            with each task's ``task_level``.

    Returns:
        The first matching ``Task``, in ``TASKS`` order.

    Raises:
        ValueError: If no task in ``TASKS`` has the requested level.
    """
    found = next(
        (candidate for candidate in TASKS if candidate.task_level == task_level),
        None,
    )
    if found is None:
        msg = f"unknown task_level: {task_level!r} (expected easy|medium|hard)"
        raise ValueError(msg)
    return found
|