from __future__ import annotations

from dataclasses import dataclass, field
|
|
|
|
@dataclass(frozen=True)
class Task:
    """Immutable specification of one coding task for the agent.

    Bundles the brief shown to the agent, the files it starts from, the
    grading threshold and budget, the tools it may invoke, and the hidden
    correctness tests injected by the environment during grading.  The
    agent never sees ``hidden_tests``: they are written into the sandbox
    temp dir alongside the agent's submitted files so pytest runs them
    automatically.  This prevents "clean garbage" exploits where
    syntactically valid but semantically wrong code scores perfectly.
    """

    task_id: str
    task_level: str  # difficulty tier: "easy" | "medium" | "hard" (see get_task)
    brief: str  # instructions shown to the agent
    initial_files: dict[str, str]  # filename -> starting file contents
    target_score: float  # NOTE(review): presumably the minimum passing score — confirm with the grader
    max_budget: int  # NOTE(review): presumably a tool-call/step budget — confirm with the grader
    tools: tuple[str, ...]  # tool names the agent may run (e.g. "ruff", "pytest")
    # FIX: the default used to be `()` — a *tuple*, contradicting the
    # dict[str, str] annotation and breaking any caller that iterates
    # .items() on a task without hidden tests.  dataclasses rejects a
    # mutable `{}` default, so use default_factory instead.
    hidden_tests: dict[str, str] = field(default_factory=dict)
|
|
|
|
| |
|
|
# Hidden tests for the "easy" tier: pin the exact greet() output, including
# the empty-name edge case.
_HIDDEN_EASY = {
    "test_hidden_greet.py": '''\
from __future__ import annotations
from main import greet

def test_greet_alice() -> None:
    assert greet("Alice") == "Hello, Alice!"

def test_greet_bob() -> None:
    assert greet("Bob") == "Hello, Bob!"

def test_greet_empty() -> None:
    assert greet("") == "Hello, !"
''',
}
|
|
# Hidden tests for the "medium" tier: require greet(None) to raise
# ValueError in addition to the happy path.
_HIDDEN_MEDIUM = {
    "test_hidden_greet.py": '''\
from __future__ import annotations
import pytest
from main import greet

def test_greet_alice() -> None:
    assert greet("Alice") == "Hello, Alice!"

def test_greet_none_raises() -> None:
    with pytest.raises(ValueError):
        greet(None)  # type: ignore[arg-type]

def test_greet_returns_str() -> None:
    assert isinstance(greet("X"), str)
''',
}
|
|
# Hidden tests for the "hard" tier: greet() must be importable from `core`
# (the task requires the function to live in core.py).
# FIX: the embedded source previously did `import pytest` without ever
# using it; that unused import is dead code in the injected file and would
# trip ruff (F401) in the graded sandbox, so it is removed.
_HIDDEN_HARD = {
    "test_hidden_core.py": (
        "from __future__ import annotations\n"
        "from core import greet\n\n"
        "def test_greet_alice() -> None:\n"
        '    assert greet("Alice") == "Hello, Alice!"\n\n'
        "def test_greet_bob() -> None:\n"
        '    assert greet("Bob") == "Hello, Bob!"\n\n'
        "def test_greet_returns_str() -> None:\n"
        '    assert isinstance(greet("X"), str)\n\n'
        "def test_greet_empty() -> None:\n"
        '    assert greet("") == "Hello, !"\n'
    ),
}
|
|
|
|
# Task registry: exactly one task per difficulty tier; ``get_task`` looks
# entries up by ``task_level``.  Note ``target_score`` decreases and
# ``max_budget`` increases with difficulty.
TASKS: tuple[Task, ...] = (
    # Easy tier: implement greet() in a single file from a `pass` stub.
    Task(
        task_id="greet_single_file",
        task_level="easy",
        brief=(
            "Implement `greet(name)` in `main.py` so that `greet(\"Alice\")` returns "
            '`"Hello, Alice!"`. Use type hints. Keep the module under 15 lines.'
        ),
        initial_files={"main.py": "def greet(name):\n    pass\n"},
        target_score=0.90,
        max_budget=4,
        tools=("ruff", "imports", "mypy", "pytest"),
        hidden_tests=_HIDDEN_EASY,
    ),
    # Medium tier: add error handling (greet(None) -> ValueError) plus a
    # visible test file, while staying lint- and type-clean.
    Task(
        task_id="greet_with_tests",
        task_level="medium",
        brief=(
            "Extend `main.py` so that `greet(None)` raises `ValueError`, "
            "and add a `test_main.py` with pytest assertions. Keep `ruff` and "
            "`mypy --strict` clean."
        ),
        initial_files={
            "main.py": (
                "from __future__ import annotations\n\n\n"
                "def greet(name: str) -> str:\n"
                '    return f"Hello, {name}!"\n'
            ),
            "test_main.py": "",  # intentionally empty: the agent must write the tests
        },
        target_score=0.80,
        max_budget=6,
        tools=("ruff", "imports", "mypy", "pytest"),
        hidden_tests=_HIDDEN_MEDIUM,
    ),
    # Hard tier: split into entry / library / tests, fully type-hinted.
    Task(
        task_id="multi_file_module",
        task_level="hard",
        brief=(
            "Split into three files: `main.py` (entry), `core.py` (the greet "
            "function), `test_core.py` (tests). Every function must be type-hinted. "
            "All tests pass. `mypy --strict` clean."
        ),
        initial_files={
            "main.py": (
                "from __future__ import annotations\n\nfrom core import greet\n\n\n"
                'if __name__ == "__main__":\n'
                '    print(greet("World"))\n'
            ),
            "core.py": "",  # agent must create the typed greet() here
            "test_core.py": "",  # agent must write the tests here
        },
        target_score=0.70,
        max_budget=10,
        tools=("ruff", "imports", "mypy", "pytest"),
        hidden_tests=_HIDDEN_HARD,
    ),
)
|
|
|
|
def get_task(task_level: str) -> Task:
    """Return the registered task for a difficulty tier.

    Args:
        task_level: One of ``"easy"``, ``"medium"`` or ``"hard"``.

    Returns:
        The matching :class:`Task` from ``TASKS``.

    Raises:
        ValueError: If no registered task has that level.
    """
    found = next((task for task in TASKS if task.task_level == task_level), None)
    if found is None:
        msg = f"unknown task_level: {task_level!r} (expected easy|medium|hard)"
        raise ValueError(msg)
    return found
|
|