File size: 4,505 Bytes
acf77ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from __future__ import annotations

from dataclasses import dataclass, field


@dataclass(frozen=True)
class Task:
    """Immutable description of one task offered to the agent.

    Hidden correctness tests are injected by the environment during grading.
    The agent never sees these. They are written into the sandbox temp dir
    alongside the agent's submitted files so pytest runs them automatically.
    This prevents "clean garbage" exploits where syntactically valid but
    semantically wrong code scores perfectly.
    """

    # Identifier of the task, e.g. "greet_single_file".
    task_id: str
    # Difficulty label used for lookup ("easy", "medium" or "hard").
    task_level: str
    # Natural-language instructions for the task.
    brief: str
    # Starting files, mapping file name -> file content.
    initial_files: dict[str, str]
    # Score threshold for the task (presumably in [0, 1]; confirm with grader).
    target_score: float
    # Budget limit for the task (unit is defined by the consumer; confirm).
    max_budget: int
    # Names of the tools associated with the task, e.g. "ruff", "pytest".
    tools: tuple[str, ...]
    # Hidden test files, mapping file name -> file content.  Defaults to a
    # fresh empty dict per instance so the value always matches its annotation
    # and dict methods such as ``.items()`` work on tasks without hidden tests.
    hidden_tests: dict[str, str] = field(default_factory=dict)


# -- Hidden test suites (agent never sees these) ----------------------------

_HIDDEN_EASY = {
    "test_hidden_greet.py": (
        "from __future__ import annotations\n"
        "from main import greet\n\n"
        "def test_greet_alice() -> None:\n"
        '    assert greet("Alice") == "Hello, Alice!"\n\n'
        "def test_greet_bob() -> None:\n"
        '    assert greet("Bob") == "Hello, Bob!"\n\n'
        "def test_greet_empty() -> None:\n"
        '    assert greet("") == "Hello, !"\n'
    ),
}

_HIDDEN_MEDIUM = {
    "test_hidden_greet.py": (
        "from __future__ import annotations\n"
        "import pytest\n"
        "from main import greet\n\n"
        "def test_greet_alice() -> None:\n"
        '    assert greet("Alice") == "Hello, Alice!"\n\n'
        "def test_greet_none_raises() -> None:\n"
        "    with pytest.raises(ValueError):\n"
        "        greet(None)  # type: ignore[arg-type]\n\n"
        "def test_greet_returns_str() -> None:\n"
        '    assert isinstance(greet("X"), str)\n'
    ),
}

_HIDDEN_HARD = {
    "test_hidden_core.py": (
        "from __future__ import annotations\n"
        "import pytest\n"
        "from core import greet\n\n"
        "def test_greet_alice() -> None:\n"
        '    assert greet("Alice") == "Hello, Alice!"\n\n'
        "def test_greet_bob() -> None:\n"
        '    assert greet("Bob") == "Hello, Bob!"\n\n'
        "def test_greet_returns_str() -> None:\n"
        '    assert isinstance(greet("X"), str)\n\n'
        "def test_greet_empty() -> None:\n"
        '    assert greet("") == "Hello, !"\n'
    ),
}


# Registry of all available tasks, one per difficulty level, ordered from
# easiest to hardest.
TASKS: tuple[Task, ...] = (
    # Easy: a single stub file in which greet() must be implemented.
    Task(
        task_level="easy",
        task_id="greet_single_file",
        tools=("ruff", "imports", "mypy", "pytest"),
        max_budget=4,
        target_score=0.90,
        hidden_tests=_HIDDEN_EASY,
        initial_files={"main.py": "def greet(name):\n    pass\n"},
        brief=(
            'Implement `greet(name)` in `main.py` so that `greet("Alice")` '
            'returns `"Hello, Alice!"`. Use type hints. '
            "Keep the module under 15 lines."
        ),
    ),
    # Medium: working greet() provided; add None handling and a test file.
    Task(
        task_level="medium",
        task_id="greet_with_tests",
        tools=("ruff", "imports", "mypy", "pytest"),
        max_budget=6,
        target_score=0.80,
        hidden_tests=_HIDDEN_MEDIUM,
        initial_files={
            "main.py": "\n".join(
                (
                    "from __future__ import annotations",
                    "",
                    "",
                    "def greet(name: str) -> str:",
                    '    return f"Hello, {name}!"',
                    "",
                )
            ),
            "test_main.py": "",
        },
        brief=(
            "Extend `main.py` so that `greet(None)` raises `ValueError`, and add a "
            "`test_main.py` with pytest assertions. "
            "Keep `ruff` and `mypy --strict` clean."
        ),
    ),
    # Hard: entry point provided; core.py and test_core.py start empty.
    Task(
        task_level="hard",
        task_id="multi_file_module",
        tools=("ruff", "imports", "mypy", "pytest"),
        max_budget=10,
        target_score=0.70,
        hidden_tests=_HIDDEN_HARD,
        initial_files={
            "main.py": "\n".join(
                (
                    "from __future__ import annotations",
                    "",
                    "from core import greet",
                    "",
                    "",
                    'if __name__ == "__main__":',
                    '    print(greet("World"))',
                    "",
                )
            ),
            "core.py": "",
            "test_core.py": "",
        },
        brief=(
            "Split into three files: `main.py` (entry), "
            "`core.py` (the greet function), "
            "`test_core.py` (tests). "
            "Every function must be type-hinted. "
            "All tests pass. `mypy --strict` clean."
        ),
    ),
)


def get_task(task_level: str) -> Task:
    """Return the first entry of ``TASKS`` whose level equals *task_level*.

    Args:
        task_level: Difficulty label to look up.

    Returns:
        The first matching ``Task`` in ``TASKS`` order.

    Raises:
        ValueError: If no task in ``TASKS`` has the requested level.
    """
    found = next(
        (candidate for candidate in TASKS if candidate.task_level == task_level),
        None,
    )
    if found is None:
        msg = f"unknown task_level: {task_level!r} (expected easy|medium|hard)"
        raise ValueError(msg)
    return found