File size: 11,157 Bytes
637f42c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
"""
TeamForge Integration Tests
Exercises the environment loop (reset, step, reward, state, grade) on the easy
bug-fix task, and checks that every task ID in ALL_TASK_IDS can be reset.
"""

from __future__ import annotations

import pytest
from environment import TeamForgeEnv
from models import (
    ActionStatus,
    Commit,
    EditFile,
    GenerateReview,
    PhaseState,
    PlanStep,
    RunLint,
    RunTests,
    SelfReflect,
    TaskDifficulty,
)
from tasks import ALL_TASK_IDS


# ─────────────────────────────────────────────
# FIXTURES
# ─────────────────────────────────────────────

@pytest.fixture
def env():
    """Provide a fresh ``TeamForgeEnv`` per test and tear its sandbox down afterwards."""
    environment = TeamForgeEnv()
    yield environment
    # Everything after the yield runs once the test has finished.
    environment._sandbox.teardown()


# ─────────────────────────────────────────────
# RESET TESTS
# ─────────────────────────────────────────────

class TestReset:
    """Checks on the observation returned by ``env.reset``."""

    def test_reset_returns_observation(self, env):
        """Reset on the easy task yields a step-0, not-done observation."""
        initial = env.reset("easy_bugfix_chunk_list")
        assert initial is not None
        assert initial.task_id == "easy_bugfix_chunk_list"
        assert initial.difficulty == TaskDifficulty.EASY
        assert initial.step_number == 0
        assert initial.done is False

    def test_reset_populates_repo_files(self, env):
        """The initial observation lists repo files, including one under ``utils``."""
        initial = env.reset("easy_bugfix_chunk_list")
        assert len(initial.repo_files) > 0
        repo_paths = [entry.path for entry in initial.repo_files]
        utils_paths = [path for path in repo_paths if "utils" in path]
        assert utils_paths

    def test_reset_clears_previous_episode(self, env):
        """A second reset starts again from step 0 with zero cumulative reward."""
        env.reset("easy_bugfix_chunk_list")
        second = env.reset("easy_bugfix_chunk_list")
        assert second.step_number == 0
        assert second.cumulative_reward == 0.0

    def test_all_task_ids_reset(self, env):
        """Every ID in ``ALL_TASK_IDS`` can be reset on the same environment."""
        for known_id in ALL_TASK_IDS:
            observation = env.reset(known_id)
            assert observation.task_id == known_id

    def test_unknown_task_raises(self, env):
        """Resetting with an unregistered task ID raises ``KeyError``."""
        with pytest.raises(KeyError):
            env.reset("nonexistent_task")


# ─────────────────────────────────────────────
# STEP TESTS
# ─────────────────────────────────────────────

class TestStep:
    """One test per action type, each stepping a freshly reset easy task."""

    def test_plan_step_action(self, env):
        """A plan step advances the step counter and is recorded in the plan."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            PlanStep(
                step_number=1,
                description="Read and understand the buggy chunk_list function",
                estimated_effort="low",
            )
        )
        assert result.step_number == 1
        assert result.last_action_type == "plan_step"
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.plan) == 1

    def test_edit_file_action(self, env):
        """Editing the source file succeeds and earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        patched_source = '"""Fixed."""\n\ndef chunk_list(lst, n):\n    return [lst[i:i+n] for i in range(0, len(lst), n)]\n'
        result = env.step(
            EditFile(
                file_path="utils/list_ops.py",
                content=patched_source,
                reason="Fix off-by-one bug in range stop",
            )
        )
        assert result.last_action_status == ActionStatus.SUCCESS
        assert result.reward > 0

    def test_run_tests_action(self, env):
        """Running tests populates ``test_results`` with a non-negative pass count."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunTests(timeout_seconds=30))
        assert result.last_action_type == "run_tests"
        assert result.test_results is not None
        assert result.test_results.passed >= 0

    def test_run_lint_action(self, env):
        """Running lint populates ``lint_results`` with a score in [0, 1]."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunLint(fix=False))
        assert result.last_action_type == "run_lint"
        assert result.lint_results is not None
        assert 0.0 <= result.lint_results.score <= 1.0

    def test_generate_review_action(self, env):
        """A review is stored on the observation and earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        review_body = (
            "The bug was an off-by-one in the range() call. "
            "The original used range(0, len(lst)-1, n) which dropped the last chunk. "
            "Fixed by changing to range(0, len(lst), n). "
            "The chunk function now correctly handles all edge cases."
        )
        result = env.step(
            GenerateReview(
                focus_areas=["correctness", "off-by-one"],
                review_text=review_body,
            )
        )
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.reviews) == 1
        assert result.reward > 0

    def test_self_reflect_action(self, env):
        """A reflection is stored on the observation and earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            SelfReflect(
                what_went_well="Identified the off-by-one error quickly by reading tests",
                what_to_improve="Should have run lint before committing",
            )
        )
        assert len(result.reflections) == 1
        assert result.reward > 0

    def test_commit_action(self, env):
        """Committing after an edit is reported as a ``commit`` action."""
        env.reset("easy_bugfix_chunk_list")
        # Make a change first so there is something to commit.
        readme_edit = EditFile(
            file_path="README.md",
            content="# Fixed!\n",
            reason="update readme",
        )
        env.step(readme_edit)
        result = env.step(Commit(message="fix: update readme"))
        assert result.last_action_type == "commit"

    def test_step_before_reset_raises(self, env):
        """Stepping an environment that was never reset raises ``RuntimeError``."""
        with pytest.raises(RuntimeError):
            env.step(RunTests())


# ─────────────────────────────────────────────
# REWARD TESTS
# ─────────────────────────────────────────────

class TestReward:
    """Checks on the per-step and cumulative reward reported by ``env.step``."""

    def test_reward_is_float(self, env):
        """The step reward is a ``float``."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(PlanStep(step_number=1, description="Plan the fix"))
        assert isinstance(result.reward, float)

    def test_cumulative_reward_accumulates(self, env):
        """Cumulative reward after step 2 equals the step-1 total plus step 2's reward."""
        env.reset("easy_bugfix_chunk_list")
        first = env.step(PlanStep(step_number=1, description="Plan step"))
        second = env.step(PlanStep(step_number=2, description="Another plan step"))
        expected_total = first.cumulative_reward + second.reward
        # Compare with a small tolerance rather than exact float equality.
        assert abs(second.cumulative_reward - expected_total) < 1e-6

    def test_review_reward_is_positive(self, env):
        """Submitting a review earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        review = GenerateReview(
            review_text="This review is about correctness and the off-by-one bug in range.",
            focus_areas=["correctness"],
        )
        result = env.step(review)
        assert result.reward > 0

    def test_test_file_modification_penalised(self, env):
        """Overwriting the task's test file yields a reward below -0.2."""
        env.reset("easy_bugfix_chunk_list")
        tampering_edit = EditFile(
            file_path="tests/test_list_ops.py",
            content="# Cleared tests\n",
            reason="removing tests",
        )
        result = env.step(tampering_edit)
        assert result.reward < -0.2  # heavy penalty


# ─────────────────────────────────────────────
# STATE TESTS
# ─────────────────────────────────────────────

class TestState:
    """Checks on the dictionary returned by ``env.state()``."""

    def test_state_before_reset(self, env):
        """Before any reset the state reports ``not_started``."""
        snapshot = env.state()
        assert snapshot["status"] == "not_started"

    def test_state_after_reset(self, env):
        """After reset the state names the task, sits at step 0, and is not done."""
        env.reset("easy_bugfix_chunk_list")
        snapshot = env.state()
        assert snapshot["task_id"] == "easy_bugfix_chunk_list"
        assert snapshot["step"] == 0
        assert snapshot["done"] is False

    def test_state_tracks_steps(self, env):
        """Two plan steps are reflected in both the step and plan-step counters."""
        env.reset("easy_bugfix_chunk_list")
        first_plan = PlanStep(step_number=1, description="Step 1")
        second_plan = PlanStep(step_number=2, description="Step 2")
        env.step(first_plan)
        env.step(second_plan)
        snapshot = env.state()
        assert snapshot["step"] == 2
        assert snapshot["plan_steps"] == 2


# ─────────────────────────────────────────────
# FULL EPISODE SMOKE TEST (easy task)
# ─────────────────────────────────────────────

class TestFullEpisode:
    """Smoke test: manually solve the easy task without an LLM."""

    def test_easy_task_solvable(self, env):
        """Drive a scripted episode through the easy task and check the grade.

        The episode plans, edits ``utils/list_ops.py``, runs tests and lint,
        submits a review and a reflection, commits, and finally calls
        ``env.grade()``. The test expects every test to pass, a final score of
        at least 0.70, and ``passed`` to be true.
        """
        # Only the side effect of reset is needed; its observation is not inspected.
        env.reset("easy_bugfix_chunk_list")

        # Plan
        env.step(PlanStep(
            step_number=1,
            description="Read utils/list_ops.py and identify the range() bug",
            estimated_effort="low",
        ))
        env.step(PlanStep(
            step_number=2,
            description="Fix the off-by-one: change range(0, len(lst)-1, n) to range(0, len(lst), n)",
            estimated_effort="low",
        ))

        # Fix the bug: replacement module written verbatim into the sandbox file.
        fixed_code = '''\
"""List utility operations."""
from typing import Any, List


def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:
    """Split *lst* into consecutive chunks of size *n*."""
    if n <= 0:
        raise ValueError("Chunk size must be positive")
    result = []
    for i in range(0, len(lst), n):
        result.append(lst[i : i + n])
    return result


def flatten(lst: List[List[Any]]) -> List[Any]:
    """Flatten a list of lists by one level."""
    return [item for sublist in lst for item in sublist]
'''
        env.step(EditFile(
            file_path="utils/list_ops.py",
            content=fixed_code,
            reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
        ))

        # Test: the edited code must leave no failing tests.
        test_obs = env.step(RunTests())
        assert test_obs.test_results is not None
        assert test_obs.test_results.failed == 0, (
            f"Tests still failing: {test_obs.test_results.output}"
        )

        # Lint
        env.step(RunLint(fix=False))

        # Review
        env.step(GenerateReview(
            focus_areas=["correctness", "off-by-one", "range"],
            review_text=(
                "The bug was a classic off-by-one error in the range() call. "
                "Original code used range(0, len(lst)-1, n), which stopped one index short "
                "and caused the final chunk to be silently dropped. "
                "Fix: change to range(0, len(lst), n). "
                "All edge cases now pass: empty list, chunk larger than list, exact division."
            ),
        ))

        # Reflect
        env.step(SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading the test assertions.",
            what_to_improve="Should run lint immediately after editing, not after testing.",
        ))

        # Commit: the returned observation is not inspected; the grade below is the check.
        env.step(Commit(
            message="fix(list_ops): correct off-by-one in chunk_list range() call"
        ))

        # Grade
        result = env.grade()
        assert result.test_pass_rate == 1.0
        assert result.final_score >= 0.70, f"Expected score >= 0.70, got {result.final_score}"
        assert result.passed is True