""" TeamForge Integration Tests Tests the full environment loop for all task difficulties. """ from __future__ import annotations import pytest from environment import TeamForgeEnv from models import ( ActionStatus, Commit, EditFile, GenerateReview, PhaseState, PlanStep, RunLint, RunTests, SelfReflect, TaskDifficulty, ) from tasks import ALL_TASK_IDS # ───────────────────────────────────────────── # FIXTURES # ───────────────────────────────────────────── @pytest.fixture def env(): e = TeamForgeEnv() yield e # Teardown sandbox after each test e._sandbox.teardown() # ───────────────────────────────────────────── # RESET TESTS # ───────────────────────────────────────────── class TestReset: def test_reset_returns_observation(self, env): obs = env.reset("easy_bugfix_chunk_list") assert obs is not None assert obs.task_id == "easy_bugfix_chunk_list" assert obs.difficulty == TaskDifficulty.EASY assert obs.step_number == 0 assert obs.done is False def test_reset_populates_repo_files(self, env): obs = env.reset("easy_bugfix_chunk_list") assert len(obs.repo_files) > 0 paths = [f.path for f in obs.repo_files] assert any("utils" in p for p in paths) def test_reset_clears_previous_episode(self, env): env.reset("easy_bugfix_chunk_list") obs = env.reset("easy_bugfix_chunk_list") assert obs.step_number == 0 assert obs.cumulative_reward == 0.0 def test_all_task_ids_reset(self, env): for task_id in ALL_TASK_IDS: obs = env.reset(task_id) assert obs.task_id == task_id def test_unknown_task_raises(self, env): with pytest.raises(KeyError): env.reset("nonexistent_task") # ───────────────────────────────────────────── # STEP TESTS # ───────────────────────────────────────────── class TestStep: def test_plan_step_action(self, env): env.reset("easy_bugfix_chunk_list") action = PlanStep( step_number=1, description="Read and understand the buggy chunk_list function", estimated_effort="low", ) obs = env.step(action) assert obs.step_number == 1 assert obs.last_action_type == "plan_step" assert obs.last_action_status == ActionStatus.SUCCESS assert len(obs.plan) == 1 def test_edit_file_action(self, env): env.reset("easy_bugfix_chunk_list") action = EditFile( file_path="utils/list_ops.py", content='"""Fixed."""\n\ndef chunk_list(lst, n):\n return [lst[i:i+n] for i in range(0, len(lst), n)]\n', reason="Fix off-by-one bug in range stop", ) obs = env.step(action) assert obs.last_action_status == ActionStatus.SUCCESS assert obs.reward > 0 def test_run_tests_action(self, env): env.reset("easy_bugfix_chunk_list") action = RunTests(timeout_seconds=30) obs = env.step(action) assert obs.last_action_type == "run_tests" assert obs.test_results is not None assert obs.test_results.passed >= 0 def test_run_lint_action(self, env): env.reset("easy_bugfix_chunk_list") action = RunLint(fix=False) obs = env.step(action) assert obs.last_action_type == "run_lint" assert obs.lint_results is not None assert 0.0 <= obs.lint_results.score <= 1.0 def test_generate_review_action(self, env): env.reset("easy_bugfix_chunk_list") action = GenerateReview( focus_areas=["correctness", "off-by-one"], review_text=( "The bug was an off-by-one in the range() call. " "The original used range(0, len(lst)-1, n) which dropped the last chunk. " "Fixed by changing to range(0, len(lst), n). " "The chunk function now correctly handles all edge cases." 
# ─────────────────────────────────────────────
# STEP TESTS
# ─────────────────────────────────────────────


class TestStep:
    def test_plan_step_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        action = PlanStep(
            step_number=1,
            description="Read and understand the buggy chunk_list function",
            estimated_effort="low",
        )
        obs = env.step(action)
        assert obs.step_number == 1
        assert obs.last_action_type == "plan_step"
        assert obs.last_action_status == ActionStatus.SUCCESS
        assert len(obs.plan) == 1

    def test_edit_file_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        action = EditFile(
            file_path="utils/list_ops.py",
            content=(
                '"""Fixed."""\n'
                "\n"
                "def chunk_list(lst, n):\n"
                "    return [lst[i:i+n] for i in range(0, len(lst), n)]\n"
            ),
            reason="Fix off-by-one bug in range stop",
        )
        obs = env.step(action)
        assert obs.last_action_status == ActionStatus.SUCCESS
        assert obs.reward > 0

    def test_run_tests_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        action = RunTests(timeout_seconds=30)
        obs = env.step(action)
        assert obs.last_action_type == "run_tests"
        assert obs.test_results is not None
        assert obs.test_results.passed >= 0

    def test_run_lint_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        action = RunLint(fix=False)
        obs = env.step(action)
        assert obs.last_action_type == "run_lint"
        assert obs.lint_results is not None
        assert 0.0 <= obs.lint_results.score <= 1.0

    def test_generate_review_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        action = GenerateReview(
            focus_areas=["correctness", "off-by-one"],
            review_text=(
                "The bug was an off-by-one in the range() call. "
                "The original used range(0, len(lst)-1, n) which dropped the last chunk. "
                "Fixed by changing to range(0, len(lst), n). "
                "The chunk function now correctly handles all edge cases."
            ),
        )
        obs = env.step(action)
        assert obs.last_action_status == ActionStatus.SUCCESS
        assert len(obs.reviews) == 1
        assert obs.reward > 0

    def test_self_reflect_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        action = SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading tests",
            what_to_improve="Should have run lint before committing",
        )
        obs = env.step(action)
        assert len(obs.reflections) == 1
        assert obs.reward > 0

    def test_commit_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        # First make a change
        env.step(EditFile(
            file_path="README.md",
            content="# Fixed!\n",
            reason="update readme",
        ))
        obs = env.step(Commit(message="fix: update readme"))
        assert obs.last_action_type == "commit"

    def test_step_before_reset_raises(self, env):
        with pytest.raises(RuntimeError):
            env.step(RunTests())


# ─────────────────────────────────────────────
# REWARD TESTS
# ─────────────────────────────────────────────


class TestReward:
    def test_reward_is_float(self, env):
        env.reset("easy_bugfix_chunk_list")
        obs = env.step(PlanStep(step_number=1, description="Plan the fix"))
        assert isinstance(obs.reward, float)

    def test_cumulative_reward_accumulates(self, env):
        env.reset("easy_bugfix_chunk_list")
        obs1 = env.step(PlanStep(step_number=1, description="Plan step"))
        obs2 = env.step(PlanStep(step_number=2, description="Another plan step"))
        assert abs(obs2.cumulative_reward - (obs1.cumulative_reward + obs2.reward)) < 1e-6

    def test_review_reward_is_positive(self, env):
        env.reset("easy_bugfix_chunk_list")
        obs = env.step(GenerateReview(
            review_text="This review is about correctness and the off-by-one bug in range.",
            focus_areas=["correctness"],
        ))
        assert obs.reward > 0

    def test_test_file_modification_penalised(self, env):
        env.reset("easy_bugfix_chunk_list")
        obs = env.step(EditFile(
            file_path="tests/test_list_ops.py",
            content="# Cleared tests\n",
            reason="removing tests",
        ))
        assert obs.reward < -0.2  # heavy penalty


# ─────────────────────────────────────────────
# STATE TESTS
# ─────────────────────────────────────────────


class TestState:
    def test_state_before_reset(self, env):
        s = env.state()
        assert s["status"] == "not_started"

    def test_state_after_reset(self, env):
        env.reset("easy_bugfix_chunk_list")
        s = env.state()
        assert s["task_id"] == "easy_bugfix_chunk_list"
        assert s["step"] == 0
        assert s["done"] is False

    def test_state_tracks_steps(self, env):
        env.reset("easy_bugfix_chunk_list")
        env.step(PlanStep(step_number=1, description="Step 1"))
        env.step(PlanStep(step_number=2, description="Step 2"))
        s = env.state()
        assert s["step"] == 2
        assert s["plan_steps"] == 2
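
# Hedged sketch (not part of the original suite): the accumulation check in
# TestReward.test_cumulative_reward_accumulates generalises to any action
# sequence. The helper below replays an arbitrary sequence and asserts the
# invariant at every step; `actions` is a hypothetical parameter supplied by
# the caller, and only env.reset/env.step from the suite above are assumed.
def assert_cumulative_reward_invariant(env, task_id, actions):
    """Replay *actions* and check cumulative_reward tracks the running sum."""
    env.reset(task_id)
    running_total = 0.0
    for action in actions:
        obs = env.step(action)
        running_total += obs.reward
        assert abs(obs.cumulative_reward - running_total) < 1e-6


class TestRewardInvariant:
    def test_invariant_over_plan_steps(self, env):
        assert_cumulative_reward_invariant(
            env,
            "easy_bugfix_chunk_list",
            [PlanStep(step_number=i, description=f"Step {i}") for i in (1, 2, 3)],
        )
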
# ─────────────────────────────────────────────
# FULL EPISODE SMOKE TEST (easy task)
# ─────────────────────────────────────────────


class TestFullEpisode:
    """Smoke test: manually solve the easy task without an LLM."""

    def test_easy_task_solvable(self, env):
        env.reset("easy_bugfix_chunk_list")

        # Plan
        env.step(PlanStep(
            step_number=1,
            description="Read utils/list_ops.py and identify the range() bug",
            estimated_effort="low",
        ))
        env.step(PlanStep(
            step_number=2,
            description=(
                "Fix the off-by-one: change range(0, len(lst)-1, n) "
                "to range(0, len(lst), n)"
            ),
            estimated_effort="low",
        ))

        # Fix the bug
        fixed_code = '''\
"""List utility operations."""

from typing import Any, List


def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:
    """Split *lst* into consecutive chunks of size *n*."""
    if n <= 0:
        raise ValueError("Chunk size must be positive")
    result = []
    for i in range(0, len(lst), n):
        result.append(lst[i : i + n])
    return result


def flatten(lst: List[List[Any]]) -> List[Any]:
    """Flatten a list of lists by one level."""
    return [item for sublist in lst for item in sublist]
'''
        env.step(EditFile(
            file_path="utils/list_ops.py",
            content=fixed_code,
            reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
        ))

        # Test
        obs = env.step(RunTests())
        assert obs.test_results is not None
        assert obs.test_results.failed == 0, (
            f"Tests still failing: {obs.test_results.output}"
        )

        # Lint
        env.step(RunLint(fix=False))

        # Review
        env.step(GenerateReview(
            focus_areas=["correctness", "off-by-one", "range"],
            review_text=(
                "The bug was a classic off-by-one error in the range() call. "
                "Original code used range(0, len(lst)-1, n), which stopped one index short "
                "and caused the final chunk to be silently dropped. "
                "Fix: change to range(0, len(lst), n). "
                "All edge cases now pass: empty list, chunk larger than list, exact division."
            ),
        ))

        # Reflect
        env.step(SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading the test assertions.",
            what_to_improve="Should run lint immediately after editing, not after testing.",
        ))

        # Commit
        env.step(Commit(
            message="fix(list_ops): correct off-by-one in chunk_list range() call"
        ))

        # Grade
        result = env.grade()
        assert result.test_pass_rate == 1.0
        assert result.final_score >= 0.70, (
            f"Expected score >= 0.70, got {result.final_score}"
        )
        assert result.passed is True
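
# Hedged sketch (not part of the original suite): a negative-path counterpart
# to the smoke test above. It assumes grade() may be called on an unfinished
# episode, and that the freshly reset repo fails at least one test; both are
# plausible for a bugfix task but not pinned down anywhere in the suite above.
class TestGradeUnsolved:
    def test_unsolved_task_scores_low(self, env):
        env.reset("easy_bugfix_chunk_list")
        obs = env.step(RunTests())
        # The starting repo ships the off-by-one bug, so some tests should fail.
        assert obs.test_results.failed > 0
        result = env.grade()
        assert result.test_pass_rate < 1.0
        assert result.passed is False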