Spaces:
Sleeping
Sleeping
"""
TeamForge Integration Tests

Tests the full environment loop for all task difficulties.
"""
| from __future__ import annotations | |
| import pytest | |
| from environment import TeamForgeEnv | |
| from models import ( | |
| ActionStatus, | |
| Commit, | |
| EditFile, | |
| GenerateReview, | |
| PhaseState, | |
| PlanStep, | |
| RunLint, | |
| RunTests, | |
| SelfReflect, | |
| TaskDifficulty, | |
| ) | |
| from tasks import ALL_TASK_IDS | |
# ─────────────────────────────────────────────
# FIXTURES
# ─────────────────────────────────────────────
@pytest.fixture
def env():
    """Yield a fresh TeamForgeEnv for each test, tearing down its sandbox after.

    Without the ``@pytest.fixture`` decorator, every test requesting ``env``
    would fail collection with "fixture 'env' not found".
    """
    e = TeamForgeEnv()
    yield e
    # Teardown sandbox after each test (runs even if the test failed,
    # because pytest resumes the generator during fixture finalization).
    e._sandbox.teardown()
# ─────────────────────────────────────────────
# RESET TESTS
# ─────────────────────────────────────────────
class TestReset:
    """Behaviour of TeamForgeEnv.reset() across tasks and repeated calls."""

    def test_reset_returns_observation(self, env):
        first = env.reset("easy_bugfix_chunk_list")
        assert first is not None
        assert first.task_id == "easy_bugfix_chunk_list"
        assert first.difficulty == TaskDifficulty.EASY
        assert first.step_number == 0
        assert first.done is False

    def test_reset_populates_repo_files(self, env):
        snapshot = env.reset("easy_bugfix_chunk_list")
        assert snapshot.repo_files
        assert any("utils" in entry.path for entry in snapshot.repo_files)

    def test_reset_clears_previous_episode(self, env):
        env.reset("easy_bugfix_chunk_list")
        fresh = env.reset("easy_bugfix_chunk_list")
        assert fresh.step_number == 0
        assert fresh.cumulative_reward == 0.0

    def test_all_task_ids_reset(self, env):
        for identifier in ALL_TASK_IDS:
            assert env.reset(identifier).task_id == identifier

    def test_unknown_task_raises(self, env):
        with pytest.raises(KeyError):
            env.reset("nonexistent_task")
# ─────────────────────────────────────────────
# STEP TESTS
# ─────────────────────────────────────────────
class TestStep:
    """Exercise env.step() once per action type on the easy bugfix task."""

    def test_plan_step_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            PlanStep(
                step_number=1,
                description="Read and understand the buggy chunk_list function",
                estimated_effort="low",
            )
        )
        assert result.step_number == 1
        assert result.last_action_type == "plan_step"
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.plan) == 1

    def test_edit_file_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            EditFile(
                file_path="utils/list_ops.py",
                content='"""Fixed."""\n\ndef chunk_list(lst, n):\n    return [lst[i:i+n] for i in range(0, len(lst), n)]\n',
                reason="Fix off-by-one bug in range stop",
            )
        )
        assert result.last_action_status == ActionStatus.SUCCESS
        assert result.reward > 0

    def test_run_tests_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunTests(timeout_seconds=30))
        assert result.last_action_type == "run_tests"
        assert result.test_results is not None
        assert result.test_results.passed >= 0

    def test_run_lint_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunLint(fix=False))
        assert result.last_action_type == "run_lint"
        assert result.lint_results is not None
        assert 0.0 <= result.lint_results.score <= 1.0

    def test_generate_review_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        review = GenerateReview(
            focus_areas=["correctness", "off-by-one"],
            review_text=(
                "The bug was an off-by-one in the range() call. "
                "The original used range(0, len(lst)-1, n) which dropped the last chunk. "
                "Fixed by changing to range(0, len(lst), n). "
                "The chunk function now correctly handles all edge cases."
            ),
        )
        result = env.step(review)
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.reviews) == 1
        assert result.reward > 0

    def test_self_reflect_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        reflection = SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading tests",
            what_to_improve="Should have run lint before committing",
        )
        result = env.step(reflection)
        assert len(result.reflections) == 1
        assert result.reward > 0

    def test_commit_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        # First make a change so the commit has something to record.
        edit = EditFile(
            file_path="README.md",
            content="# Fixed!\n",
            reason="update readme",
        )
        env.step(edit)
        result = env.step(Commit(message="fix: update readme"))
        assert result.last_action_type == "commit"

    def test_step_before_reset_raises(self, env):
        # Stepping an un-reset environment is a usage error, not a no-op.
        with pytest.raises(RuntimeError):
            env.step(RunTests())
# ─────────────────────────────────────────────
# REWARD TESTS
# ─────────────────────────────────────────────
class TestReward:
    """Reward shaping: types, accumulation, and penalties."""

    def test_reward_is_float(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(PlanStep(step_number=1, description="Plan the fix"))
        assert isinstance(result.reward, float)

    def test_cumulative_reward_accumulates(self, env):
        env.reset("easy_bugfix_chunk_list")
        first = env.step(PlanStep(step_number=1, description="Plan step"))
        second = env.step(PlanStep(step_number=2, description="Another plan step"))
        expected_total = first.cumulative_reward + second.reward
        assert abs(second.cumulative_reward - expected_total) < 1e-6

    def test_review_reward_is_positive(self, env):
        env.reset("easy_bugfix_chunk_list")
        review = GenerateReview(
            review_text="This review is about correctness and the off-by-one bug in range.",
            focus_areas=["correctness"],
        )
        assert env.step(review).reward > 0

    def test_test_file_modification_penalised(self, env):
        env.reset("easy_bugfix_chunk_list")
        sabotage = EditFile(
            file_path="tests/test_list_ops.py",
            content="# Cleared tests\n",
            reason="removing tests",
        )
        # Editing the test suite itself must draw a heavy penalty.
        assert env.step(sabotage).reward < -0.2
# ─────────────────────────────────────────────
# STATE TESTS
# ─────────────────────────────────────────────
class TestState:
    """The env.state() diagnostic snapshot dictionary."""

    def test_state_before_reset(self, env):
        snapshot = env.state()
        assert snapshot["status"] == "not_started"

    def test_state_after_reset(self, env):
        env.reset("easy_bugfix_chunk_list")
        snapshot = env.state()
        assert snapshot["task_id"] == "easy_bugfix_chunk_list"
        assert snapshot["step"] == 0
        assert snapshot["done"] is False

    def test_state_tracks_steps(self, env):
        env.reset("easy_bugfix_chunk_list")
        env.step(PlanStep(step_number=1, description="Step 1"))
        env.step(PlanStep(step_number=2, description="Step 2"))
        snapshot = env.state()
        assert snapshot["step"] == 2
        assert snapshot["plan_steps"] == 2
# ─────────────────────────────────────────────
# FULL EPISODE SMOKE TEST (easy task)
# ─────────────────────────────────────────────
class TestFullEpisode:
    """Smoke test: manually solve the easy task without an LLM."""

    def test_easy_task_solvable(self, env):
        """Drive a full plan → edit → test → lint → review → reflect → commit
        episode with hand-written actions, then assert the grader passes it.

        This pins the end-to-end contract: a correct fix plus the full action
        sequence must reach test_pass_rate == 1.0 and final_score >= 0.70.
        """
        obs = env.reset("easy_bugfix_chunk_list")
        # Plan
        env.step(PlanStep(
            step_number=1,
            description="Read utils/list_ops.py and identify the range() bug",
            estimated_effort="low",
        ))
        env.step(PlanStep(
            step_number=2,
            description="Fix the off-by-one: change range(0, len(lst)-1, n) to range(0, len(lst), n)",
            estimated_effort="low",
        ))
        # Fix the bug — this string is the full corrected module source that
        # EditFile writes into the sandbox repo; it must be valid Python.
        fixed_code = '''\
"""List utility operations."""
from typing import Any, List
def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:
    """Split *lst* into consecutive chunks of size *n*."""
    if n <= 0:
        raise ValueError("Chunk size must be positive")
    result = []
    for i in range(0, len(lst), n):
        result.append(lst[i : i + n])
    return result
def flatten(lst: List[List[Any]]) -> List[Any]:
    """Flatten a list of lists by one level."""
    return [item for sublist in lst for item in sublist]
'''
        env.step(EditFile(
            file_path="utils/list_ops.py",
            content=fixed_code,
            reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
        ))
        # Test — the fix must make the sandbox test suite fully green.
        obs = env.step(RunTests())
        assert obs.test_results is not None
        assert obs.test_results.failed == 0, f"Tests still failing: {obs.test_results.output}"
        # Lint
        env.step(RunLint(fix=False))
        # Review
        env.step(GenerateReview(
            focus_areas=["correctness", "off-by-one", "range"],
            review_text=(
                "The bug was a classic off-by-one error in the range() call. "
                "Original code used range(0, len(lst)-1, n), which stopped one index short "
                "and caused the final chunk to be silently dropped. "
                "Fix: change to range(0, len(lst), n). "
                "All edge cases now pass: empty list, chunk larger than list, exact division."
            ),
        ))
        # Reflect
        env.step(SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading the test assertions.",
            what_to_improve="Should run lint immediately after editing, not after testing.",
        ))
        # Commit
        obs = env.step(Commit(
            message="fix(list_ops): correct off-by-one in chunk_list range() call"
        ))
        # Grade — final acceptance thresholds for the episode.
        result = env.grade()
        assert result.test_pass_rate == 1.0
        assert result.final_score >= 0.70, f"Expected score >= 0.70, got {result.final_score}"
        assert result.passed is True