# teamforge/tests/test_environment.py
# Your Name
# fix: add FastAPI REST endpoints for OpenEnv validator
# 637f42c
"""
TeamForge Integration Tests
Tests the full environment loop for all task difficulties.
"""
from __future__ import annotations
import pytest
from environment import TeamForgeEnv
from models import (
ActionStatus,
Commit,
EditFile,
GenerateReview,
PhaseState,
PlanStep,
RunLint,
RunTests,
SelfReflect,
TaskDifficulty,
)
from tasks import ALL_TASK_IDS
# ─────────────────────────────────────────────
# FIXTURES
# ─────────────────────────────────────────────
@pytest.fixture
def env():
    """Yield a fresh ``TeamForgeEnv`` and tear down its sandbox afterwards."""
    environment = TeamForgeEnv()
    yield environment
    # Everything after the yield is the fixture's teardown phase:
    # release the sandbox once the test has finished with the environment.
    environment._sandbox.teardown()
# ─────────────────────────────────────────────
# RESET TESTS
# ─────────────────────────────────────────────
class TestReset:
    """Checks on the observation returned by ``env.reset``."""

    def test_reset_returns_observation(self, env):
        """Reset returns an initial observation describing the requested task."""
        initial = env.reset("easy_bugfix_chunk_list")

        assert initial is not None
        assert initial.task_id == "easy_bugfix_chunk_list"
        assert initial.difficulty == TaskDifficulty.EASY
        assert initial.step_number == 0
        assert initial.done is False

    def test_reset_populates_repo_files(self, env):
        """The observation lists repo files, at least one with "utils" in its path."""
        initial = env.reset("easy_bugfix_chunk_list")

        assert len(initial.repo_files) > 0
        assert any("utils" in entry.path for entry in initial.repo_files)

    def test_reset_clears_previous_episode(self, env):
        """Resetting a second time starts again from step 0 with zero reward."""
        env.reset("easy_bugfix_chunk_list")
        second = env.reset("easy_bugfix_chunk_list")

        assert second.step_number == 0
        assert second.cumulative_reward == 0.0

    def test_all_task_ids_reset(self, env):
        """Every id in ``ALL_TASK_IDS`` can be reset and is echoed back."""
        for task_id in ALL_TASK_IDS:
            assert env.reset(task_id).task_id == task_id

    def test_unknown_task_raises(self, env):
        """An unregistered task id is rejected with ``KeyError``."""
        with pytest.raises(KeyError):
            env.reset("nonexistent_task")
# ─────────────────────────────────────────────
# STEP TESTS
# ─────────────────────────────────────────────
class TestStep:
    """One test per action type, each taken as the first step after a reset."""

    def test_plan_step_action(self, env):
        """A plan step advances the step counter and is recorded in the plan."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            PlanStep(
                step_number=1,
                description="Read and understand the buggy chunk_list function",
                estimated_effort="low",
            )
        )

        assert result.step_number == 1
        assert result.last_action_type == "plan_step"
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.plan) == 1

    def test_edit_file_action(self, env):
        """Editing the source file succeeds and earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        replacement = (
            '"""Fixed."""\n\n'
            'def chunk_list(lst, n):\n'
            ' return [lst[i:i+n] for i in range(0, len(lst), n)]\n'
        )
        result = env.step(
            EditFile(
                file_path="utils/list_ops.py",
                content=replacement,
                reason="Fix off-by-one bug in range stop",
            )
        )

        assert result.last_action_status == ActionStatus.SUCCESS
        assert result.reward > 0

    def test_run_tests_action(self, env):
        """Running tests attaches test results to the observation."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunTests(timeout_seconds=30))

        assert result.last_action_type == "run_tests"
        assert result.test_results is not None
        assert result.test_results.passed >= 0

    def test_run_lint_action(self, env):
        """Running lint attaches lint results with a score inside [0, 1]."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunLint(fix=False))

        assert result.last_action_type == "run_lint"
        assert result.lint_results is not None
        assert 0.0 <= result.lint_results.score <= 1.0

    def test_generate_review_action(self, env):
        """A review is stored on the observation and rewarded positively."""
        env.reset("easy_bugfix_chunk_list")
        review = GenerateReview(
            review_text=(
                "The bug was an off-by-one in the range() call. "
                "The original used range(0, len(lst)-1, n) which dropped the last chunk. "
                "Fixed by changing to range(0, len(lst), n). "
                "The chunk function now correctly handles all edge cases."
            ),
            focus_areas=["correctness", "off-by-one"],
        )
        result = env.step(review)

        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.reviews) == 1
        assert result.reward > 0

    def test_self_reflect_action(self, env):
        """A reflection is stored on the observation and rewarded positively."""
        env.reset("easy_bugfix_chunk_list")
        reflection = SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading tests",
            what_to_improve="Should have run lint before committing",
        )
        result = env.step(reflection)

        assert len(result.reflections) == 1
        assert result.reward > 0

    def test_commit_action(self, env):
        """A commit issued after an edit is reported as the last action."""
        env.reset("easy_bugfix_chunk_list")
        # Make a change first so there is something to commit.
        readme_edit = EditFile(
            file_path="README.md",
            content="# Fixed!\n",
            reason="update readme",
        )
        env.step(readme_edit)
        result = env.step(Commit(message="fix: update readme"))

        assert result.last_action_type == "commit"

    def test_step_before_reset_raises(self, env):
        """Stepping an environment that was never reset raises ``RuntimeError``."""
        with pytest.raises(RuntimeError):
            env.step(RunTests())
# ─────────────────────────────────────────────
# REWARD TESTS
# ─────────────────────────────────────────────
class TestReward:
    """Checks on per-step and cumulative reward values."""

    def test_reward_is_float(self, env):
        """The per-step reward is reported as a ``float``."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(PlanStep(step_number=1, description="Plan the fix"))

        assert isinstance(result.reward, float)

    def test_cumulative_reward_accumulates(self, env):
        """Cumulative reward equals the previous total plus the latest reward."""
        env.reset("easy_bugfix_chunk_list")
        first = env.step(PlanStep(step_number=1, description="Plan step"))
        second = env.step(PlanStep(step_number=2, description="Another plan step"))

        expected_total = first.cumulative_reward + second.reward
        # Compare with a small tolerance because the rewards are floats.
        assert abs(second.cumulative_reward - expected_total) < 1e-6

    def test_review_reward_is_positive(self, env):
        """Submitting a review yields a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        review = GenerateReview(
            focus_areas=["correctness"],
            review_text="This review is about correctness and the off-by-one bug in range.",
        )
        result = env.step(review)

        assert result.reward > 0

    def test_test_file_modification_penalised(self, env):
        """Overwriting the task's test file is met with a heavy penalty."""
        env.reset("easy_bugfix_chunk_list")
        tampering = EditFile(
            file_path="tests/test_list_ops.py",
            content="# Cleared tests\n",
            reason="removing tests",
        )
        result = env.step(tampering)

        assert result.reward < -0.2  # heavy penalty
# ─────────────────────────────────────────────
# STATE TESTS
# ─────────────────────────────────────────────
class TestState:
    """Checks on the dictionary returned by ``env.state()``."""

    def test_state_before_reset(self, env):
        """Before any reset the state reports a "not_started" status."""
        assert env.state()["status"] == "not_started"

    def test_state_after_reset(self, env):
        """After a reset the state names the task at step 0, not done."""
        env.reset("easy_bugfix_chunk_list")
        snapshot = env.state()

        assert snapshot["task_id"] == "easy_bugfix_chunk_list"
        assert snapshot["step"] == 0
        assert snapshot["done"] is False

    def test_state_tracks_steps(self, env):
        """Two plan steps are reflected in both the step and plan-step counts."""
        env.reset("easy_bugfix_chunk_list")
        for number in (1, 2):
            env.step(PlanStep(step_number=number, description=f"Step {number}"))
        snapshot = env.state()

        assert snapshot["step"] == 2
        assert snapshot["plan_steps"] == 2
# ─────────────────────────────────────────────
# FULL EPISODE SMOKE TEST (easy task)
# ─────────────────────────────────────────────
class TestFullEpisode:
    """Smoke test: manually solve the easy task without an LLM."""

    def test_easy_task_solvable(self, env):
        """Run a scripted plan/edit/test/lint/review/reflect/commit episode.

        The episode must leave no failing tests, and the final grade must
        report a perfect test pass rate, a score of at least 0.70 and
        ``passed is True``.
        """
        env.reset("easy_bugfix_chunk_list")

        # Plan
        env.step(PlanStep(
            step_number=1,
            description="Read utils/list_ops.py and identify the range() bug",
            estimated_effort="low",
        ))
        env.step(PlanStep(
            step_number=2,
            description="Fix the off-by-one: change range(0, len(lst)-1, n) to range(0, len(lst), n)",
            estimated_effort="low",
        ))

        # Fix the bug.
        # The payload is written verbatim to utils/list_ops.py, so it must be
        # a valid module on its own: function bodies need real indentation,
        # which is why the literal starts at column 0 instead of following
        # the surrounding method's indent.
        fixed_code = '''\
"""List utility operations."""

from typing import Any, List


def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:
    """Split *lst* into consecutive chunks of size *n*."""
    if n <= 0:
        raise ValueError("Chunk size must be positive")
    result = []
    for i in range(0, len(lst), n):
        result.append(lst[i : i + n])
    return result


def flatten(lst: List[List[Any]]) -> List[Any]:
    """Flatten a list of lists by one level."""
    return [item for sublist in lst for item in sublist]
'''
        env.step(EditFile(
            file_path="utils/list_ops.py",
            content=fixed_code,
            reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
        ))

        # Test
        obs = env.step(RunTests())
        assert obs.test_results is not None
        assert obs.test_results.failed == 0, f"Tests still failing: {obs.test_results.output}"

        # Lint
        env.step(RunLint(fix=False))

        # Review
        env.step(GenerateReview(
            focus_areas=["correctness", "off-by-one", "range"],
            review_text=(
                "The bug was a classic off-by-one error in the range() call. "
                "Original code used range(0, len(lst)-1, n), which stopped one index short "
                "and caused the final chunk to be silently dropped. "
                "Fix: change to range(0, len(lst), n). "
                "All edge cases now pass: empty list, chunk larger than list, exact division."
            ),
        ))

        # Reflect
        env.step(SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading the test assertions.",
            what_to_improve="Should run lint immediately after editing, not after testing.",
        ))

        # Commit (the returned observation is not needed; grading follows)
        env.step(Commit(
            message="fix(list_ops): correct off-by-one in chunk_list range() call"
        ))

        # Grade
        result = env.grade()
        assert result.test_pass_rate == 1.0
        assert result.final_score >= 0.70, f"Expected score >= 0.70, got {result.final_score}"
        assert result.passed is True