Spaces:
Sleeping
Sleeping
"""
TeamForge Integration Tests

Tests the full environment loop for all task difficulties.
"""
| from __future__ import annotations | |
| import pytest | |
| from environment import TeamForgeEnv | |
| from models import ( | |
| ActionStatus, | |
| Commit, | |
| EditFile, | |
| GenerateReview, | |
| PhaseState, | |
| PlanStep, | |
| RunLint, | |
| RunTests, | |
| SelfReflect, | |
| TaskDifficulty, | |
| ) | |
| from tasks import ALL_TASK_IDS | |
# ─────────────────────────────────────────────
# FIXTURES
# ─────────────────────────────────────────────
@pytest.fixture
def env():
    """Yield a fresh TeamForgeEnv for each test, tearing down its sandbox after.

    Without the ``@pytest.fixture`` decorator, every test requesting ``env``
    would fail collection with "fixture 'env' not found".
    """
    e = TeamForgeEnv()
    yield e
    # Teardown sandbox after each test (runs even if the test failed,
    # because pytest resumes the generator during fixture finalization).
    e._sandbox.teardown()
# ─────────────────────────────────────────────
# RESET TESTS
# ─────────────────────────────────────────────
class TestReset:
    """Behaviour of TeamForgeEnv.reset() across tasks and repeated calls."""

    def test_reset_returns_observation(self, env):
        first = env.reset("easy_bugfix_chunk_list")
        assert first is not None
        assert first.task_id == "easy_bugfix_chunk_list"
        assert first.difficulty == TaskDifficulty.EASY
        assert first.step_number == 0
        assert first.done is False

    def test_reset_populates_repo_files(self, env):
        snapshot = env.reset("easy_bugfix_chunk_list")
        assert snapshot.repo_files
        assert any("utils" in entry.path for entry in snapshot.repo_files)

    def test_reset_clears_previous_episode(self, env):
        env.reset("easy_bugfix_chunk_list")
        fresh = env.reset("easy_bugfix_chunk_list")
        assert fresh.step_number == 0
        assert fresh.cumulative_reward == 0.0

    def test_all_task_ids_reset(self, env):
        for identifier in ALL_TASK_IDS:
            assert env.reset(identifier).task_id == identifier

    def test_unknown_task_raises(self, env):
        with pytest.raises(KeyError):
            env.reset("nonexistent_task")
# ─────────────────────────────────────────────
# STEP TESTS
# ─────────────────────────────────────────────
class TestStep:
    """Exercise env.step() once per action type on the easy bugfix task."""

    def test_plan_step_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            PlanStep(
                step_number=1,
                description="Read and understand the buggy chunk_list function",
                estimated_effort="low",
            )
        )
        assert result.step_number == 1
        assert result.last_action_type == "plan_step"
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.plan) == 1

    def test_edit_file_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            EditFile(
                file_path="utils/list_ops.py",
                content='"""Fixed."""\n\ndef chunk_list(lst, n):\n    return [lst[i:i+n] for i in range(0, len(lst), n)]\n',
                reason="Fix off-by-one bug in range stop",
            )
        )
        assert result.last_action_status == ActionStatus.SUCCESS
        assert result.reward > 0

    def test_run_tests_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunTests(timeout_seconds=30))
        assert result.last_action_type == "run_tests"
        assert result.test_results is not None
        assert result.test_results.passed >= 0

    def test_run_lint_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunLint(fix=False))
        assert result.last_action_type == "run_lint"
        assert result.lint_results is not None
        assert 0.0 <= result.lint_results.score <= 1.0

    def test_generate_review_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        review = GenerateReview(
            focus_areas=["correctness", "off-by-one"],
            review_text=(
                "The bug was an off-by-one in the range() call. "
                "The original used range(0, len(lst)-1, n) which dropped the last chunk. "
                "Fixed by changing to range(0, len(lst), n). "
                "The chunk function now correctly handles all edge cases."
            ),
        )
        result = env.step(review)
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.reviews) == 1
        assert result.reward > 0

    def test_self_reflect_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        reflection = SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading tests",
            what_to_improve="Should have run lint before committing",
        )
        result = env.step(reflection)
        assert len(result.reflections) == 1
        assert result.reward > 0

    def test_commit_action(self, env):
        env.reset("easy_bugfix_chunk_list")
        # First make a change so the commit has something to record.
        edit = EditFile(
            file_path="README.md",
            content="# Fixed!\n",
            reason="update readme",
        )
        env.step(edit)
        result = env.step(Commit(message="fix: update readme"))
        assert result.last_action_type == "commit"

    def test_step_before_reset_raises(self, env):
        # Stepping an un-reset environment is a usage error, not a no-op.
        with pytest.raises(RuntimeError):
            env.step(RunTests())
# ─────────────────────────────────────────────
# REWARD TESTS
# ─────────────────────────────────────────────
class TestReward:
    """Reward shaping: types, accumulation, and penalties."""

    def test_reward_is_float(self, env):
        env.reset("easy_bugfix_chunk_list")
        result = env.step(PlanStep(step_number=1, description="Plan the fix"))
        assert isinstance(result.reward, float)

    def test_cumulative_reward_accumulates(self, env):
        env.reset("easy_bugfix_chunk_list")
        first = env.step(PlanStep(step_number=1, description="Plan step"))
        second = env.step(PlanStep(step_number=2, description="Another plan step"))
        expected_total = first.cumulative_reward + second.reward
        assert abs(second.cumulative_reward - expected_total) < 1e-6

    def test_review_reward_is_positive(self, env):
        env.reset("easy_bugfix_chunk_list")
        review = GenerateReview(
            review_text="This review is about correctness and the off-by-one bug in range.",
            focus_areas=["correctness"],
        )
        assert env.step(review).reward > 0

    def test_test_file_modification_penalised(self, env):
        env.reset("easy_bugfix_chunk_list")
        sabotage = EditFile(
            file_path="tests/test_list_ops.py",
            content="# Cleared tests\n",
            reason="removing tests",
        )
        # Editing the test suite itself must draw a heavy penalty.
        assert env.step(sabotage).reward < -0.2
# ─────────────────────────────────────────────
# STATE TESTS
# ─────────────────────────────────────────────
class TestState:
    """The env.state() diagnostic snapshot dictionary."""

    def test_state_before_reset(self, env):
        snapshot = env.state()
        assert snapshot["status"] == "not_started"

    def test_state_after_reset(self, env):
        env.reset("easy_bugfix_chunk_list")
        snapshot = env.state()
        assert snapshot["task_id"] == "easy_bugfix_chunk_list"
        assert snapshot["step"] == 0
        assert snapshot["done"] is False

    def test_state_tracks_steps(self, env):
        env.reset("easy_bugfix_chunk_list")
        env.step(PlanStep(step_number=1, description="Step 1"))
        env.step(PlanStep(step_number=2, description="Step 2"))
        snapshot = env.state()
        assert snapshot["step"] == 2
        assert snapshot["plan_steps"] == 2
# ─────────────────────────────────────────────
# FULL EPISODE SMOKE TEST (easy task)
# ─────────────────────────────────────────────
class TestFullEpisode:
    """Smoke test: manually solve the easy task without an LLM."""

    def test_easy_task_solvable(self, env):
        """Drive a full plan → edit → test → lint → review → reflect → commit
        episode with hand-written actions, then assert the grader passes it.

        This pins the end-to-end contract: a correct fix plus the full action
        sequence must reach test_pass_rate == 1.0 and final_score >= 0.70.
        """
        obs = env.reset("easy_bugfix_chunk_list")
        # Plan
        env.step(PlanStep(
            step_number=1,
            description="Read utils/list_ops.py and identify the range() bug",
            estimated_effort="low",
        ))
        env.step(PlanStep(
            step_number=2,
            description="Fix the off-by-one: change range(0, len(lst)-1, n) to range(0, len(lst), n)",
            estimated_effort="low",
        ))
        # Fix the bug — this string is the full corrected module source that
        # EditFile writes into the sandbox repo; it must be valid Python.
        fixed_code = '''\
"""List utility operations."""
from typing import Any, List
def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:
    """Split *lst* into consecutive chunks of size *n*."""
    if n <= 0:
        raise ValueError("Chunk size must be positive")
    result = []
    for i in range(0, len(lst), n):
        result.append(lst[i : i + n])
    return result
def flatten(lst: List[List[Any]]) -> List[Any]:
    """Flatten a list of lists by one level."""
    return [item for sublist in lst for item in sublist]
'''
        env.step(EditFile(
            file_path="utils/list_ops.py",
            content=fixed_code,
            reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
        ))
        # Test — the fix must make the sandbox test suite fully green.
        obs = env.step(RunTests())
        assert obs.test_results is not None
        assert obs.test_results.failed == 0, f"Tests still failing: {obs.test_results.output}"
        # Lint
        env.step(RunLint(fix=False))
        # Review
        env.step(GenerateReview(
            focus_areas=["correctness", "off-by-one", "range"],
            review_text=(
                "The bug was a classic off-by-one error in the range() call. "
                "Original code used range(0, len(lst)-1, n), which stopped one index short "
                "and caused the final chunk to be silently dropped. "
                "Fix: change to range(0, len(lst), n). "
                "All edge cases now pass: empty list, chunk larger than list, exact division."
            ),
        ))
        # Reflect
        env.step(SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading the test assertions.",
            what_to_improve="Should run lint immediately after editing, not after testing.",
        ))
        # Commit
        obs = env.step(Commit(
            message="fix(list_ops): correct off-by-one in chunk_list range() call"
        ))
        # Grade — final acceptance thresholds for the episode.
        result = env.grade()
        assert result.test_pass_rate == 1.0
        assert result.final_score >= 0.70, f"Expected score >= 0.70, got {result.final_score}"
        assert result.passed is True