Spaces:

PrakashCider
/

teamforge

Sleeping

File size: 11,157 Bytes

637f42c

"""
TeamForge Integration Tests
Exercises reset, step, reward, and state handling, plus a full episode on the
easy task; the other tasks are covered only by the reset test.
"""

from __future__ import annotations

import pytest
from environment import TeamForgeEnv
from models import (
    ActionStatus,
    Commit,
    EditFile,
    GenerateReview,
    PhaseState,
    PlanStep,
    RunLint,
    RunTests,
    SelfReflect,
    TaskDifficulty,
)
from tasks import ALL_TASK_IDS


# ─────────────────────────────────────────────
# FIXTURES
# ─────────────────────────────────────────────

@pytest.fixture
def env():
    """Provide a fresh ``TeamForgeEnv`` and tear down its sandbox afterwards."""
    forge_env = TeamForgeEnv()
    yield forge_env
    # Everything below the yield runs once the requesting test has finished.
    forge_env._sandbox.teardown()


# ─────────────────────────────────────────────
# RESET TESTS
# ─────────────────────────────────────────────

class TestReset:
    """Checks on the observation returned by ``env.reset``."""

    def test_reset_returns_observation(self, env):
        """Resetting the easy task yields an unfinished observation at step 0."""
        observation = env.reset("easy_bugfix_chunk_list")
        assert observation is not None
        assert observation.task_id == "easy_bugfix_chunk_list"
        assert observation.difficulty == TaskDifficulty.EASY
        assert observation.step_number == 0
        assert observation.done is False

    def test_reset_populates_repo_files(self, env):
        """The observation lists repo files, at least one with "utils" in its path."""
        observation = env.reset("easy_bugfix_chunk_list")
        assert len(observation.repo_files) > 0
        file_paths = [entry.path for entry in observation.repo_files]
        assert any("utils" in path for path in file_paths)

    def test_reset_clears_previous_episode(self, env):
        """A second reset reports step 0 and a zero cumulative reward."""
        env.reset("easy_bugfix_chunk_list")
        second_observation = env.reset("easy_bugfix_chunk_list")
        assert second_observation.step_number == 0
        assert second_observation.cumulative_reward == 0.0

    def test_all_task_ids_reset(self, env):
        """Every id in ``ALL_TASK_IDS`` can be reset on the same environment."""
        for current_id in ALL_TASK_IDS:
            observation = env.reset(current_id)
            assert observation.task_id == current_id

    def test_unknown_task_raises(self, env):
        """Resetting with an unregistered task id raises ``KeyError``."""
        with pytest.raises(KeyError):
            env.reset("nonexistent_task")


# ─────────────────────────────────────────────
# STEP TESTS
# ─────────────────────────────────────────────

class TestStep:
    """Checks on the observation produced by stepping each action type."""

    def test_plan_step_action(self, env):
        """A plan step advances the step counter and is recorded in the plan."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            PlanStep(
                step_number=1,
                description="Read and understand the buggy chunk_list function",
                estimated_effort="low",
            )
        )
        assert result.step_number == 1
        assert result.last_action_type == "plan_step"
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.plan) == 1

    def test_edit_file_action(self, env):
        """Editing the source file succeeds and earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            EditFile(
                file_path="utils/list_ops.py",
                content='"""Fixed."""\n\ndef chunk_list(lst, n):\n    return [lst[i:i+n] for i in range(0, len(lst), n)]\n',
                reason="Fix off-by-one bug in range stop",
            )
        )
        assert result.last_action_status == ActionStatus.SUCCESS
        assert result.reward > 0

    def test_run_tests_action(self, env):
        """Running tests attaches test results to the observation."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunTests(timeout_seconds=30))
        assert result.last_action_type == "run_tests"
        assert result.test_results is not None
        assert result.test_results.passed >= 0

    def test_run_lint_action(self, env):
        """Running lint attaches lint results with a score within [0, 1]."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(RunLint(fix=False))
        assert result.last_action_type == "run_lint"
        assert result.lint_results is not None
        assert 0.0 <= result.lint_results.score <= 1.0

    def test_generate_review_action(self, env):
        """A review is stored on the observation and earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        review_body = (
            "The bug was an off-by-one in the range() call. "
            "The original used range(0, len(lst)-1, n) which dropped the last chunk. "
            "Fixed by changing to range(0, len(lst), n). "
            "The chunk function now correctly handles all edge cases."
        )
        result = env.step(
            GenerateReview(
                focus_areas=["correctness", "off-by-one"],
                review_text=review_body,
            )
        )
        assert result.last_action_status == ActionStatus.SUCCESS
        assert len(result.reviews) == 1
        assert result.reward > 0

    def test_self_reflect_action(self, env):
        """A reflection is stored on the observation and earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        result = env.step(
            SelfReflect(
                what_went_well="Identified the off-by-one error quickly by reading tests",
                what_to_improve="Should have run lint before committing",
            )
        )
        assert len(result.reflections) == 1
        assert result.reward > 0

    def test_commit_action(self, env):
        """A commit taken after an edit is reported as the last action type."""
        env.reset("easy_bugfix_chunk_list")
        # Make a change first so that the commit follows an edit.
        readme_edit = EditFile(
            file_path="README.md",
            content="# Fixed!\n",
            reason="update readme",
        )
        env.step(readme_edit)
        result = env.step(Commit(message="fix: update readme"))
        assert result.last_action_type == "commit"

    def test_step_before_reset_raises(self, env):
        """Stepping an environment that was never reset raises ``RuntimeError``."""
        with pytest.raises(RuntimeError):
            env.step(RunTests())


# ─────────────────────────────────────────────
# REWARD TESTS
# ─────────────────────────────────────────────

class TestReward:
    """Checks on the per-step and cumulative reward reported in observations."""

    def test_reward_is_float(self, env):
        """The reward attached to a step observation is a plain ``float``."""
        env.reset("easy_bugfix_chunk_list")
        obs = env.step(PlanStep(step_number=1, description="Plan the fix"))
        assert isinstance(obs.reward, float)

    def test_cumulative_reward_accumulates(self, env):
        """The running total after step two is the prior total plus step two's reward."""
        env.reset("easy_bugfix_chunk_list")
        obs1 = env.step(PlanStep(step_number=1, description="Plan step"))
        obs2 = env.step(PlanStep(step_number=2, description="Another plan step"))
        expected_total = obs1.cumulative_reward + obs2.reward
        # pytest.approx reports both values on failure; abs= keeps the same
        # 1e-6 absolute tolerance and disables the relative one.
        assert obs2.cumulative_reward == pytest.approx(expected_total, abs=1e-6)

    def test_review_reward_is_positive(self, env):
        """Submitting a review as the first step earns a positive reward."""
        env.reset("easy_bugfix_chunk_list")
        obs = env.step(GenerateReview(
            review_text="This review is about correctness and the off-by-one bug in range.",
            focus_areas=["correctness"],
        ))
        assert obs.reward > 0

    def test_test_file_modification_penalised(self, env):
        """Overwriting a test file must yield a reward below -0.2."""
        env.reset("easy_bugfix_chunk_list")
        obs = env.step(EditFile(
            file_path="tests/test_list_ops.py",
            content="# Cleared tests\n",
            reason="removing tests",
        ))
        assert obs.reward < -0.2  # heavy penalty


# ─────────────────────────────────────────────
# STATE TESTS
# ─────────────────────────────────────────────

class TestState:
    """Checks on the dictionary returned by ``env.state()``."""

    def test_state_before_reset(self, env):
        """Before any reset the state reports the "not_started" status."""
        snapshot = env.state()
        assert snapshot["status"] == "not_started"

    def test_state_after_reset(self, env):
        """Right after a reset the state names the task at step 0, not done."""
        env.reset("easy_bugfix_chunk_list")
        snapshot = env.state()
        assert snapshot["task_id"] == "easy_bugfix_chunk_list"
        assert snapshot["step"] == 0
        assert snapshot["done"] is False

    def test_state_tracks_steps(self, env):
        """Two plan steps are reflected in both the step and plan-step counts."""
        env.reset("easy_bugfix_chunk_list")
        for number, label in ((1, "Step 1"), (2, "Step 2")):
            env.step(PlanStep(step_number=number, description=label))
        snapshot = env.state()
        assert snapshot["step"] == 2
        assert snapshot["plan_steps"] == 2


# ─────────────────────────────────────────────
# FULL EPISODE SMOKE TEST (easy task)
# ─────────────────────────────────────────────

class TestFullEpisode:
    """Smoke test: manually solve the easy task without an LLM."""

    def test_easy_task_solvable(self, env):
        """Run plan, edit, test, lint, review, reflect and commit, then grade.

        The episode must leave no failing tests, and the grade must report a
        full test pass rate, a final score of at least 0.70, and ``passed``.
        """
        env.reset("easy_bugfix_chunk_list")

        # Plan
        env.step(PlanStep(
            step_number=1,
            description="Read utils/list_ops.py and identify the range() bug",
            estimated_effort="low",
        ))
        env.step(PlanStep(
            step_number=2,
            description="Fix the off-by-one: change range(0, len(lst)-1, n) to range(0, len(lst), n)",
            estimated_effort="low",
        ))

        # Fix the bug: replacement contents for utils/list_ops.py
        fixed_code = '''\
"""List utility operations."""
from typing import Any, List


def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:
    """Split *lst* into consecutive chunks of size *n*."""
    if n <= 0:
        raise ValueError("Chunk size must be positive")
    result = []
    for i in range(0, len(lst), n):
        result.append(lst[i : i + n])
    return result


def flatten(lst: List[List[Any]]) -> List[Any]:
    """Flatten a list of lists by one level."""
    return [item for sublist in lst for item in sublist]
'''
        env.step(EditFile(
            file_path="utils/list_ops.py",
            content=fixed_code,
            reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
        ))

        # Test: the only intermediate observation the assertions inspect
        obs = env.step(RunTests())
        assert obs.test_results is not None
        assert obs.test_results.failed == 0, f"Tests still failing: {obs.test_results.output}"

        # Lint
        env.step(RunLint(fix=False))

        # Review
        env.step(GenerateReview(
            focus_areas=["correctness", "off-by-one", "range"],
            review_text=(
                "The bug was a classic off-by-one error in the range() call. "
                "Original code used range(0, len(lst)-1, n), which stopped one index short "
                "and caused the final chunk to be silently dropped. "
                "Fix: change to range(0, len(lst), n). "
                "All edge cases now pass: empty list, chunk larger than list, exact division."
            ),
        ))

        # Reflect
        env.step(SelfReflect(
            what_went_well="Identified the off-by-one error quickly by reading the test assertions.",
            what_to_improve="Should run lint immediately after editing, not after testing.",
        ))

        # Commit (the returned observation is not inspected; grading follows)
        env.step(Commit(
            message="fix(list_ops): correct off-by-one in chunk_list range() call"
        ))

        # Grade
        result = env.grade()
        assert result.test_pass_rate == 1.0, (
            f"Expected test pass rate 1.0, got {result.test_pass_rate}"
        )
        assert result.final_score >= 0.70, f"Expected score >= 0.70, got {result.final_score}"
        assert result.passed is True