"""Determinism and score-range tests for the grader and environment."""

from server.environment import CloudNativeDebugEnvironment
from server.graders import run_grader
from server.models import Action, ActionType, FileEdit
from server.tasks.task_registry import TASK_REGISTRY


# -- determinism --


def test_reset_deterministic_with_seed():
    """Same seed → same task, scenario, files, error."""
    env1 = CloudNativeDebugEnvironment()
    env2 = CloudNativeDebugEnvironment()

    obs1 = env1.reset(seed=42)
    obs2 = env2.reset(seed=42)

    assert obs1.task_id == obs2.task_id
    assert obs1.error.error_message == obs2.error.error_message
    assert [f.path for f in obs1.files] == [f.path for f in obs2.files]
    assert [f.content for f in obs1.files] == [f.content for f in obs2.files]
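

# Companion sketch, hedged: resetting with an explicit task_id and scenario_id
# (the same call used in test_full_episode_determinism below) should pin the
# task regardless of any seeded randomness.
def test_reset_with_explicit_task_id_pins_task():
    env = CloudNativeDebugEnvironment()
    obs = env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
    assert obs.task_id == "dockerfile_syntax"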


def test_grader_deterministic_same_trajectory():
    """Identical trajectory → identical score and breakdown."""
    trajectory = [
        {
            "step": 1,
            "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
            "reward": 0.3,
            "done": False,
            "info": {"issues_fixed": 1, "issues_total": 2},
        },
        {
            "step": 2,
            "action": {"action_type": "submit"},
            "reward": 0.4,
            "done": True,
            "info": {"issues_fixed": 1, "issues_total": 2},
        },
    ]
    results = [run_grader("dockerfile_syntax", trajectory) for _ in range(10)]
    scores = [r.score for r in results]
    assert len(set(scores)) == 1, f"Non-deterministic scores: {scores}"
    breakdowns = [tuple(sorted(r.breakdown.items())) for r in results]
    assert len(set(breakdowns)) == 1


def test_grader_deterministic_across_tasks():
    """Same trajectory structure scores identically regardless of task_id."""
    trajectory = [
        {
            "step": 1,
            "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
            "reward": 0.3,
            "done": True,
            "info": {"issues_fixed": 1, "issues_total": 1},
        },
    ]
    scores = set()
    for task_id in TASK_REGISTRY:
        r = run_grader(task_id, trajectory)
        scores.add(r.score)
    # All tasks with same trajectory should get same score (task-agnostic grader)
    assert len(scores) == 1, f"Different scores across tasks: {scores}"


def test_full_episode_determinism():
    """Full episode replay produces identical trajectory and score."""
    scores = []
    for _ in range(5):
        env = CloudNativeDebugEnvironment()
        env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
        action = Action(
            action_type=ActionType.EDIT_FILE,
            edits=[FileEdit(
                file_path="Dockerfile",
                old_content="COPY requirments.txt .",  # the scenario's seeded typo, left as-is
                new_content="COPY requirements.txt .",
            )],
        )
        env.step(action)
        r = run_grader("dockerfile_syntax", env.trajectory)
        scores.append(r.score)
    assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"


# -- score ranges --


def test_empty_trajectory_scores_zero():
    r = run_grader("dockerfile_syntax", [])
    assert r.score == 0.0
    assert r.steps_taken == 0
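

# Hedged sketch: steps_taken is assumed to mirror the trajectory length,
# consistent with the steps_taken == 0 check for the empty trajectory above.
def test_steps_taken_matches_trajectory_length():
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert r.steps_taken == 1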


def test_zero_fixes_scores_zero():
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.0, "done": True, "info": {"issues_fixed": 0, "issues_total": 2}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert r.score == 0.0


def test_partial_fix_scores_moderate():
    """1 of 2 issues fixed → score between 0.3 and 0.6."""
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}},
        {"step": 2, "action": {"action_type": "submit"},
         "reward": 0.0, "done": True, "info": {"issues_fixed": 1, "issues_total": 2}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert 0.3 <= r.score <= 0.6, f"Partial fix score {r.score} out of range"


def test_complete_fix_scores_high():
    """All issues fixed → score >= 0.85."""
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": False, "info": {"issues_fixed": 1, "issues_total": 2}},
        {"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 2, "issues_total": 2}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert r.score >= 0.85, f"Complete fix score {r.score} too low"


def test_perfect_score_achievable():
    """Single issue, single step → exactly 1.0."""
    trajectory = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
    ]
    r = run_grader("dockerfile_syntax", trajectory)
    assert r.score == 1.0, f"Perfect scenario scored {r.score}, not 1.0"


def test_hint_penalty_applied():
    """Hints reduce score by 0.05 each."""
    base_traj = [
        {"step": 1, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
    ]
    hint_traj = [
        {"step": 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": False,
         "info": {"issues_fixed": 0, "issues_total": 1}},
        {"step": 2, "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
         "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}},
    ]
    r_base = run_grader("dockerfile_syntax", base_traj)
    r_hint = run_grader("dockerfile_syntax", hint_traj)
    assert r_base.score > r_hint.score
    # Expected gap ~0.08: the 0.05 hint penalty plus ~0.03 efficiency decay
    # from the extra step, with a loose tolerance.
    assert abs((r_base.score - r_hint.score) - 0.08) < 0.05


def test_score_always_in_0_1_range():
    """Score must always be between 0.0 and 1.0."""
    test_cases = [
        [],
        [{"step": 1, "action": {"action_type": "submit"}, "reward": 0.0, "done": True,
          "info": {"issues_fixed": 0, "issues_total": 5}}],
        # Many hints: the cumulative penalty could drive a raw score negative
        [{"step": i + 1, "action": {"action_type": "request_hint"}, "reward": -0.05, "done": i == 9,
          "info": {"issues_fixed": 0, "issues_total": 1}} for i in range(10)],
    ]
    for traj in test_cases:
        r = run_grader("dockerfile_syntax", traj)
        assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range"
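

# Hedged sketch: assuming the hint penalty is monotone in the number of hints
# (and the final score is clamped at 0.0), taking more hints before the same
# fix should never raise the score.
def test_more_hints_never_raise_score():
    def traj_with_hints(n_hints):
        hints = [
            {"step": i + 1, "action": {"action_type": "request_hint"}, "reward": -0.05,
             "done": False, "info": {"issues_fixed": 0, "issues_total": 1}}
            for i in range(n_hints)
        ]
        fix = {"step": n_hints + 1,
               "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
               "reward": 0.3, "done": True, "info": {"issues_fixed": 1, "issues_total": 1}}
        return hints + [fix]

    scores = [run_grader("dockerfile_syntax", traj_with_hints(n)).score for n in range(4)]
    assert all(a >= b for a, b in zip(scores, scores[1:])), f"Score rose with extra hints: {scores}"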


# -- difficulty progression --


def test_difficulty_progression():
    """Each task is registered with the expected difficulty level."""
    expected_difficulty = {
        "dockerfile_syntax": "easy",
        "dockerfile_runtime": "medium",
        "workflow_syntax_structure": "easy",
        "workflow_secrets_permissions": "medium",
        "ci_docker_integration": "medium",
        "multi_stage_pipeline_matrix": "hard",
    }
    for task_id, expected_diff in expected_difficulty.items():
        actual = TASK_REGISTRY[task_id].DIFFICULTY.value
        assert actual == expected_diff, f"{task_id}: expected {expected_diff}, got {actual}"
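

# Hedged companion check: DIFFICULTY.value is assumed to always be one of the
# three levels named above, including for any task added to the registry later.
def test_all_difficulties_are_known_levels():
    for task_id, task_cls in TASK_REGISTRY.items():
        assert task_cls.DIFFICULTY.value in {"easy", "medium", "hard"}, (
            f"{task_id}: unknown difficulty {task_cls.DIFFICULTY.value!r}"
        )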


def test_hard_tasks_have_more_issues():
    """Hard tasks should generally have more expected_fixes per scenario."""
    easy_max_issues = 0
    hard_min_issues = float("inf")

    for task_id, task_cls in TASK_REGISTRY.items():
        task = task_cls()
        for scenario in task.SCENARIOS:
            n_fixes = len(scenario["expected_fixes"])
            if task.DIFFICULTY.value == "easy":
                easy_max_issues = max(easy_max_issues, n_fixes)
            elif task.DIFFICULTY.value == "hard":
                hard_min_issues = min(hard_min_issues, n_fixes)

    # Every hard scenario must need at least as many fixes as the largest easy scenario
    assert hard_min_issues >= easy_max_issues, (
        f"Hard tasks (min {hard_min_issues} issues) should have at least as many issues "
        f"as easy tasks (max {easy_max_issues})"
    )


def test_all_tasks_have_minimum_scenarios():
    """Each task must have at least 4 scenarios."""
    for task_id, task_cls in TASK_REGISTRY.items():
        assert len(task_cls.SCENARIOS) >= 4, f"{task_id} has only {len(task_cls.SCENARIOS)} scenarios (need >= 4)"


def test_scenario_ids_unique():
    """All scenario IDs must be unique within each task."""
    for task_id, task_cls in TASK_REGISTRY.items():
        ids = [s["id"] for s in task_cls.SCENARIOS]
        assert len(ids) == len(set(ids)), f"{task_id} has duplicate scenario IDs: {ids}"


def test_all_scenarios_have_required_fields():
    """Every scenario has id, files, error, expected_fixes."""
    for task_id, task_cls in TASK_REGISTRY.items():
        for scenario in task_cls.SCENARIOS:
            assert "id" in scenario, f"{task_id}: scenario missing 'id'"
            assert "files" in scenario, f"{task_id}/{scenario.get('id')}: missing 'files'"
            assert "error" in scenario, f"{task_id}/{scenario.get('id')}: missing 'error'"
            assert "expected_fixes" in scenario, f"{task_id}/{scenario.get('id')}: missing 'expected_fixes'"
            assert len(scenario["files"]) >= 1, f"{task_id}/{scenario['id']}: no files"
            assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes"


# -- e2e grading --


def test_end_to_end_grading_all_tasks():
    """Every task/scenario can be reset, fixed, and graded with score > 0."""
    env = CloudNativeDebugEnvironment()
    for task_id, task_cls in TASK_REGISTRY.items():
        task = task_cls()
        for scenario in task.SCENARIOS:
            obs = env.reset(task_id=task_id, scenario_id=scenario["id"])
            assert obs.total_issues >= 1
            assert obs.issues_fixed == 0
            # Just verify the grader doesn't crash on an empty trajectory
            r = run_grader(task_id, env.trajectory)
            assert r.score == 0.0