# Spaces: yadnyeshkolte/api-debug-env (Sleeping) - File size: 30,743 Bytes

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""
Comprehensive tests for the API Integration Debugging Environment.

Tests cover:
- Environment reset and initialization
- Action handling (inspect_logs, inspect_config, inspect_endpoint, submit_fix)
- Multi-dimensional grading rubric
- Fix validation (strict value matching + partial credit)
- Episode termination conditions
- Repeated inspection penalty
- Seed-based reproducibility and issue pool selection
- Dynamic state: service health, cascading failures, dynamic logs
- Strategy scoring
"""

import sys
import os
import pytest

# Put the parent directory of this file at the front of sys.path so that the
# project-local imports below (models, server, scenarios) can be resolved --
# presumably those packages live one level above this test file.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from models import ApiDebugAction, ApiDebugObservation
from server.api_debug_env_environment import ApiDebugEnvironment
from scenarios import get_scenario, get_all_task_ids, Issue


# ─── Scenario Tests ──────────────────────────────────────────────────────────


class TestScenarios:
    """Test scenario loading and configuration.

    Exercises ``get_all_task_ids`` and ``get_scenario`` and checks the
    attributes each returned scenario exposes (issues, services, logs,
    service graph, dynamic logs, optimal fix order, context).
    """

    def test_all_task_ids_returns_three(self):
        """The task list is exactly easy, medium, hard, in that order."""
        task_ids = get_all_task_ids()
        assert task_ids == ["easy", "medium", "hard"]

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_scenario_loads(self, task_id):
        """Each task id loads a scenario with issues, services and a step budget."""
        scenario = get_scenario(task_id)
        assert scenario.task_id == task_id
        assert len(scenario.issues) > 0
        assert len(scenario.services) > 0
        assert scenario.max_steps > 0

    def test_invalid_task_id_raises(self):
        """An unknown task id raises ValueError mentioning 'Unknown task_id'."""
        with pytest.raises(ValueError, match="Unknown task_id"):
            get_scenario("nonexistent")

    def test_easy_has_two_issues(self):
        """The easy scenario carries two issues."""
        s = get_scenario("easy")
        assert len(s.issues) == 2

    def test_medium_has_three_issues(self):
        """The medium scenario carries three issues."""
        s = get_scenario("medium")
        assert len(s.issues) == 3

    def test_hard_has_five_issues(self):
        """The hard scenario carries five issues."""
        s = get_scenario("hard")
        assert len(s.issues) == 5

    def test_seed_randomization_reproducible(self):
        """Same seed should produce same scenario."""
        s1 = get_scenario("easy", seed=42)
        s2 = get_scenario("easy", seed=42)
        assert [i.issue_id for i in s1.issues] == [i.issue_id for i in s2.issues]

    def test_different_seeds_may_vary(self):
        """Different seeds should produce potentially different scenarios."""
        s1 = get_scenario("easy", seed=42)
        s2 = get_scenario("easy", seed=99)
        # They might differ (pool has 4 issues, selecting 2)
        # At minimum, they should both be valid
        assert len(s1.issues) == 2
        assert len(s2.issues) == 2

    def test_each_issue_has_log_hint(self):
        """Every issue should have a corresponding log hint findable in the logs."""
        for task_id in get_all_task_ids():
            s = get_scenario(task_id)
            for issue in s.issues:
                # Search every log line of every service; any() short-circuits
                # on the first line containing the hint.
                found = any(
                    issue.log_hint in log_line
                    for service_logs in s.logs.values()
                    for log_line in service_logs
                )
                assert found, f"Issue {issue.issue_id} log_hint '{issue.log_hint}' not found in any logs"

    def test_service_graph_exists(self):
        """Every scenario should have a service dependency graph."""
        for task_id in get_all_task_ids():
            s = get_scenario(task_id)
            assert len(s.service_graph) > 0
            for svc in s.services:
                assert svc in s.service_graph, f"Service {svc} missing from graph in {task_id}"

    def test_dynamic_logs_defined(self):
        """Every scenario should have dynamic logs for at least some issues."""
        for task_id in get_all_task_ids():
            s = get_scenario(task_id)
            assert len(s.dynamic_logs) > 0, f"No dynamic logs in {task_id}"

    def test_optimal_fix_order_defined(self):
        """Every scenario should have an optimal fix order."""
        for task_id in get_all_task_ids():
            s = get_scenario(task_id)
            assert len(s.optimal_fix_order) > 0

    def test_issues_have_categories(self):
        """Every issue should have one of the four known categories."""
        for task_id in get_all_task_ids():
            s = get_scenario(task_id)
            for issue in s.issues:
                assert issue.category in (
                    "configuration", "authentication", "networking", "protocol"
                ), f"Issue {issue.issue_id} has invalid category: {issue.category}"

    def test_context_provided(self):
        """Every scenario should have context."""
        for task_id in get_all_task_ids():
            s = get_scenario(task_id)
            assert len(s.context) > 0


# ─── Environment Reset Tests ─────────────────────────────────────────────────


class TestEnvironmentReset:
    """Checks on the observation produced by ``ApiDebugEnvironment.reset``."""

    @staticmethod
    def _reset_easy():
        """Build an "easy" environment and return its initial observation."""
        return ApiDebugEnvironment(task_id="easy").reset()

    def test_reset_returns_observation(self):
        """reset() hands back an ApiDebugObservation instance."""
        initial = self._reset_easy()
        assert isinstance(initial, ApiDebugObservation)

    def test_reset_clears_state(self):
        """A fresh observation has zeroed counters, is not done, and has 15 steps."""
        initial = self._reset_easy()
        assert initial.issues_found == 0
        assert initial.issues_fixed == 0
        assert initial.done is False
        assert initial.remaining_steps == 15  # easy max_steps

    def test_reset_provides_available_targets(self):
        """The initial observation lists targets, including payment_client."""
        initial = self._reset_easy()
        assert len(initial.available_targets) > 0
        assert "payment_client" in initial.available_targets

    def test_reset_with_different_task(self):
        """Passing task_id to reset() switches to that task's issue count."""
        environment = ApiDebugEnvironment(task_id="easy")
        hard_obs = environment.reset(task_id="hard")
        assert hard_obs.issues_total == 5

    def test_initial_reward_is_zero(self):
        """The observation returned by reset() carries a reward of 0.0."""
        initial = self._reset_easy()
        assert initial.reward == 0.0

    def test_reset_includes_service_status(self):
        """The initial observation reports service status, including payment_client."""
        initial = self._reset_easy()
        assert len(initial.service_status) > 0
        assert "payment_client" in initial.service_status

    def test_reset_includes_dependency_graph(self):
        """The initial observation carries a non-empty dependency graph."""
        initial = self._reset_easy()
        assert len(initial.dependency_graph) > 0

    def test_reset_includes_error_trace(self):
        """The initial observation carries a non-empty error trace."""
        initial = self._reset_easy()
        assert len(initial.error_trace) > 0


# ─── Action Handler Tests ────────────────────────────────────────────────────


class TestInspectLogs:
    """Behaviour of the ``inspect_logs`` action on the easy task."""

    @staticmethod
    def _started_env():
        """Return an "easy" environment that has already been reset."""
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()
        return environment

    @staticmethod
    def _read_logs(environment):
        """Step the environment with a new inspect_logs action on payment_client."""
        action = ApiDebugAction(action_type="inspect_logs", target="payment_client")
        return environment.step(action)

    def test_inspect_logs_returns_logs(self):
        """Inspecting payment_client yields at least one log line."""
        result = self._read_logs(self._started_env())
        assert len(result.logs) > 0

    def test_inspect_logs_finds_issues(self):
        """The first inspection discovers issues and earns a positive reward."""
        result = self._read_logs(self._started_env())
        assert result.issues_found > 0
        assert result.reward > 0

    def test_repeated_inspect_logs_no_reward(self):
        """Inspecting the same target a second time earns less than the first time."""
        environment = self._started_env()
        first = self._read_logs(environment)
        second = self._read_logs(environment)
        assert second.reward < first.reward

    def test_dynamic_logs_after_fix(self):
        """Logs read after a Content-Type fix mention the fix."""
        environment = self._started_env()
        # Apply the Content-Type fix first.
        content_type_fix = ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        )
        environment.step(content_type_fix)
        # Then read the logs again; dynamic entries are expected alongside
        # the original ones.
        result = self._read_logs(environment)
        lowered = [entry.lower() for entry in result.logs]
        assert any("application/json" in entry or "parsed" in entry for entry in lowered)


class TestInspectConfig:
    """Behaviour of the ``inspect_config`` action on the easy task."""

    def test_inspect_config_returns_config(self):
        """Inspecting payment_client returns a config snapshot with a headers key."""
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()
        action = ApiDebugAction(action_type="inspect_config", target="payment_client")
        result = environment.step(action)
        snapshot = result.config_snapshot
        assert len(snapshot) > 0
        assert "headers" in snapshot


class TestInspectEndpoint:
    """Behaviour of the ``inspect_endpoint`` action on the easy task."""

    @staticmethod
    def _probe_payment_client():
        """Reset an "easy" environment and inspect the payment_client endpoint."""
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()
        action = ApiDebugAction(action_type="inspect_endpoint", target="payment_client")
        return environment.step(action)

    def test_inspect_endpoint_shows_error(self):
        """Before any fix, the endpoint response reports an error status."""
        result = self._probe_payment_client()
        assert result.api_response is not None
        assert result.api_response["status"] == "error"

    def test_inspect_endpoint_shows_success_after_fix(self):
        """After both easy fixes, payment_client is recorded as healthy.

        The episode is already done after the second fix, so this checks the
        environment's service-health map rather than stepping again.
        """
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()
        fixes = (
            {"headers.Authorization": "Bearer valid_token_123"},
            {"headers.Content-Type": "application/json"},
        )
        for payload in fixes:
            environment.step(ApiDebugAction(
                action_type="submit_fix",
                target="payment_client",
                fix_payload=payload,
            ))
        health = environment._service_health.get("payment_client")
        assert health == "healthy"

    def test_inspect_endpoint_shows_category_status_code(self):
        """The error response carries one of the expected HTTP status codes."""
        result = self._probe_payment_client()
        assert result.api_response is not None
        # Only these HTTP status codes are accepted for the error response.
        accepted_codes = [401, 415, 500, 504]
        assert result.api_response["status_code"] in accepted_codes


class TestSubmitFix:
    """Behaviour of ``submit_fix``: value validation, penalties and bonuses."""

    @staticmethod
    def _started_env():
        """Return an "easy" environment that has already been reset."""
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()
        return environment

    @staticmethod
    def _submit(environment, payload, target="payment_client"):
        """Step the environment with a submit_fix action and return the observation."""
        action = ApiDebugAction(
            action_type="submit_fix",
            target=target,
            fix_payload=payload,
        )
        return environment.step(action)

    def test_correct_fix_accepted(self):
        """The correct Content-Type is counted as fixed and acknowledged in the result."""
        result = self._submit(
            self._started_env(), {"headers.Content-Type": "application/json"}
        )
        assert result.issues_fixed > 0
        message = result.action_result.lower()
        assert "accepted" in message or "fixed" in message

    def test_wrong_value_rejected(self):
        """Right key with an unrelated value fixes nothing."""
        result = self._submit(
            self._started_env(), {"headers.Content-Type": "text/xml"}
        )
        assert result.issues_fixed == 0

    def test_partial_credit_close_value(self):
        """Right key with a near-miss value is rewarded above -0.05."""
        result = self._submit(
            self._started_env(), {"headers.Content-Type": "application/xml"}
        )
        # The value shares the "application/" prefix with the expected one,
        # so the reward should sit above the full-rejection level.
        assert result.reward > -0.05

    def test_correct_auth_fix(self):
        """A Bearer header with an arbitrary non-empty token is accepted."""
        result = self._submit(
            self._started_env(),
            {"headers.Authorization": "Bearer my_actual_api_key_123"},
        )
        assert result.issues_fixed > 0

    def test_empty_payload_rejected(self):
        """An empty fix payload yields a negative reward."""
        result = self._submit(self._started_env(), {})
        assert result.reward < 0

    def test_invalid_target_penalized(self):
        """A fix aimed at an unknown service yields a negative reward."""
        result = self._submit(
            self._started_env(), {"key": "value"}, target="nonexistent_service"
        )
        assert result.reward < 0

    def test_fix_all_issues_completes_episode(self):
        """Submitting both easy fixes ends the episode with two issues fixed."""
        environment = self._started_env()
        self._submit(environment, {"headers.Authorization": "Bearer valid_token_123"})
        final = self._submit(environment, {"headers.Content-Type": "application/json"})
        assert final.done is True
        assert final.issues_fixed == 2

    def test_strategy_bonus_for_inspecting_first(self):
        """The same fix is rewarded more when the logs were inspected first."""
        content_type_fix = {"headers.Content-Type": "application/json"}

        # Run A: submit the fix without inspecting anything.
        blind_env = self._started_env()
        blind = self._submit(blind_env, content_type_fix)

        # Run B: read the logs, then submit the same fix.
        informed_env = self._started_env()
        informed_env.step(ApiDebugAction(
            action_type="inspect_logs",
            target="payment_client",
        ))
        informed = self._submit(informed_env, content_type_fix)

        assert informed.reward > blind.reward


# ─── Service Health Tests ─────────────────────────────────────────────────────


class TestServiceHealth:
    """Test dynamic service health tracking.

    Covers the initial ``service_status`` in the reset observation, the
    environment's ``_service_health`` map after fixes, and the error trace
    built by ``_build_error_trace``.
    """

    def test_initial_health_reflects_issues(self):
        """Services with issues should start as degraded/error."""
        env = ApiDebugEnvironment(task_id="easy")
        obs = env.reset()
        assert obs.service_status.get("payment_client") in ("error", "degraded")

    def test_health_updates_after_fix(self):
        """Fixing all issues on a service should mark it healthy."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer valid_token_123"},
        ))
        # The returned observation is not needed: the assertion reads the
        # environment's health map directly.
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        # payment_client should be healthy after both fixes
        assert env._service_health.get("payment_client") == "healthy"

    def test_error_trace_updates(self):
        """Error trace should shrink as issues are fixed."""
        env = ApiDebugEnvironment(task_id="easy")
        obs1 = env.reset()
        initial_trace_len = len(obs1.error_trace)

        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        # Rebuild the trace from the environment after the fix and compare
        # its length with the one reported at reset.
        trace_after_fix = env._build_error_trace()
        assert len(trace_after_fix) < initial_trace_len


# ─── Grading Tests ────────────────────────────────────────────────────────────


class TestGrading:
    """Checks on the score returned by ``ApiDebugEnvironment.grade``."""

    AUTH_FIX_KEY = "headers.Authorization"
    CONTENT_TYPE_FIX = {"headers.Content-Type": "application/json"}

    @staticmethod
    def _started_env():
        """Return an "easy" environment that has already been reset."""
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()
        return environment

    @staticmethod
    def _inspect(environment, action_type, target="payment_client"):
        """Step the environment with an inspection action of the given type."""
        environment.step(ApiDebugAction(action_type=action_type, target=target))

    @staticmethod
    def _fix(environment, payload):
        """Step the environment with a submit_fix action on payment_client."""
        environment.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload=payload,
        ))

    def test_grade_no_fixes_is_low(self):
        """One inspection and no fixes scores above 0 but below 0.5."""
        environment = self._started_env()
        self._inspect(environment, "inspect_logs")
        grade = environment.grade()
        # Expected to be small but non-zero when only an inspection was done.
        assert 0.0 < grade < 0.5

    def test_grade_all_fixes_is_high(self):
        """Inspecting and then fixing both easy issues scores above 0.6."""
        environment = self._started_env()
        self._inspect(environment, "inspect_logs")
        self._inspect(environment, "inspect_config")
        self._fix(environment, {self.AUTH_FIX_KEY: "Bearer valid_token_123"})
        self._fix(environment, dict(self.CONTENT_TYPE_FIX))
        grade = environment.grade()
        assert grade > 0.6

    def test_grade_strictly_between_0_and_1(self):
        """Right after reset, every task's grade lies strictly inside (0, 1)."""
        for task_id in get_all_task_ids():
            environment = ApiDebugEnvironment(task_id=task_id)
            environment.reset()
            score = environment.grade()
            assert 0.0 < score < 1.0, f"Score for {task_id} was {score}"

    def test_efficiency_bonus(self):
        """With the same single fix, the run using fewer steps grades higher."""
        # Short run: one inspection, then the fix.
        quick_env = self._started_env()
        self._inspect(quick_env, "inspect_logs")
        self._fix(quick_env, dict(self.CONTENT_TYPE_FIX))
        score_fast = quick_env.grade()

        # Long run: same inspection, ten extra inspections of another
        # service, then the same fix.
        long_env = self._started_env()
        self._inspect(long_env, "inspect_logs")
        for _ in range(10):
            self._inspect(long_env, "inspect_logs", target="payment_gateway")
        self._fix(long_env, dict(self.CONTENT_TYPE_FIX))
        score_slow = long_env.grade()

        assert score_fast > score_slow, f"Fast={score_fast} should beat Slow={score_slow}"

    def test_strategy_affects_grade(self):
        """Inspecting before fixing grades at least 90% of fixing blind."""
        auth_fix = {self.AUTH_FIX_KEY: "Bearer token"}

        # Run A: both fixes, no inspection at all.
        blind_env = self._started_env()
        self._fix(blind_env, dict(auth_fix))
        self._fix(blind_env, dict(self.CONTENT_TYPE_FIX))
        score_no_inspect = blind_env.grade()

        # Run B: logs and config inspected before the same two fixes.
        informed_env = self._started_env()
        self._inspect(informed_env, "inspect_logs")
        self._inspect(informed_env, "inspect_config")
        self._fix(informed_env, dict(auth_fix))
        self._fix(informed_env, dict(self.CONTENT_TYPE_FIX))
        score_with_inspect = informed_env.grade()

        # Deliberately lenient: the informed run only has to land close to
        # (or above) the blind one.
        assert score_with_inspect >= score_no_inspect * 0.9

    def test_grade_dimensions_nonzero(self):
        """One inspection plus one fix produces a grade above 0.001."""
        environment = self._started_env()
        self._inspect(environment, "inspect_logs")
        self._fix(environment, dict(self.CONTENT_TYPE_FIX))
        grade = environment.grade()
        assert grade > 0.001


# ─── Episode Termination Tests ────────────────────────────────────────────────


class TestEpisodeTermination:
    """Conditions under which an episode ends, and handling of bad actions."""

    def test_out_of_steps_ends_episode(self):
        """After 15 steps on the easy task the episode is done with no steps left."""
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()
        observations = [
            environment.step(ApiDebugAction(
                action_type="inspect_logs",
                target="payment_client",
            ))
            for _ in range(15)
        ]
        last = observations[-1]
        assert last.done is True
        assert last.remaining_steps == 0

    def test_invalid_action_type_penalized(self):
        """An unrecognised action type yields a negative reward."""
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()
        bogus = ApiDebugAction(action_type="nonexistent_action", target="payment_client")
        result = environment.step(bogus)
        assert result.reward < 0


# ─── Value Matching Tests ─────────────────────────────────────────────────────


class TestValueMatching:
    """Direct checks of the environment's ``_values_match`` method."""

    def setup_method(self):
        """Create a fresh "easy" environment before each test."""
        self.env = ApiDebugEnvironment(task_id="easy")

    def test_exact_string_match(self):
        """Identical strings match."""
        matches = self.env._values_match
        assert matches("application/json", "application/json")

    def test_case_insensitive_match(self):
        """Strings differing only in letter case match."""
        matches = self.env._values_match
        assert matches("Application/JSON", "application/json")

    def test_numeric_exact(self):
        """Equal integers match."""
        matches = self.env._values_match
        assert matches(10, 10)

    def test_numeric_tolerance_tight(self):
        """For expected 10: 10 and 9.5 are accepted, 8 is not."""
        matches = self.env._values_match
        assert matches(10, 10)  # identical
        assert matches(10, 9.5)  # 5% away
        assert not matches(10, 8)  # 20% away

    def test_boolean_match(self):
        """Equal booleans match; opposite booleans do not."""
        matches = self.env._values_match
        assert matches(True, True)
        assert not matches(True, False)

    def test_boolean_from_string(self):
        """The strings "true"/"false" match the corresponding booleans."""
        matches = self.env._values_match
        assert matches(True, "true")
        assert matches(False, "false")

    def test_list_containment(self):
        """A submitted list matches when it holds every expected element."""
        matches = self.env._values_match
        assert matches([429, 500], [429, 500])
        assert matches([429, 500], [500, 429, 502])

    def test_bearer_token_pattern(self):
        """The "Bearer <token>" pattern needs a non-empty token."""
        matches = self.env._values_match
        assert matches("Bearer <token>", "Bearer my_secret_key")
        assert not matches("Bearer <token>", "Bearer ")  # nothing after the scheme

    def test_wrong_value_rejected(self):
        """Unrelated strings and far-off numbers do not match."""
        matches = self.env._values_match
        assert not matches("application/json", "text/xml")
        assert not matches(10, 100)


class TestPartialCredit:
    """Direct checks of ``_values_close`` and the verdicts of ``_check_fix``."""

    def setup_method(self):
        """Create a fresh "easy" environment before each test."""
        self.env = ApiDebugEnvironment(task_id="easy")

    @staticmethod
    def _timeout_issue():
        """Build a throwaway Issue whose expected fix is ``timeout == 10``."""
        return Issue(
            issue_id="test",
            service="test_svc",
            description="test",
            expected_fix={"timeout": 10},
            fix_key="timeout",
            log_hint="test",
        )

    def test_numeric_close(self):
        """For expected 10: 7 counts as close, 100 does not."""
        is_close = self.env._values_close
        assert is_close(10, 7)  # 30% away
        assert not is_close(10, 100)

    def test_string_same_prefix(self):
        """Two "application/..." media types count as close."""
        is_close = self.env._values_close
        assert is_close("application/json", "application/xml")

    def test_check_fix_returns_partial(self):
        """Right key with a close value yields the 'partial' verdict."""
        verdict = self.env._check_fix(self._timeout_issue(), {"timeout": 7})
        assert verdict == "partial"

    def test_check_fix_returns_exact(self):
        """Right key with the expected value yields the 'exact' verdict."""
        verdict = self.env._check_fix(self._timeout_issue(), {"timeout": 10})
        assert verdict == "exact"

    def test_check_fix_returns_none(self):
        """A payload without the issue's key yields the 'none' verdict."""
        unrelated_payload = {"base_url": "http://example.com"}
        verdict = self.env._check_fix(self._timeout_issue(), unrelated_payload)
        assert verdict == "none"


# ─── Integration Tests ────────────────────────────────────────────────────────


class TestFullEpisode:
    """End-to-end episode walkthroughs for the easy, medium and hard tasks."""

    @staticmethod
    def _inspect(environment, action_type, target):
        """Step with an inspection action and return the observation."""
        return environment.step(ApiDebugAction(action_type=action_type, target=target))

    @staticmethod
    def _fix(environment, target, payload):
        """Step with a submit_fix action and return the observation."""
        return environment.step(ApiDebugAction(
            action_type="submit_fix",
            target=target,
            fix_payload=payload,
        ))

    def test_easy_full_solve(self):
        """Inspect, then fix both easy issues; the episode ends with a grade above 0.6."""
        environment = ApiDebugEnvironment(task_id="easy")
        environment.reset()

        after_logs = self._inspect(environment, "inspect_logs", "payment_client")
        assert after_logs.issues_found >= 1

        after_config = self._inspect(environment, "inspect_config", "payment_client")
        assert "headers" in after_config.config_snapshot

        after_auth = self._fix(
            environment, "payment_client",
            {"headers.Authorization": "Bearer my_token_123"},
        )
        assert after_auth.issues_fixed >= 1

        after_content_type = self._fix(
            environment, "payment_client",
            {"headers.Content-Type": "application/json"},
        )
        assert after_content_type.issues_fixed == 2
        assert after_content_type.done is True

        grade = environment.grade()
        assert grade > 0.6

    def test_medium_full_solve(self):
        """Inspect every service, then submit three webhook_sender fixes."""
        environment = ApiDebugEnvironment(task_id="medium")
        initial = environment.reset()
        assert initial.issues_total == 3

        # Read the logs of every target listed in the initial observation.
        targets = initial.available_targets
        for service in targets:
            self._inspect(environment, "inspect_logs", service)

        # Look at the sender's configuration.
        self._inspect(environment, "inspect_config", "webhook_sender")

        # Rate-limit fix.
        after_rate_limit = self._fix(
            environment, "webhook_sender",
            {"rate_limit.requests_per_second": 10},
        )
        assert after_rate_limit.issues_fixed >= 1

        # Retry-policy fix.
        self._fix(
            environment, "webhook_sender",
            {"retry": {"max_retries": 3, "backoff_factor": 2, "retry_on_status": [429, 500]}},
        )

        # Signature-header fix.
        final = self._fix(
            environment, "webhook_sender",
            {"headers.X-Webhook-Signature": "sha256=computed_hmac"},
        )

        assert final.done is True
        grade = environment.grade()
        assert grade > 0.4

    def test_hard_partial_solve(self):
        """Fix two of the five hard issues and check the grade stays partial."""
        environment = ApiDebugEnvironment(task_id="hard")
        initial = environment.reset()
        assert initial.issues_total == 5

        # Only two fixes are submitted, both on order_service.
        self._inspect(environment, "inspect_logs", "order_service")
        self._fix(
            environment, "order_service",
            {"inventory_url": "https://inventory.internal/v2/reserve"},
        )
        self._fix(environment, "order_service", {"timeout": 10})

        grade = environment.grade()
        assert 0.0 < grade < 0.999
        assert len(environment._issues_fixed) == 2


class TestCascadingFailures:
    """Checks on issue dependencies and cascade effects declared by scenarios."""

    def test_hard_dependency_chain(self):
        """In the hard scenario, hard_timeout depends on hard_wrong_url."""
        scenario = get_scenario("hard")
        candidates = (
            issue for issue in scenario.issues if issue.issue_id == "hard_timeout"
        )
        timeout_issue = next(candidates)
        assert "hard_wrong_url" in timeout_issue.depends_on

    def test_cascade_effects_defined(self):
        """Every scenario has at least one issue with cascade effects."""
        for task_id in get_all_task_ids():
            scenario = get_scenario(task_id)
            any_cascade = any(
                len(issue.cascade_effects) > 0 for issue in scenario.issues
            )
            assert any_cascade, f"No cascade effects in {task_id}"


if __name__ == "__main__":
    # Run this file's tests verbosely when executed as a script, and pass
    # pytest's exit code on so a failing run ends with a non-zero status.
    raise SystemExit(pytest.main([__file__, "-v"]))