SystemTruth / tests /test_runners.py
Madhav189's picture
hackathon sprint: grader collapse + coliseum rename + training pipeline
c9baa73
"""End-to-end tests for the Advanced and Max runners.
The runners live in ``sre_gym.advanced.runner`` and ``sre_gym.max.runner``.
These tests assert that:
- All 3 Advanced reference scenarios can be run with the default
scripted-optimal policy and produce a coherent ``AdvancedResult``.
- Every Max chaos pattern is reachable by ``run_max(...)`` against the
``ecommerce_vibecoded_saas`` family without raising.
- The runners produce deterministic outputs given the same seed.
"""
from __future__ import annotations
import pytest
from sre_gym.strategy.runner import (
AdvancedResult,
list_advanced_scenarios,
run_advanced,
)
from sre_gym.operations.runner import (
CHAOS_PATTERNS,
MaxResult,
list_max_families,
run_max,
)
# ---------- Advanced runner ----------
def test_list_advanced_scenarios_has_three_entries() -> None:
scenarios = list_advanced_scenarios()
assert {"cascading_release_train", "observability_pipeline_outage", "supabase_rls_silent_leak"}.issubset(
set(scenarios)
)
@pytest.mark.parametrize(
"scenario_id",
["cascading_release_train", "observability_pipeline_outage", "supabase_rls_silent_leak"],
)
def test_advanced_scenario_runs_end_to_end(scenario_id: str) -> None:
result = run_advanced(scenario_id, seed=1)
assert isinstance(result, AdvancedResult)
assert result.scenario_id == scenario_id
assert len(result.phases) >= 1
# Every phase must produce a final score in [0.0, 1.0].
for phase in result.phases:
assert 0.0 <= phase.final_score <= 1.0
# Horizon-decay ≤ 1, raw mean ≥ 0.
assert 0.0 <= result.horizon_decay_factor <= 1.0
assert 0.0 <= result.raw_mean_reward <= 1.0
def test_advanced_runner_is_deterministic_for_same_seed() -> None:
a = run_advanced("cascading_release_train", seed=42)
b = run_advanced("cascading_release_train", seed=42)
assert a.final_reward == b.final_reward
assert a.raw_mean_reward == b.raw_mean_reward
assert len(a.phases) == len(b.phases)
def test_advanced_emits_log_lines_when_callback_provided() -> None:
captured: list[str] = []
run_advanced("cascading_release_train", seed=1, on_log=captured.append)
assert any("phase 1" in line for line in captured)
assert any("phase 2" in line for line in captured)
assert any("declare_resolved" in line for line in captured)
# ---------- Max runner ----------
def test_list_max_families_has_ecommerce_family() -> None:
families = list_max_families()
assert "ecommerce_vibecoded_saas" in families
@pytest.mark.parametrize("chaos", list(CHAOS_PATTERNS))
def test_max_chaos_pattern_runs_without_error(chaos: str) -> None:
"""Every documented chaos pattern must be reachable from run_max."""
result = run_max("ecommerce_vibecoded_saas", chaos=chaos, seed=1)
assert isinstance(result, MaxResult)
assert result.chaos == chaos
assert 0.0 <= result.final_reward <= 1.0
assert 0 <= result.tick_count <= 25
def test_max_runner_is_deterministic_for_same_seed() -> None:
a = run_max("ecommerce_vibecoded_saas", chaos="deploy_regression", seed=42)
b = run_max("ecommerce_vibecoded_saas", chaos="deploy_regression", seed=42)
assert a.final_reward == b.final_reward
assert a.cumulative_reward == b.cumulative_reward
assert a.tick_count == b.tick_count
def test_max_security_classified_chaos_carries_classification() -> None:
"""rls_silent_leak / oauth_supply_chain_pivot / cdn_cache_contamination
must surface ``classification == 'security'`` in MaxResult so the UI
can render a security badge."""
for chaos in ("rls_silent_leak", "oauth_supply_chain_pivot", "cdn_cache_contamination"):
result = run_max("ecommerce_vibecoded_saas", chaos=chaos, seed=1)
assert result.classification == "security", (
f"{chaos} must be classification='security' (got {result.classification!r})"
)
def test_max_per_step_env_returns_observation_on_reset() -> None:
"""The MaxRunnerEnv (used by SREGym(tier=Tier.MAX).reset/step) returns a
valid observation."""
from sre_gym.operations.runner import MaxRunnerEnv
env = MaxRunnerEnv(family_id="ecommerce_vibecoded_saas")
obs = env.reset(chaos="deploy_regression", seed=1)
assert obs.tick_count == 0
assert obs.family_id == "ecommerce_vibecoded_saas"
assert obs.chaos == "deploy_regression"
assert "api-gateway" in obs.services
def test_max_per_step_env_steps() -> None:
from sre_gym.operations.runner import MaxRunnerEnv
env = MaxRunnerEnv()
env.reset(chaos="deploy_regression", seed=1)
obs = env.step({"action_type": "query_logs", "service": "orders-service"})
assert obs.tick_count == 1
assert "query" in obs.last_log