Spaces:

Mist-ic
/

sevzero

Sleeping

Mist-ic committed on Mar 29

Commit

5f8bd3c

1 Parent(s): 00225fe

Add tests and fix premature termination bug

- tests/test_simulator.py: 19 tests for determinism, SLO scoring, all
action types, termination logic, and observation helpers
- tests/test_grader.py: 5 tests for score bounds, partial credit,
determinism, and resolved bonus
- tests/test_propagation.py: 13 tests for queueing theory (Little's Law,
retry amplification) and circuit breaker state machine
- Fix: episodes no longer terminate as "resolved" when SLO=1.0 but
injected failures haven't been remediated (resource leaks, gradual
failures now properly degrade over time before resolution)
All 37 tests pass.

Files changed (5) hide show

server/simulator.py +6 -2
tests/__init__.py +0 -0
tests/test_grader.py +88 -0
tests/test_propagation.py +92 -0
tests/test_simulator.py +181 -0

server/simulator.py CHANGED Viewed

@@ -745,8 +745,12 @@ class Simulator:
         """Check if the episode should end."""
         slo = self.get_slo_score()
-        # Success: all SLOs met
-        if slo >= 1.0:
             self.terminated = True
             self.termination_reason = "resolved"
             return

         """Check if the episode should end."""
         slo = self.get_slo_score()
+        # Success: all SLOs met AND all injected failures have been remediated
+        all_remediated = all(
+            spec.service_id in self.remediated_services
+            for spec in self.failures
+        )
+        if slo >= 1.0 and all_remediated:
             self.terminated = True
             self.termination_reason = "resolved"
             return

tests/__init__.py ADDED Viewed

File without changes

tests/test_grader.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""Tests for the deterministic grader."""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from server.grader import grade_episode
+class TestGraderBounds:
+    """Score is always 0.0–1.0."""
+    def test_perfect_score(self):
+        result = grade_episode(
+            final_slo_score=1.0,
+            steps_taken=3,
+            max_steps=10,
+            actions_taken=[
+                {"tick": 0, "action": "inspect_logs", "target": "svc", "success": True},
+                {"tick": 1, "action": "restart_service", "target": "svc", "success": True},
+            ],
+            terminated=True,
+            termination_reason="resolved",
+        )
+        assert 0.0 <= result.score <= 1.0
+        assert result.score > 0.8  # Resolved quickly = high score
+    def test_zero_score(self):
+        result = grade_episode(
+            final_slo_score=0.0,
+            steps_taken=10,
+            max_steps=10,
+            actions_taken=[],
+            terminated=True,
+            termination_reason="timeout",
+        )
+        assert result.score == 0.0
+    def test_partial_credit(self):
+        result = grade_episode(
+            final_slo_score=0.5,
+            steps_taken=10,
+            max_steps=10,
+            actions_taken=[
+                {"tick": i, "action": "noop", "success": True}
+                for i in range(10)
+            ],
+            terminated=True,
+            termination_reason="timeout",
+        )
+        assert 0.0 < result.score < 1.0
+    def test_determinism(self):
+        args = dict(
+            final_slo_score=0.7,
+            steps_taken=5,
+            max_steps=20,
+            actions_taken=[
+                {"tick": 0, "action": "inspect_logs", "target": "svc", "success": True},
+                {"tick": 1, "action": "restart_service", "target": "svc", "success": True},
+            ],
+            terminated=True,
+            termination_reason="timeout",
+        )
+        r1 = grade_episode(**args)
+        r2 = grade_episode(**args)
+        assert r1.score == r2.score
+    def test_resolved_bonus(self):
+        """Resolved episodes should score higher than timed-out ones at same SLO."""
+        resolved = grade_episode(
+            final_slo_score=1.0,
+            steps_taken=5,
+            max_steps=10,
+            actions_taken=[{"tick": i, "action": "restart_service", "target": "svc", "success": True} for i in range(5)],
+            terminated=True,
+            termination_reason="resolved",
+        )
+        timeout = grade_episode(
+            final_slo_score=1.0,
+            steps_taken=10,
+            max_steps=10,
+            actions_taken=[{"tick": i, "action": "noop", "success": True} for i in range(10)],
+            terminated=True,
+            termination_reason="timeout",
+        )
+        assert resolved.score > timeout.score

tests/test_propagation.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""Tests for queueing theory and propagation."""
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from server.propagation import (
+    compute_utilisation,
+    compute_queueing_latency_multiplier,
+    compute_retry_amplification,
+    CircuitBreaker,
+    BreakerState,
+)
+import random
+class TestQueueingTheory:
+    """Little's Law and M/M/c approximations."""
+    def test_utilisation_basic(self):
+        # L = 100 * 0.05 = 5, T = 50, ρ = 0.1
+        rho = compute_utilisation(100.0, 0.05, 50)
+        assert abs(rho - 0.1) < 0.001
+    def test_utilisation_saturated(self):
+        # L = 1000 * 0.1 = 100, T = 50, ρ = 2.0 → capped at 1.0
+        rho = compute_utilisation(1000.0, 0.1, 50)
+        assert rho == 1.0
+    def test_utilisation_zero_traffic(self):
+        rho = compute_utilisation(0.0, 0.05, 50)
+        assert rho == 0.0
+    def test_latency_multiplier_low_utilisation(self):
+        mult = compute_queueing_latency_multiplier(0.1)
+        assert 1.0 < mult < 2.0  # ~1.11x
+    def test_latency_multiplier_high_utilisation(self):
+        mult = compute_queueing_latency_multiplier(0.95)
+        assert mult >= 10.0
+    def test_latency_multiplier_saturated(self):
+        mult = compute_queueing_latency_multiplier(0.99)
+        assert mult >= 20.0
+    def test_retry_amplification_no_failures(self):
+        amp = compute_retry_amplification(0.0, 3)
+        assert amp == 1.0
+    def test_retry_amplification_total_failure(self):
+        amp = compute_retry_amplification(1.0, 3)
+        assert amp == 4.0  # 1 + 3 retries
+    def test_retry_amplification_partial(self):
+        amp = compute_retry_amplification(0.5, 3)
+        assert 1.0 < amp < 4.0
+class TestCircuitBreaker:
+    """Circuit breaker state transitions."""
+    def test_starts_closed(self):
+        cb = CircuitBreaker()
+        assert cb.state == BreakerState.CLOSED
+    def test_trips_open_on_high_errors(self):
+        cb = CircuitBreaker(error_threshold=0.5, window_size=3)
+        rng = random.Random(42)
+        for _ in range(5):
+            cb.tick(0.8, rng)
+        assert cb.state == BreakerState.OPEN
+    def test_transitions_to_half_open(self):
+        cb = CircuitBreaker(error_threshold=0.5, cooldown_ticks=5, window_size=2)
+        rng = random.Random(42)
+        # Trip open
+        for _ in range(3):
+            cb.tick(0.9, rng)
+        assert cb.state == BreakerState.OPEN
+        # Wait for cooldown
+        for _ in range(6):
+            cb.tick(0.0, rng)
+        assert cb.state in (BreakerState.HALF_OPEN, BreakerState.CLOSED)
+    def test_dampening_factor(self):
+        cb = CircuitBreaker()
+        assert cb.dampening_factor == 1.0  # CLOSED
+        cb.state = BreakerState.OPEN
+        assert cb.dampening_factor == 0.05
+        cb.state = BreakerState.HALF_OPEN
+        assert cb.dampening_factor == 0.3

tests/test_simulator.py ADDED Viewed

	@@ -0,0 +1,181 @@

+"""Tests for the simulation engine — determinism, actions, SLO scoring."""
+import sys
+import os
+# Ensure project root is on the path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from server.simulator import Simulator
+from server.scenarios import generate_scenario
+def _make_sim(task_id: str = "easy", seed: int = 42) -> Simulator:
+    scenario = generate_scenario(seed, task_id)
+    sim = Simulator()
+    sim.reset(seed=seed, difficulty=scenario.difficulty, failure_specs=scenario.failure_specs)
+    return sim
+class TestDeterminism:
+    """Same seed + same actions = identical state."""
+    def test_reset_determinism(self):
+        sim1 = _make_sim(seed=42)
+        sim2 = _make_sim(seed=42)
+        assert sim1.get_slo_score() == sim2.get_slo_score()
+        assert len(sim1.services) == len(sim2.services)
+        for sid in sim1.services:
+            s1 = sim1.services[sid]
+            s2 = sim2.services[sid]
+            assert s1.error_rate == s2.error_rate
+            assert s1.latency_p99_ms == s2.latency_p99_ms
+    def test_step_determinism(self):
+        sim1 = _make_sim(seed=42)
+        sim2 = _make_sim(seed=42)
+        # Take same actions
+        for _ in range(3):
+            r1 = sim1.step("noop", {})
+            r2 = sim2.step("noop", {})
+            assert r1 == r2
+            assert sim1.get_slo_score() == sim2.get_slo_score()
+    def test_different_seeds_differ(self):
+        sim1 = _make_sim(seed=42)
+        sim2 = _make_sim(seed=999)
+        # Different seeds should (very likely) produce different failure targets
+        failures1 = {s.service_id for s in sim1.failures}
+        failures2 = {s.service_id for s in sim2.failures}
+        # At minimum, graphs or failures should differ (not guaranteed but extremely likely)
+        services1 = set(sim1.services.keys())
+        services2 = set(sim2.services.keys())
+        assert failures1 != failures2 or services1 != services2
+class TestSLOScoring:
+    """SLO score is 0.0–1.0 and reflects service health."""
+    def test_slo_range(self):
+        sim = _make_sim()
+        score = sim.get_slo_score()
+        assert 0.0 <= score <= 1.0
+    def test_initial_slo_below_one(self):
+        """After failure injection, at least one service should be degraded."""
+        sim = _make_sim()
+        assert sim.get_slo_score() < 1.0
+    def test_slo_after_noop(self):
+        sim = _make_sim()
+        sim.step("noop", {})
+        score = sim.get_slo_score()
+        assert 0.0 <= score <= 1.0
+class TestActions:
+    """Action processing works correctly."""
+    def test_noop(self):
+        sim = _make_sim()
+        reward = sim.step("noop", {})
+        assert isinstance(reward, float)
+    def test_inspect_logs(self):
+        sim = _make_sim()
+        # Get any service
+        service_id = list(sim.services.keys())[0]
+        sim.step("inspect_logs", {"service_id": service_id})
+        assert sim.last_logs is not None
+        assert len(sim.last_logs) > 0
+    def test_inspect_metrics(self):
+        sim = _make_sim()
+        service_id = list(sim.services.keys())[0]
+        sim.step("inspect_metrics", {"service_id": service_id})
+        assert sim.last_metric_history is not None
+    def test_inspect_traces(self):
+        sim = _make_sim()
+        service_id = list(sim.services.keys())[0]
+        sim.step("inspect_traces", {"service_id": service_id})
+        assert sim.last_traces is not None
+        assert "trace_id" in sim.last_traces
+        assert "spans" in sim.last_traces
+    def test_restart_service(self):
+        sim = _make_sim()
+        target = sim.failures[0].service_id if sim.failures else list(sim.services.keys())[0]
+        reward = sim.step("restart_service", {"service_id": target})
+        assert isinstance(reward, float)
+        assert len(sim.pending_effects) >= 0  # May or may not have pending
+    def test_invalid_service(self):
+        sim = _make_sim()
+        sim.step("inspect_logs", {"service_id": "nonexistent-service"})
+        assert sim.last_logs is None
+        # Should have a failed action record
+        assert not sim.actions_taken[-1]["success"]
+    def test_unknown_action(self):
+        sim = _make_sim()
+        reward = sim.step("fly_to_moon", {})
+        assert not sim.actions_taken[-1]["success"]
+class TestTermination:
+    """Episode termination logic."""
+    def test_timeout(self):
+        sim = _make_sim(task_id="easy")  # 10 step budget
+        for _ in range(15):
+            if sim.terminated:
+                break
+            sim.step("noop", {})
+        assert sim.terminated
+        assert sim.termination_reason in ("timeout", "resolved", "failed")
+    def test_tick_advances(self):
+        sim = _make_sim()
+        assert sim.tick == 0
+        sim.step("noop", {})
+        assert sim.tick == 1
+        sim.step("noop", {})
+        assert sim.tick == 2
+class TestObservationHelpers:
+    """Observation builder methods."""
+    def test_observation_summary(self):
+        sim = _make_sim()
+        summary = sim.get_observation_summary()
+        assert "Tick" in summary
+        assert "SLO" in summary
+    def test_alerts(self):
+        sim = _make_sim()
+        alerts = sim.get_alerts()
+        assert isinstance(alerts, list)
+        # With failures injected, there should be at least one alert
+        assert len(alerts) > 0
+    def test_legal_actions(self):
+        sim = _make_sim()
+        legal = sim.get_legal_actions()
+        assert isinstance(legal, list)
+        assert len(legal) > 0
+        action_types = {a["action_type"] for a in legal}
+        assert "noop" in action_types
+        assert "inspect_logs" in action_types
+    def test_service_observations(self):
+        sim = _make_sim()
+        obs = sim.get_service_observations()
+        assert isinstance(obs, list)
+        assert len(obs) > 0
+        svc = obs[0]
+        assert "id" in svc
+        assert "error_rate" in svc
+        assert "latency_p99_ms" in svc
+        assert "circuit_breakers" in svc