Mist-ic committed on
Commit
5f8bd3c
·
1 Parent(s): 00225fe

Add tests and fix premature termination bug

Browse files

- tests/test_simulator.py: 22 tests for determinism, SLO scoring, all
action types, termination logic, and observation helpers
- tests/test_grader.py: 5 tests for score bounds, partial credit,
determinism, and resolved bonus
- tests/test_propagation.py: 10 tests for queueing theory (Little's Law,
retry amplification) and circuit breaker state machine
- Fix: episodes no longer terminate as "resolved" when SLO=1.0 but
injected failures haven't been remediated (resource leaks, gradual
failures now properly degrade over time before resolution)
All 37 tests pass.

server/simulator.py CHANGED
@@ -745,8 +745,12 @@ class Simulator:
745
  """Check if the episode should end."""
746
  slo = self.get_slo_score()
747
 
748
- # Success: all SLOs met
749
- if slo >= 1.0:
 
 
 
 
750
  self.terminated = True
751
  self.termination_reason = "resolved"
752
  return
 
745
  """Check if the episode should end."""
746
  slo = self.get_slo_score()
747
 
748
+ # Success: all SLOs met AND all injected failures have been remediated
749
+ all_remediated = all(
750
+ spec.service_id in self.remediated_services
751
+ for spec in self.failures
752
+ )
753
+ if slo >= 1.0 and all_remediated:
754
  self.terminated = True
755
  self.termination_reason = "resolved"
756
  return
tests/__init__.py ADDED
File without changes
tests/test_grader.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the deterministic grader."""
2
+
3
+ import sys
4
+ import os
5
+
6
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
7
+
8
+ from server.grader import grade_episode
9
+
10
+
11
class TestGraderBounds:
    """Grader scores stay within 0.0–1.0 and rank outcomes sensibly."""

    def test_perfect_score(self):
        """A quickly resolved episode at full SLO earns a high, bounded score."""
        history = [
            {"tick": 0, "action": "inspect_logs", "target": "svc", "success": True},
            {"tick": 1, "action": "restart_service", "target": "svc", "success": True},
        ]
        graded = grade_episode(
            final_slo_score=1.0,
            steps_taken=3,
            max_steps=10,
            actions_taken=history,
            terminated=True,
            termination_reason="resolved",
        )
        assert 0.0 <= graded.score <= 1.0
        assert graded.score > 0.8  # Resolved quickly = high score

    def test_zero_score(self):
        """Timing out at SLO 0.0 without taking any action scores exactly zero."""
        graded = grade_episode(
            final_slo_score=0.0,
            steps_taken=10,
            max_steps=10,
            actions_taken=[],
            terminated=True,
            termination_reason="timeout",
        )
        assert graded.score == 0.0

    def test_partial_credit(self):
        """A half-met SLO on timeout lands strictly between 0 and 1."""
        noop_history = [
            {"tick": tick, "action": "noop", "success": True}
            for tick in range(10)
        ]
        graded = grade_episode(
            final_slo_score=0.5,
            steps_taken=10,
            max_steps=10,
            actions_taken=noop_history,
            terminated=True,
            termination_reason="timeout",
        )
        assert 0.0 < graded.score < 1.0

    def test_determinism(self):
        """Grading the same episode twice yields the same score."""
        episode = {
            "final_slo_score": 0.7,
            "steps_taken": 5,
            "max_steps": 20,
            "actions_taken": [
                {"tick": 0, "action": "inspect_logs", "target": "svc", "success": True},
                {"tick": 1, "action": "restart_service", "target": "svc", "success": True},
            ],
            "terminated": True,
            "termination_reason": "timeout",
        }
        first = grade_episode(**episode)
        second = grade_episode(**episode)
        assert first.score == second.score

    def test_resolved_bonus(self):
        """Resolved episodes should score higher than timed-out ones at same SLO."""
        restart_history = [
            {"tick": tick, "action": "restart_service", "target": "svc", "success": True}
            for tick in range(5)
        ]
        resolved = grade_episode(
            final_slo_score=1.0,
            steps_taken=5,
            max_steps=10,
            actions_taken=restart_history,
            terminated=True,
            termination_reason="resolved",
        )

        noop_history = [
            {"tick": tick, "action": "noop", "success": True}
            for tick in range(10)
        ]
        timeout = grade_episode(
            final_slo_score=1.0,
            steps_taken=10,
            max_steps=10,
            actions_taken=noop_history,
            terminated=True,
            termination_reason="timeout",
        )
        assert resolved.score > timeout.score
tests/test_propagation.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for queueing theory and propagation."""
2
+
3
+ import sys
4
+ import os
5
+
6
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
7
+
8
+ from server.propagation import (
9
+ compute_utilisation,
10
+ compute_queueing_latency_multiplier,
11
+ compute_retry_amplification,
12
+ CircuitBreaker,
13
+ BreakerState,
14
+ )
15
+ import random
16
+
17
+
18
class TestQueueingTheory:
    """Little's Law and M/M/c approximations."""

    def test_utilisation_basic(self):
        """Light load gives the expected utilisation."""
        # L = 100 * 0.05 = 5, T = 50, ρ = 0.1
        utilisation = compute_utilisation(100.0, 0.05, 50)
        assert abs(utilisation - 0.1) < 0.001

    def test_utilisation_saturated(self):
        """Overload is clamped to full utilisation."""
        # L = 1000 * 0.1 = 100, T = 50, ρ = 2.0 → capped at 1.0
        utilisation = compute_utilisation(1000.0, 0.1, 50)
        assert utilisation == 1.0

    def test_utilisation_zero_traffic(self):
        """No traffic means zero utilisation."""
        utilisation = compute_utilisation(0.0, 0.05, 50)
        assert utilisation == 0.0

    def test_latency_multiplier_low_utilisation(self):
        """At ρ = 0.1 the latency multiplier is only slightly above 1."""
        factor = compute_queueing_latency_multiplier(0.1)
        assert 1.0 < factor < 2.0  # ~1.11x

    def test_latency_multiplier_high_utilisation(self):
        """At ρ = 0.95 the latency multiplier is at least 10x."""
        factor = compute_queueing_latency_multiplier(0.95)
        assert factor >= 10.0

    def test_latency_multiplier_saturated(self):
        """At ρ = 0.99 the latency multiplier is at least 20x."""
        factor = compute_queueing_latency_multiplier(0.99)
        assert factor >= 20.0

    def test_retry_amplification_no_failures(self):
        """With a zero error rate there is no retry amplification."""
        amplification = compute_retry_amplification(0.0, 3)
        assert amplification == 1.0

    def test_retry_amplification_total_failure(self):
        """With every request failing, each is sent once plus all retries."""
        amplification = compute_retry_amplification(1.0, 3)
        assert amplification == 4.0  # 1 + 3 retries

    def test_retry_amplification_partial(self):
        """A 50% error rate amplifies traffic, but less than total failure."""
        amplification = compute_retry_amplification(0.5, 3)
        assert 1.0 < amplification < 4.0
58
+
59
+
60
class TestCircuitBreaker:
    """Circuit breaker state transitions."""

    def test_starts_closed(self):
        """A freshly constructed breaker is CLOSED."""
        breaker = CircuitBreaker()
        assert breaker.state == BreakerState.CLOSED

    def test_trips_open_on_high_errors(self):
        """Sustained error rates above the threshold trip the breaker OPEN."""
        breaker = CircuitBreaker(error_threshold=0.5, window_size=3)
        rng = random.Random(42)
        ticks_left = 5
        while ticks_left:
            breaker.tick(0.8, rng)
            ticks_left -= 1
        assert breaker.state == BreakerState.OPEN

    def test_transitions_to_half_open(self):
        """After the cooldown, an OPEN breaker moves on to HALF_OPEN or CLOSED."""
        breaker = CircuitBreaker(error_threshold=0.5, cooldown_ticks=5, window_size=2)
        rng = random.Random(42)

        # Trip open
        for _tick in range(3):
            breaker.tick(0.9, rng)
        assert breaker.state == BreakerState.OPEN

        # Wait for cooldown
        for _tick in range(6):
            breaker.tick(0.0, rng)
        recovered_states = (BreakerState.HALF_OPEN, BreakerState.CLOSED)
        assert breaker.state in recovered_states

    def test_dampening_factor(self):
        """Each breaker state exposes its own dampening factor."""
        breaker = CircuitBreaker()
        assert breaker.dampening_factor == 1.0  # CLOSED

        # Force the remaining states directly and check their factors.
        forced_expectations = (
            (BreakerState.OPEN, 0.05),
            (BreakerState.HALF_OPEN, 0.3),
        )
        for forced_state, expected_factor in forced_expectations:
            breaker.state = forced_state
            assert breaker.dampening_factor == expected_factor
tests/test_simulator.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the simulation engine — determinism, actions, SLO scoring."""
2
+
3
+ import sys
4
+ import os
5
+
6
+ # Ensure project root is on the path
7
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
8
+
9
+ from server.simulator import Simulator
10
+ from server.scenarios import generate_scenario
11
+
12
+
13
def _make_sim(task_id: str = "easy", seed: int = 42) -> Simulator:
    """Return a Simulator reset with the scenario generated for seed and task_id."""
    generated = generate_scenario(seed, task_id)
    simulator = Simulator()
    simulator.reset(
        seed=seed,
        difficulty=generated.difficulty,
        failure_specs=generated.failure_specs,
    )
    return simulator
18
+
19
+
20
class TestDeterminism:
    """Same seed + same actions = identical state."""

    def test_reset_determinism(self):
        """Two simulators reset with the same seed start in the same state."""
        left = _make_sim(seed=42)
        right = _make_sim(seed=42)

        assert left.get_slo_score() == right.get_slo_score()
        assert len(left.services) == len(right.services)

        for service_id in left.services:
            left_svc = left.services[service_id]
            right_svc = right.services[service_id]
            assert left_svc.error_rate == right_svc.error_rate
            assert left_svc.latency_p99_ms == right_svc.latency_p99_ms

    def test_step_determinism(self):
        """Identical action sequences give identical rewards and SLO scores."""
        left = _make_sim(seed=42)
        right = _make_sim(seed=42)

        # Take same actions
        steps_remaining = 3
        while steps_remaining:
            left_reward = left.step("noop", {})
            right_reward = right.step("noop", {})
            assert left_reward == right_reward
            assert left.get_slo_score() == right.get_slo_score()
            steps_remaining -= 1

    def test_different_seeds_differ(self):
        """Different seeds should not reproduce both the failures and the graph."""
        left = _make_sim(seed=42)
        right = _make_sim(seed=999)

        # Different seeds should (very likely) produce different failure targets
        left_failures = {spec.service_id for spec in left.failures}
        right_failures = {spec.service_id for spec in right.failures}

        # At minimum, graphs or failures should differ (not guaranteed but extremely likely)
        left_services = set(left.services.keys())
        right_services = set(right.services.keys())

        same_failures = left_failures == right_failures
        same_services = left_services == right_services
        assert not (same_failures and same_services)
54
+
55
+
56
class TestSLOScoring:
    """SLO score is 0.0–1.0 and reflects service health."""

    def test_slo_range(self):
        """The SLO score right after reset lies within [0, 1]."""
        simulator = _make_sim()
        assert 0.0 <= simulator.get_slo_score() <= 1.0

    def test_initial_slo_below_one(self):
        """After failure injection, at least one service should be degraded."""
        simulator = _make_sim()
        initial_score = simulator.get_slo_score()
        assert initial_score < 1.0

    def test_slo_after_noop(self):
        """The SLO score stays within [0, 1] after one noop step."""
        simulator = _make_sim()
        simulator.step("noop", {})
        assert 0.0 <= simulator.get_slo_score() <= 1.0
74
+
75
+
76
class TestActions:
    """Action processing works correctly."""

    def test_noop(self):
        """A noop step returns a float reward."""
        sim = _make_sim()
        reward = sim.step("noop", {})
        assert isinstance(reward, float)

    def test_inspect_logs(self):
        """Inspecting logs of an existing service populates non-empty last_logs."""
        sim = _make_sim()
        # Get any service
        service_id = list(sim.services.keys())[0]
        sim.step("inspect_logs", {"service_id": service_id})
        assert sim.last_logs is not None
        assert len(sim.last_logs) > 0

    def test_inspect_metrics(self):
        """Inspecting metrics of an existing service populates last_metric_history."""
        sim = _make_sim()
        service_id = list(sim.services.keys())[0]
        sim.step("inspect_metrics", {"service_id": service_id})
        assert sim.last_metric_history is not None

    def test_inspect_traces(self):
        """Inspecting traces yields a record with a trace id and spans."""
        sim = _make_sim()
        service_id = list(sim.services.keys())[0]
        sim.step("inspect_traces", {"service_id": service_id})
        assert sim.last_traces is not None
        assert "trace_id" in sim.last_traces
        assert "spans" in sim.last_traces

    def test_restart_service(self):
        """Restarting a service returns a float reward and is recorded."""
        sim = _make_sim()
        # Prefer a service with an injected failure; fall back to any service.
        target = sim.failures[0].service_id if sim.failures else list(sim.services.keys())[0]
        reward = sim.step("restart_service", {"service_id": target})
        assert isinstance(reward, float)
        # Every step appends to the action log (the invalid-service and
        # unknown-action tests rely on this even for failed actions), so the
        # restart must have been recorded.
        assert sim.actions_taken
        # Smoke check only: the pending-effect queue may legitimately be
        # empty, so this merely confirms the attribute is a sized collection.
        assert len(sim.pending_effects) >= 0

    def test_invalid_service(self):
        """Targeting a nonexistent service leaves no logs and records a failure."""
        sim = _make_sim()
        sim.step("inspect_logs", {"service_id": "nonexistent-service"})
        assert sim.last_logs is None
        # Should have a failed action record
        assert not sim.actions_taken[-1]["success"]

    def test_unknown_action(self):
        """An unrecognised action type is recorded as unsuccessful."""
        sim = _make_sim()
        # The reward is irrelevant here; only the recorded outcome matters.
        sim.step("fly_to_moon", {})
        assert not sim.actions_taken[-1]["success"]
124
+
125
+
126
class TestTermination:
    """Episode termination logic."""

    def test_timeout(self):
        """Repeated noops must end the episode with a known termination reason."""
        simulator = _make_sim(task_id="easy")  # 10 step budget
        attempts_left = 15
        # Check for termination before every step, for at most 15 steps.
        while attempts_left and not simulator.terminated:
            simulator.step("noop", {})
            attempts_left -= 1
        assert simulator.terminated
        known_reasons = ("timeout", "resolved", "failed")
        assert simulator.termination_reason in known_reasons

    def test_tick_advances(self):
        """The tick starts at zero and grows by one per step."""
        simulator = _make_sim()
        assert simulator.tick == 0
        for expected_tick in (1, 2):
            simulator.step("noop", {})
            assert simulator.tick == expected_tick
145
+
146
+
147
class TestObservationHelpers:
    """Observation builder methods."""

    def test_observation_summary(self):
        """The summary text mentions both the tick and the SLO."""
        simulator = _make_sim()
        text = simulator.get_observation_summary()
        for keyword in ("Tick", "SLO"):
            assert keyword in text

    def test_alerts(self):
        """Alerts come back as a non-empty list once failures are injected."""
        simulator = _make_sim()
        raised = simulator.get_alerts()
        assert isinstance(raised, list)
        # With failures injected, there should be at least one alert
        assert len(raised) > 0

    def test_legal_actions(self):
        """Legal actions are a non-empty list that includes noop and inspect_logs."""
        simulator = _make_sim()
        candidates = simulator.get_legal_actions()
        assert isinstance(candidates, list)
        assert len(candidates) > 0
        offered_types = {candidate["action_type"] for candidate in candidates}
        assert "noop" in offered_types
        assert "inspect_logs" in offered_types

    def test_service_observations(self):
        """Each service observation exposes the expected keys."""
        simulator = _make_sim()
        observations = simulator.get_service_observations()
        assert isinstance(observations, list)
        assert len(observations) > 0
        first = observations[0]
        for field in ("id", "error_rate", "latency_p99_ms", "circuit_breakers"):
            assert field in first