Mist-ic commited on
Commit
0e4dd30
·
1 Parent(s): a46811c

Add core simulation engine, environment, grader, and app wiring

Browse files

- server/traces.py: distributed trace generation (Jaeger/Zipkin-style spans)
- server/simulator.py: discrete-event simulation engine with tick-based
failure evolution, propagation, action processing, SLO scoring, dense
reward computation, and pending remediation effects
- server/scenarios.py: procedural scenario generation with 3 task
definitions (easy/medium/hard) and intelligent failure placement
- server/environment.py: SevZeroEnvironment(Environment) bridging
OpenEnv SDK contract with the simulator
- server/grader.py: deterministic grading (0.0-1.0) with SLO recovery,
action efficiency, and time efficiency components
- server/app.py: FastAPI app via create_app() + custom /tasks, /grader routes
- .gitignore: added __pycache__ exclusion

.gitignore CHANGED
@@ -2,4 +2,12 @@
2
  Docs/
3
 
4
  # OpenEnv preparatory course (dev reference only, not part of submission)
5
- openenv-course/
 
 
 
 
 
 
 
 
 
2
  Docs/
3
 
4
  # OpenEnv preparatory course (dev reference only, not part of submission)
5
+ openenv-course/
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.pyc
10
+ *.pyo
11
+
12
+ # Environment
13
+ .env
server/app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server/app.py — FastAPI application wiring.
3
+
4
+ Uses OpenEnv SDK's create_app() for core endpoints (/reset, /step, /state, /ws, /health),
5
+ then adds custom routes for /tasks, /grader, and /baseline.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from fastapi import FastAPI
13
+ from openenv.core.env_server import create_app
14
+ from pydantic import BaseModel
15
+
16
+ from models import SevZeroAction, SevZeroObservation
17
+ from server.environment import SevZeroEnvironment
18
+ from server.grader import grade_episode
19
+ from server.scenarios import TASK_DEFINITIONS
20
+
21
+
22
+ # Create the OpenEnv app (wires /reset, /step, /state, /ws, /health, /schema, /metadata)
23
+ app = create_app(
24
+ SevZeroEnvironment,
25
+ SevZeroAction,
26
+ SevZeroObservation,
27
+ env_name="sevzero",
28
+ )
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Custom routes
33
+ # ---------------------------------------------------------------------------
34
+
35
+
36
+ @app.get("/tasks")
37
+ async def list_tasks() -> List[Dict[str, Any]]:
38
+ """Return the 3 task definitions (easy, medium, hard)."""
39
+ return [
40
+ {
41
+ "task_id": t["task_id"],
42
+ "name": t["name"],
43
+ "difficulty": t["difficulty"],
44
+ "description": t["description"],
45
+ "max_steps": t["max_steps"],
46
+ }
47
+ for t in TASK_DEFINITIONS
48
+ ]
49
+
50
+
51
+ class GraderRequest(BaseModel):
52
+ final_slo_score: float
53
+ steps_taken: int
54
+ max_steps: int
55
+ actions_taken: List[Dict[str, Any]]
56
+ terminated: bool
57
+ termination_reason: Optional[str] = None
58
+
59
+
60
+ @app.post("/grader")
61
+ async def grade(request: GraderRequest) -> Dict[str, Any]:
62
+ """
63
+ Deterministic grading endpoint.
64
+ Accepts episode results and returns a score 0.0–1.0 with breakdown.
65
+ """
66
+ result = grade_episode(
67
+ final_slo_score=request.final_slo_score,
68
+ steps_taken=request.steps_taken,
69
+ max_steps=request.max_steps,
70
+ actions_taken=request.actions_taken,
71
+ terminated=request.terminated,
72
+ termination_reason=request.termination_reason,
73
+ )
74
+ return {
75
+ "score": result.score,
76
+ "slo_recovery": result.slo_recovery,
77
+ "action_efficiency": result.action_efficiency,
78
+ "time_efficiency": result.time_efficiency,
79
+ "details": result.details,
80
+ }
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Entry point
85
+ # ---------------------------------------------------------------------------
86
+
87
+
88
+ def main() -> None:
89
+ import uvicorn
90
+ uvicorn.run(app, host="0.0.0.0", port=7860)
91
+
92
+
93
+ if __name__ == "__main__":
94
+ main()
server/environment.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server/environment.py — SevZeroEnvironment: OpenEnv Environment subclass.
3
+
4
+ Bridges the OpenEnv SDK contract (reset/step/state) with the Simulator engine.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import uuid
10
+ from typing import Any, Optional
11
+
12
+ from openenv.core.env_server import Environment
13
+ from openenv.core.env_server.types import EnvironmentMetadata
14
+
15
+ from models import SevZeroAction, SevZeroObservation, SevZeroState
16
+ from server.scenarios import generate_scenario
17
+ from server.simulator import Simulator
18
+
19
+
20
+ class SevZeroEnvironment(Environment[SevZeroAction, SevZeroObservation, SevZeroState]):
21
+ """
22
+ SRE Incident Response Environment.
23
+
24
+ The agent observes service metrics, alerts, and logs, then issues
25
+ remediation commands to restore SLO compliance across a microservice cluster.
26
+ """
27
+
28
+ def __init__(self) -> None:
29
+ super().__init__()
30
+ self._sim = Simulator()
31
+ self._episode_id: Optional[str] = None
32
+ self._task_id: str = "easy"
33
+ self._seed: Optional[int] = None
34
+ self._step_count: int = 0
35
+
36
+ def get_metadata(self) -> EnvironmentMetadata:
37
+ return EnvironmentMetadata(
38
+ name="sevzero",
39
+ description=(
40
+ "SRE Incident Response Environment — an autonomous on-call SRE "
41
+ "managing a microservice cluster undergoing cascading failures"
42
+ ),
43
+ version="1.0.0",
44
+ )
45
+
46
+ def reset(
47
+ self,
48
+ seed: Optional[int] = None,
49
+ episode_id: Optional[str] = None,
50
+ **kwargs: Any,
51
+ ) -> SevZeroObservation:
52
+ self._episode_id = episode_id or str(uuid.uuid4())
53
+ self._task_id = kwargs.get("task_id", "easy")
54
+ self._seed = seed if seed is not None else 42
55
+ self._step_count = 0
56
+
57
+ # Generate scenario and reset simulator
58
+ scenario = generate_scenario(self._seed, self._task_id)
59
+ self._sim.reset(
60
+ seed=self._seed,
61
+ difficulty=scenario.difficulty,
62
+ failure_specs=scenario.failure_specs,
63
+ )
64
+
65
+ return self._build_observation(reward=None, done=False)
66
+
67
+ def step(
68
+ self,
69
+ action: SevZeroAction,
70
+ timeout_s: Optional[float] = None,
71
+ **kwargs: Any,
72
+ ) -> SevZeroObservation:
73
+ self._step_count += 1
74
+
75
+ reward = self._sim.step(action.action_type, action.params)
76
+ done = self._sim.terminated
77
+
78
+ return self._build_observation(reward=reward, done=done)
79
+
80
+ @property
81
+ def state(self) -> SevZeroState:
82
+ return SevZeroState(
83
+ episode_id=self._episode_id,
84
+ step_count=self._step_count,
85
+ task_id=self._task_id,
86
+ seed=self._seed,
87
+ global_slo_score=self._sim.get_slo_score(),
88
+ terminated=self._sim.terminated,
89
+ termination_reason=self._sim.termination_reason,
90
+ )
91
+
92
+ def _build_observation(
93
+ self, reward: Optional[float], done: bool,
94
+ ) -> SevZeroObservation:
95
+ sim = self._sim
96
+ return SevZeroObservation(
97
+ done=done,
98
+ reward=reward,
99
+ # Episode context
100
+ tick=sim.tick,
101
+ episode_id=self._episode_id,
102
+ task_id=self._task_id,
103
+ status=sim.termination_reason or "playing",
104
+ max_steps=sim.max_steps,
105
+ # Health summary
106
+ global_slo_score=round(sim.get_slo_score(), 4),
107
+ observation_summary=sim.get_observation_summary(),
108
+ # Per-service state
109
+ services=sim.get_service_observations(),
110
+ # Alerts
111
+ alerts=sim.get_alerts(),
112
+ # Context
113
+ recent_deploys=[d for d in sim.deploys if d["ticks_ago"] <= 10],
114
+ actions_taken=sim.actions_taken[-10:],
115
+ # Action space
116
+ legal_actions=sim.get_legal_actions(),
117
+ # Diagnostics
118
+ logs=sim.last_logs,
119
+ metric_history=sim.last_metric_history,
120
+ traces=sim.last_traces,
121
+ )
server/grader.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server/grader.py — Deterministic grading for SevZero episodes.
3
+
4
+ Score formula:
5
+ score = slo_recovery * 0.70 + action_efficiency * 0.15 + time_efficiency * 0.15
6
+
7
+ All inputs are derived from the episode state — fully deterministic.
8
+ Score is continuous 0.0–1.0 with partial credit.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass
14
+ from typing import Any, Dict, List, Optional
15
+
16
+
17
+ @dataclass
18
+ class GradeResult:
19
+ """Grading result with breakdown."""
20
+ score: float
21
+ slo_recovery: float
22
+ action_efficiency: float
23
+ time_efficiency: float
24
+ details: Dict[str, Any]
25
+
26
+
27
+ def grade_episode(
28
+ final_slo_score: float,
29
+ steps_taken: int,
30
+ max_steps: int,
31
+ actions_taken: List[Dict[str, Any]],
32
+ terminated: bool,
33
+ termination_reason: Optional[str],
34
+ ) -> GradeResult:
35
+ """
36
+ Grade a completed episode.
37
+
38
+ Args:
39
+ final_slo_score: fraction of services meeting SLO at episode end (0.0–1.0)
40
+ steps_taken: number of steps the agent took
41
+ max_steps: maximum allowed steps for this task
42
+ actions_taken: list of action records
43
+ terminated: whether the episode ended
44
+ termination_reason: "resolved" | "timeout" | "failed" | None
45
+ """
46
+ # --- SLO recovery (70%) ---
47
+ # Direct fraction of services recovered
48
+ slo_recovery = final_slo_score
49
+
50
+ # Bonus for full resolution
51
+ if termination_reason == "resolved":
52
+ slo_recovery = 1.0
53
+
54
+ # --- Action efficiency (15%) ---
55
+ # Penalize wasted actions (noops when degraded, failed actions, redundant inspects)
56
+ total_actions = len(actions_taken)
57
+ if total_actions == 0:
58
+ action_efficiency = 0.0
59
+ else:
60
+ successful = sum(1 for a in actions_taken if a.get("success", False))
61
+ remediation_actions = sum(
62
+ 1 for a in actions_taken
63
+ if a.get("action") not in ("inspect_logs", "inspect_metrics", "inspect_traces", "noop")
64
+ and a.get("success", False)
65
+ )
66
+ inspect_actions = sum(
67
+ 1 for a in actions_taken
68
+ if a.get("action") in ("inspect_logs", "inspect_metrics", "inspect_traces")
69
+ )
70
+
71
+ # Good ratio: some inspection + targeted remediation
72
+ success_rate = successful / total_actions
73
+ # Penalize excessive inspections (>50% of budget is too much looking, not enough doing)
74
+ inspect_penalty = max(0.0, (inspect_actions / total_actions) - 0.5) if total_actions > 0 else 0.0
75
+ action_efficiency = max(0.0, success_rate - inspect_penalty)
76
+
77
+ # --- Time efficiency (15%) ---
78
+ # Faster resolution = higher score
79
+ if max_steps == 0:
80
+ time_efficiency = 0.0
81
+ elif termination_reason == "resolved":
82
+ # Resolved: reward faster resolution
83
+ time_efficiency = max(0.1, 1.0 - (steps_taken / max_steps))
84
+ else:
85
+ # Not resolved: partial credit based on how close we got
86
+ time_efficiency = final_slo_score * 0.3
87
+
88
+ # --- Final score ---
89
+ score = (
90
+ slo_recovery * 0.70
91
+ + action_efficiency * 0.15
92
+ + time_efficiency * 0.15
93
+ )
94
+ score = max(0.0, min(1.0, round(score, 4)))
95
+
96
+ return GradeResult(
97
+ score=score,
98
+ slo_recovery=round(slo_recovery, 4),
99
+ action_efficiency=round(action_efficiency, 4),
100
+ time_efficiency=round(time_efficiency, 4),
101
+ details={
102
+ "final_slo_score": round(final_slo_score, 4),
103
+ "steps_taken": steps_taken,
104
+ "max_steps": max_steps,
105
+ "termination_reason": termination_reason,
106
+ "total_actions": len(actions_taken),
107
+ },
108
+ )
server/scenarios.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server/scenarios.py — Procedural scenario generation from seed + difficulty.
3
+
4
+ Maps difficulty to graph topology, failure count, and failure placement.
5
+ Same seed + same difficulty = identical scenario every time.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import random
11
+ from dataclasses import dataclass, field
12
+ from typing import List, Optional
13
+
14
+ from server.failures import (
15
+ FailureSpec,
16
+ FailureType,
17
+ make_failure_spec,
18
+ select_failure_type,
19
+ select_multi_root_failures,
20
+ )
21
+ from server.graph import ServiceGraph, generate_graph
22
+
23
+
24
+ @dataclass
25
+ class ScenarioConfig:
26
+ """Complete scenario definition for one episode."""
27
+ difficulty: str
28
+ seed: int
29
+ graph: ServiceGraph
30
+ failure_specs: List[FailureSpec]
31
+ max_steps: int
32
+ description: str
33
+
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Task definitions (the 3 required tasks)
37
+ # ---------------------------------------------------------------------------
38
+
39
+ TASK_DEFINITIONS = [
40
+ {
41
+ "task_id": "easy",
42
+ "name": "Single Service Outage",
43
+ "difficulty": "easy",
44
+ "description": (
45
+ "A single service in a small linear microservice chain is experiencing failures. "
46
+ "Diagnose the root cause and apply the correct remediation within 10 steps."
47
+ ),
48
+ "max_steps": 10,
49
+ "num_failures": 1,
50
+ },
51
+ {
52
+ "task_id": "medium",
53
+ "name": "Cascading Failure",
54
+ "difficulty": "medium",
55
+ "description": (
56
+ "A failure in a shared infrastructure service is cascading through a branching "
57
+ "dependency graph. Trace the root cause upstream from symptomatic services and "
58
+ "remediate within 20 steps."
59
+ ),
60
+ "max_steps": 20,
61
+ "num_failures": 1,
62
+ },
63
+ {
64
+ "task_id": "hard",
65
+ "name": "Multi-Root Sev-0 Incident",
66
+ "difficulty": "hard",
67
+ "description": (
68
+ "Multiple simultaneous failures across a multi-region microservice architecture. "
69
+ "Failures may have conflicting mitigations. Triage, diagnose, and resolve all "
70
+ "root causes within 50 steps."
71
+ ),
72
+ "max_steps": 50,
73
+ "num_failures": 3,
74
+ },
75
+ ]
76
+
77
+
78
+ def get_task_definition(task_id: str) -> dict:
79
+ """Get a task definition by ID."""
80
+ for t in TASK_DEFINITIONS:
81
+ if t["task_id"] == task_id:
82
+ return t
83
+ raise ValueError(f"Unknown task_id: {task_id!r}. Must be one of: easy, medium, hard")
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Failure placement logic
88
+ # ---------------------------------------------------------------------------
89
+
90
+
91
+ def _pick_failure_target(
92
+ graph: ServiceGraph,
93
+ failure_type: FailureType,
94
+ rng: random.Random,
95
+ exclude: set,
96
+ ) -> Optional[str]:
97
+ """Pick an appropriate service to inject this failure type into."""
98
+ candidates = []
99
+
100
+ for node in graph.nodes:
101
+ if node.id in exclude:
102
+ continue
103
+
104
+ # Cache failures only on cache services
105
+ if failure_type == FailureType.CACHE_FAILURE:
106
+ if node.is_cache:
107
+ candidates.append(node.id)
108
+ continue
109
+
110
+ # DB degradation on infra services (postgres, etc.)
111
+ if failure_type == FailureType.DB_DEGRADATION:
112
+ if node.layer == "infra" and "postgres" in node.id:
113
+ candidates.append(node.id)
114
+ continue
115
+
116
+ # Network errors prefer non-edge services
117
+ if failure_type == FailureType.NETWORK_ERROR:
118
+ if node.layer != "edge":
119
+ candidates.append(node.id)
120
+ continue
121
+
122
+ # Config errors on any non-edge service
123
+ if failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME):
124
+ if node.layer != "edge":
125
+ candidates.append(node.id)
126
+ continue
127
+
128
+ # Bad deploy on business or identity services
129
+ if failure_type == FailureType.BAD_DEPLOY:
130
+ if node.layer in ("business", "identity"):
131
+ candidates.append(node.id)
132
+ continue
133
+
134
+ # Resource leak on business services
135
+ if failure_type == FailureType.RESOURCE_LEAK:
136
+ if node.layer in ("business", "identity"):
137
+ candidates.append(node.id)
138
+ continue
139
+
140
+ # Crash on any non-edge service
141
+ if failure_type == FailureType.CRASH:
142
+ if node.layer != "edge":
143
+ candidates.append(node.id)
144
+ continue
145
+
146
+ # Cascading latency: prefer hotspot infra or busy business
147
+ if failure_type == FailureType.CASCADING_LATENCY:
148
+ if node.is_hotspot or node.layer == "business":
149
+ candidates.append(node.id)
150
+ continue
151
+
152
+ if not candidates:
153
+ # Fallback: any non-edge service
154
+ candidates = [n.id for n in graph.nodes if n.layer != "edge" and n.id not in exclude]
155
+
156
+ if not candidates:
157
+ return None
158
+
159
+ return rng.choice(candidates)
160
+
161
+
162
+ # ---------------------------------------------------------------------------
163
+ # Scenario generation
164
+ # ---------------------------------------------------------------------------
165
+
166
+
167
+ def generate_scenario(seed: int, task_id: str) -> ScenarioConfig:
168
+ """
169
+ Generate a complete scenario for the given task and seed.
170
+ Deterministic: same seed + same task_id = identical scenario.
171
+ """
172
+ task = get_task_definition(task_id)
173
+ rng = random.Random(seed)
174
+
175
+ # Generate graph
176
+ difficulty = task["difficulty"]
177
+ graph = generate_graph(difficulty, rng)
178
+
179
+ # Select and place failures
180
+ num_failures = task["num_failures"]
181
+ used_services: set = set()
182
+ failure_specs: List[FailureSpec] = []
183
+
184
+ if num_failures == 1:
185
+ ft = select_failure_type(rng)
186
+ target = _pick_failure_target(graph, ft, rng, used_services)
187
+ if target:
188
+ spec = make_failure_spec(target, ft, rng)
189
+ failure_specs.append(spec)
190
+ used_services.add(target)
191
+ else:
192
+ failure_types = select_multi_root_failures(rng, count=num_failures)
193
+ for ft in failure_types:
194
+ target = _pick_failure_target(graph, ft, rng, used_services)
195
+ if target:
196
+ spec = make_failure_spec(target, ft, rng)
197
+ failure_specs.append(spec)
198
+ used_services.add(target)
199
+
200
+ return ScenarioConfig(
201
+ difficulty=difficulty,
202
+ seed=seed,
203
+ graph=graph,
204
+ failure_specs=failure_specs,
205
+ max_steps=task["max_steps"],
206
+ description=task["description"],
207
+ )
server/simulator.py ADDED
@@ -0,0 +1,965 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server/simulator.py — Core discrete-event simulation engine.
3
+
4
+ Orchestrates the service graph, failure injection, metric evolution,
5
+ propagation, log generation, and trace generation into a coherent
6
+ per-tick simulation loop.
7
+
8
+ Fully deterministic: random.Random(seed) exclusively.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import random
14
+ from dataclasses import dataclass, field
15
+ from typing import Any, Dict, List, Optional, Tuple
16
+
17
+ from server.failures import (
18
+ FailureSpec,
19
+ FailureType,
20
+ apply_failure_to_metrics,
21
+ make_failure_spec,
22
+ )
23
+ from server.graph import ServiceGraph, ServiceNode, generate_graph
24
+ from server.logs import generate_healthy_log, generate_log_message
25
+ from server.propagation import (
26
+ CircuitBreaker,
27
+ ServiceRuntimeState,
28
+ propagate_failures,
29
+ )
30
+ from server.traces import generate_trace
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # SLO targets
35
+ # ---------------------------------------------------------------------------
36
+
37
+ # Per-difficulty SLO thresholds: a service is "meeting SLO" if ALL conditions hold
38
+ SLO_TARGETS = {
39
+ "easy": {"max_error_rate": 0.05, "max_p99_ms": 500, "max_cpu": 85, "max_memory": 90},
40
+ "medium": {"max_error_rate": 0.05, "max_p99_ms": 1000, "max_cpu": 90, "max_memory": 90},
41
+ "hard": {"max_error_rate": 0.05, "max_p99_ms": 2000, "max_cpu": 95, "max_memory": 95},
42
+ }
43
+
44
+
45
+ def _service_meets_slo(state: ServiceRuntimeState, difficulty: str) -> bool:
46
+ targets = SLO_TARGETS[difficulty]
47
+ return (
48
+ state.error_rate <= targets["max_error_rate"]
49
+ and state.latency_p99_ms <= targets["max_p99_ms"]
50
+ and state.cpu_pct <= targets["max_cpu"]
51
+ and state.memory_pct <= targets["max_memory"]
52
+ )
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Pending action effects (delayed remediation)
57
+ # ---------------------------------------------------------------------------
58
+
59
+ @dataclass
60
+ class PendingEffect:
61
+ """A remediation action effect that resolves after a delay."""
62
+ action_type: str
63
+ target_service: str
64
+ params: Dict[str, Any]
65
+ resolve_tick: int # Tick at which this effect takes place
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Simulator
70
+ # ---------------------------------------------------------------------------
71
+
72
+ @dataclass
73
+ class Simulator:
74
+ """
75
+ Core simulation engine.
76
+
77
+ Usage:
78
+ sim = Simulator()
79
+ obs_data = sim.reset(seed=42, difficulty="easy")
80
+ obs_data = sim.step(action_type="inspect_logs", params={"service_id": "order-service"})
81
+ """
82
+
83
+ # --- Graph and topology ---
84
+ graph: Optional[ServiceGraph] = None
85
+ difficulty: str = "easy"
86
+
87
+ # --- Mutable per-service state ---
88
+ services: Dict[str, ServiceRuntimeState] = field(default_factory=dict)
89
+
90
+ # --- Failure injection ---
91
+ failures: List[FailureSpec] = field(default_factory=list)
92
+ failure_onset_tick: Dict[str, int] = field(default_factory=dict) # service_id → tick failure started
93
+
94
+ # --- Simulation state ---
95
+ tick: int = 0
96
+ max_steps: int = 10
97
+ terminated: bool = False
98
+ termination_reason: Optional[str] = None
99
+
100
+ # --- Pending remediation effects ---
101
+ pending_effects: List[PendingEffect] = field(default_factory=list)
102
+
103
+ # --- Action history ---
104
+ actions_taken: List[Dict[str, Any]] = field(default_factory=list)
105
+
106
+ # --- Deploy history ---
107
+ deploys: List[Dict[str, Any]] = field(default_factory=list)
108
+
109
+ # --- Diagnostic output (from inspect_* actions, consumed by observation builder) ---
110
+ last_logs: Optional[str] = None
111
+ last_metric_history: Optional[List[Dict[str, Any]]] = None
112
+ last_traces: Optional[Dict[str, Any]] = None
113
+
114
+ # --- Metric history per service (for inspect_metrics) ---
115
+ metric_history: Dict[str, List[Dict[str, Any]]] = field(default_factory=dict)
116
+
117
+ # --- RNG ---
118
+ rng: random.Random = field(default_factory=random.Random)
119
+
120
+ # --- Remediation tracking ---
121
+ remediated_services: Dict[str, int] = field(default_factory=dict) # service_id → tick remediated
122
+
123
+ def reset(
124
+ self,
125
+ seed: int,
126
+ difficulty: str,
127
+ failure_specs: Optional[List[FailureSpec]] = None,
128
+ ) -> None:
129
+ """Initialize a new episode. Call get_observation() after this."""
130
+ self.rng = random.Random(seed)
131
+ self.difficulty = difficulty
132
+ self.tick = 0
133
+ self.terminated = False
134
+ self.termination_reason = None
135
+ self.pending_effects = []
136
+ self.actions_taken = []
137
+ self.deploys = []
138
+ self.last_logs = None
139
+ self.last_metric_history = None
140
+ self.last_traces = None
141
+ self.metric_history = {}
142
+ self.remediated_services = {}
143
+
144
+ # Step budgets
145
+ budgets = {"easy": 10, "medium": 20, "hard": 50}
146
+ self.max_steps = budgets.get(difficulty, 10)
147
+
148
+ # Generate graph
149
+ self.graph = generate_graph(difficulty, self.rng)
150
+
151
+ # Initialize runtime state for each service
152
+ self.services = {}
153
+ for node in self.graph.nodes:
154
+ state = ServiceRuntimeState(
155
+ service_id=node.id,
156
+ arrival_rate=node.base_arrival_rate,
157
+ service_time_local=node.base_service_time_local,
158
+ thread_pool_size=node.thread_pool_size,
159
+ replicas=node.default_replicas,
160
+ version=node.default_version,
161
+ timeout_ms=node.default_timeout_ms,
162
+ retry_max=node.default_retry_max,
163
+ retry_backoff=node.default_retry_backoff,
164
+ pool_size=node.default_pool_size,
165
+ )
166
+ # Initialize circuit breakers for dependencies
167
+ for dep_id in self.graph.adjacency.get(node.id, []):
168
+ state.circuit_breakers[dep_id] = CircuitBreaker(
169
+ error_threshold=node.default_circuit_breaker_threshold,
170
+ )
171
+ self.services[state.service_id] = state
172
+ self.metric_history[state.service_id] = []
173
+
174
+ # Inject failures
175
+ self.failures = failure_specs or []
176
+ self.failure_onset_tick = {}
177
+ for spec in self.failures:
178
+ self.failure_onset_tick[spec.service_id] = 0
179
+ svc = self.services.get(spec.service_id)
180
+ if svc:
181
+ svc.has_active_failure = True
182
+ # Apply bad deploy version
183
+ if spec.failure_type == FailureType.BAD_DEPLOY and spec.bad_version:
184
+ svc.previous_version = svc.version
185
+ svc.version = spec.bad_version
186
+ self.deploys.append({
187
+ "service": spec.service_id,
188
+ "version": spec.bad_version,
189
+ "ticks_ago": 0,
190
+ })
191
+
192
+ # Run initial tick of failure evolution
193
+ self._evolve_failures()
194
+ self._run_propagation()
195
+ self._record_metrics()
196
+
197
+ def step(self, action_type: str, params: Dict[str, Any]) -> float:
198
+ """
199
+ Execute one agent action and advance the simulation by one tick.
200
+ Returns the step reward (dense Δ-SLO shaping).
201
+ """
202
+ if self.terminated:
203
+ return 0.0
204
+
205
+ prev_slo = self.get_slo_score()
206
+
207
+ # Clear diagnostic output from previous step
208
+ self.last_logs = None
209
+ self.last_metric_history = None
210
+ self.last_traces = None
211
+
212
+ # Process the action
213
+ action_record = self._process_action(action_type, params)
214
+ self.actions_taken.append(action_record)
215
+
216
+ # Advance tick
217
+ self.tick += 1
218
+
219
+ # Resolve pending effects
220
+ self._resolve_pending_effects()
221
+
222
+ # Evolve failures (for non-remediated services)
223
+ self._evolve_failures()
224
+
225
+ # Run propagation
226
+ self._run_propagation()
227
+
228
+ # Record metric history
229
+ self._record_metrics()
230
+
231
+ # Update deploy ticks_ago
232
+ for d in self.deploys:
233
+ d["ticks_ago"] += 1
234
+
235
+ # Compute reward
236
+ new_slo = self.get_slo_score()
237
+ reward = self._compute_reward(prev_slo, new_slo, action_type, action_record)
238
+
239
+ # Check termination
240
+ self._check_termination()
241
+
242
+ return reward
243
+
244
+ # -------------------------------------------------------------------
245
+ # Action processing
246
+ # -------------------------------------------------------------------
247
+
248
+ def _process_action(self, action_type: str, params: Dict[str, Any]) -> Dict[str, Any]:
249
+ """Process an agent action. Returns an action record dict."""
250
+ service_id = params.get("service_id")
251
+ record = {
252
+ "tick": self.tick,
253
+ "action": action_type,
254
+ "target": service_id,
255
+ "success": False,
256
+ "note": None,
257
+ }
258
+
259
+ if action_type == "noop":
260
+ record["success"] = True
261
+ record["note"] = "Waited and observed"
262
+ return record
263
+
264
+ if action_type == "inspect_logs":
265
+ return self._do_inspect_logs(service_id, record)
266
+ elif action_type == "inspect_metrics":
267
+ return self._do_inspect_metrics(service_id, record)
268
+ elif action_type == "inspect_traces":
269
+ return self._do_inspect_traces(service_id, record)
270
+ elif action_type == "restart_service":
271
+ return self._do_restart(service_id, record)
272
+ elif action_type == "rollback_service":
273
+ return self._do_rollback(service_id, record)
274
+ elif action_type == "scale_service":
275
+ return self._do_scale(service_id, params, record)
276
+ elif action_type == "tune_config":
277
+ return self._do_tune_config(service_id, params, record)
278
+ elif action_type == "clear_cache":
279
+ return self._do_clear_cache(params, record)
280
+ elif action_type == "rebalance_traffic":
281
+ return self._do_rebalance_traffic(params, record)
282
+ elif action_type == "pause_job":
283
+ return self._do_pause_job(params, record)
284
+ else:
285
+ record["note"] = f"Unknown action type: {action_type}"
286
+ return record
287
+
288
+ def _do_inspect_logs(self, service_id: Optional[str], record: Dict) -> Dict:
289
+ svc = self.services.get(service_id or "")
290
+ if not svc:
291
+ record["note"] = f"Service '{service_id}' not found"
292
+ return record
293
+
294
+ record["success"] = True
295
+ # Generate log output based on service state
296
+ logs_lines = []
297
+ failure = self._get_failure_for_service(service_id)
298
+ if failure and svc.error_rate > 0.01:
299
+ dep = self._get_primary_dependency(service_id)
300
+ for _ in range(self.rng.randint(3, 6)):
301
+ logs_lines.append(generate_log_message(
302
+ failure.failure_type, service_id, self.rng,
303
+ dependency=dep,
304
+ error_rate=svc.error_rate,
305
+ memory_pct=svc.memory_pct,
306
+ p99_ms=svc.latency_p99_ms,
307
+ pool_pct=svc.connection_pool_usage_pct,
308
+ version=svc.version,
309
+ config_key=failure.broken_config_key or "unknown",
310
+ config_value=failure.broken_config_value or "unknown",
311
+ region=self.graph.node_map[service_id].region if self.graph and service_id in self.graph.node_map else "us-east-1",
312
+ throughput=svc.throughput_rps,
313
+ ))
314
+ elif svc.error_rate > 0.01:
315
+ # Propagated errors — show upstream dependency issues
316
+ dep = self._get_primary_dependency(service_id)
317
+ logs_lines.append(f"WARN {service_id} Elevated error rate: {svc.error_rate*100:.1f}%. Upstream dependency {dep} may be degraded.")
318
+ logs_lines.append(f"ERROR {service_id} Request to {dep} failed: timeout after {svc.timeout_ms}ms. Retry 1/{svc.retry_max}.")
319
+ else:
320
+ logs_lines.append(generate_healthy_log(service_id, self.rng))
321
+
322
+ self.last_logs = "\n".join(logs_lines)
323
+ return record
324
+
325
+ def _do_inspect_metrics(self, service_id: Optional[str], record: Dict) -> Dict:
326
+ svc = self.services.get(service_id or "")
327
+ if not svc:
328
+ record["note"] = f"Service '{service_id}' not found"
329
+ return record
330
+
331
+ record["success"] = True
332
+ self.last_metric_history = self.metric_history.get(service_id, [])[-10:]
333
+ return record
334
+
335
+ def _do_inspect_traces(self, service_id: Optional[str], record: Dict) -> Dict:
336
+ svc = self.services.get(service_id or "")
337
+ if not svc or not self.graph:
338
+ record["note"] = f"Service '{service_id}' not found"
339
+ return record
340
+
341
+ record["success"] = True
342
+ errors = {sid: s.error_rate for sid, s in self.services.items()}
343
+ latencies = {sid: s.latency_p99_ms for sid, s in self.services.items()}
344
+ self.last_traces = generate_trace(
345
+ service_id, self.graph, errors, latencies, self.rng,
346
+ )
347
+ return record
348
+
349
+ def _do_restart(self, service_id: Optional[str], record: Dict) -> Dict:
350
+ svc = self.services.get(service_id or "")
351
+ if not svc:
352
+ record["note"] = f"Service '{service_id}' not found"
353
+ return record
354
+
355
+ failure = self._get_failure_for_service(service_id)
356
+ # Restart fixes: CRASH, RESOURCE_LEAK (temporarily), CONFIG_STARTUP (if config was fixed)
357
+ if failure and failure.failure_type in (FailureType.CRASH, FailureType.RESOURCE_LEAK):
358
+ delay = self.rng.randint(1, 2)
359
+ self.pending_effects.append(PendingEffect(
360
+ action_type="restart_service",
361
+ target_service=service_id,
362
+ params={},
363
+ resolve_tick=self.tick + delay,
364
+ ))
365
+ record["success"] = True
366
+ record["note"] = f"Restarting {service_id}, effect in {delay} tick(s)"
367
+ elif failure and failure.failure_type == FailureType.CONFIG_STARTUP:
368
+ # Config startup: restart alone doesn't fix it (need tune_config first)
369
+ record["success"] = True
370
+ record["note"] = f"Restarted {service_id} but config error persists — fix config first"
371
+ elif failure:
372
+ # Restart gives temporary relief for other failures
373
+ delay = self.rng.randint(1, 2)
374
+ self.pending_effects.append(PendingEffect(
375
+ action_type="restart_partial",
376
+ target_service=service_id,
377
+ params={},
378
+ resolve_tick=self.tick + delay,
379
+ ))
380
+ record["success"] = True
381
+ record["note"] = f"Restarting {service_id}, partial recovery expected in {delay} tick(s)"
382
+ else:
383
+ record["success"] = True
384
+ record["note"] = f"{service_id} is healthy, restart had no effect"
385
+ return record
386
+
387
+ def _do_rollback(self, service_id: Optional[str], record: Dict) -> Dict:
388
+ svc = self.services.get(service_id or "")
389
+ if not svc:
390
+ record["note"] = f"Service '{service_id}' not found"
391
+ return record
392
+
393
+ if not svc.previous_version:
394
+ record["note"] = f"No previous version to rollback to for {service_id}"
395
+ return record
396
+
397
+ failure = self._get_failure_for_service(service_id)
398
+ if failure and failure.failure_type == FailureType.BAD_DEPLOY:
399
+ delay = self.rng.randint(2, 3)
400
+ self.pending_effects.append(PendingEffect(
401
+ action_type="rollback_service",
402
+ target_service=service_id,
403
+ params={"version": svc.previous_version},
404
+ resolve_tick=self.tick + delay,
405
+ ))
406
+ record["success"] = True
407
+ record["note"] = f"Rolling back {service_id} to {svc.previous_version}, effect in {delay} tick(s)"
408
+ else:
409
+ record["success"] = True
410
+ record["note"] = f"Rollback queued for {service_id} but issue may not be deploy-related"
411
+ delay = self.rng.randint(2, 3)
412
+ self.pending_effects.append(PendingEffect(
413
+ action_type="rollback_service",
414
+ target_service=service_id,
415
+ params={"version": svc.previous_version},
416
+ resolve_tick=self.tick + delay,
417
+ ))
418
+ return record
419
+
420
+ def _do_scale(self, service_id: Optional[str], params: Dict, record: Dict) -> Dict:
421
+ svc = self.services.get(service_id or "")
422
+ if not svc:
423
+ record["note"] = f"Service '{service_id}' not found"
424
+ return record
425
+
426
+ target_replicas = params.get("replicas", svc.replicas + 1)
427
+ node = self.graph.node_map.get(service_id) if self.graph else None
428
+ max_r = node.max_replicas if node else 8
429
+ target_replicas = max(1, min(target_replicas, max_r))
430
+
431
+ delay = self.rng.randint(2, 4)
432
+ self.pending_effects.append(PendingEffect(
433
+ action_type="scale_service",
434
+ target_service=service_id,
435
+ params={"replicas": target_replicas},
436
+ resolve_tick=self.tick + delay,
437
+ ))
438
+ record["success"] = True
439
+ record["note"] = f"Scaling {service_id} to {target_replicas} replicas, effect in {delay} tick(s)"
440
+ return record
441
+
442
+ def _do_tune_config(self, service_id: Optional[str], params: Dict, record: Dict) -> Dict:
443
+ svc = self.services.get(service_id or "")
444
+ if not svc:
445
+ record["note"] = f"Service '{service_id}' not found"
446
+ return record
447
+
448
+ key = params.get("key", "")
449
+ value = params.get("value", "")
450
+ record["success"] = True
451
+ record["target"] = service_id
452
+
453
+ failure = self._get_failure_for_service(service_id)
454
+ if failure and failure.failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME):
455
+ if key == failure.broken_config_key:
456
+ # Correct fix!
457
+ self.pending_effects.append(PendingEffect(
458
+ action_type="tune_config_fix",
459
+ target_service=service_id,
460
+ params={"key": key, "value": value},
461
+ resolve_tick=self.tick + 1,
462
+ ))
463
+ record["note"] = f"Config key '{key}' updated on {service_id}. Fix takes effect next tick."
464
+ else:
465
+ record["note"] = f"Config key '{key}' updated on {service_id}, but this may not be the broken key."
466
+ else:
467
+ # General config tune (e.g., timeout, retry)
468
+ self._apply_config_immediately(svc, key, value)
469
+ record["note"] = f"Config '{key}'={value} applied to {service_id}"
470
+ return record
471
+
472
+ def _do_clear_cache(self, params: Dict, record: Dict) -> Dict:
473
+ cache_name = params.get("cache_name") or params.get("service_id", "")
474
+ record["target"] = cache_name
475
+
476
+ if not self.graph or cache_name not in self.graph.cache_services:
477
+ record["note"] = f"'{cache_name}' is not a cache service"
478
+ return record
479
+
480
+ failure = self._get_failure_for_service(cache_name)
481
+ if failure and failure.failure_type == FailureType.CACHE_FAILURE:
482
+ self.pending_effects.append(PendingEffect(
483
+ action_type="clear_cache",
484
+ target_service=cache_name,
485
+ params={},
486
+ resolve_tick=self.tick + 1,
487
+ ))
488
+ record["success"] = True
489
+ record["note"] = f"Flushing cache {cache_name}, recovery in 1 tick"
490
+ else:
491
+ record["success"] = True
492
+ record["note"] = f"Cache {cache_name} flushed (was not failing)"
493
+ return record
494
+
495
+ def _do_rebalance_traffic(self, params: Dict, record: Dict) -> Dict:
496
+ from_region = params.get("from_region", "")
497
+ to_region = params.get("to_region", "")
498
+ pct = params.get("pct", 50)
499
+ record["target"] = f"{from_region}->{to_region}"
500
+
501
+ if not self.graph or not self.graph.has_multiple_regions:
502
+ record["note"] = "Traffic rebalancing only available in multi-region (hard) mode"
503
+ return record
504
+
505
+ delay = self.rng.randint(2, 3)
506
+ self.pending_effects.append(PendingEffect(
507
+ action_type="rebalance_traffic",
508
+ target_service="",
509
+ params={"from_region": from_region, "to_region": to_region, "pct": pct},
510
+ resolve_tick=self.tick + delay,
511
+ ))
512
+ record["success"] = True
513
+ record["note"] = f"Shifting {pct}% traffic from {from_region} to {to_region}, effect in {delay} tick(s)"
514
+ return record
515
+
516
+ def _do_pause_job(self, params: Dict, record: Dict) -> Dict:
517
+ job_name = params.get("job_name") or params.get("service_id", "")
518
+ record["target"] = job_name
519
+
520
+ if not self.graph or job_name not in self.graph.background_jobs:
521
+ record["note"] = f"'{job_name}' is not a background job service"
522
+ return record
523
+
524
+ svc = self.services.get(job_name)
525
+ if svc:
526
+ svc.arrival_rate *= 0.3 # Reduce load significantly
527
+ record["success"] = True
528
+ record["note"] = f"Background job on {job_name} paused, load reduced"
529
+ return record
530
+
531
+ # -------------------------------------------------------------------
532
+ # Effect resolution
533
+ # -------------------------------------------------------------------
534
+
535
+ def _resolve_pending_effects(self) -> None:
536
+ """Resolve pending effects that have reached their tick."""
537
+ still_pending = []
538
+ for effect in self.pending_effects:
539
+ if self.tick >= effect.resolve_tick:
540
+ self._apply_effect(effect)
541
+ else:
542
+ still_pending.append(effect)
543
+ self.pending_effects = still_pending
544
+
545
+ def _apply_effect(self, effect: PendingEffect) -> None:
546
+ svc = self.services.get(effect.target_service)
547
+
548
+ if effect.action_type == "restart_service":
549
+ # Full restart: clears crash/leak failures
550
+ if svc:
551
+ self._remediate_service(effect.target_service)
552
+ svc.memory_pct = 30.0 # Reset memory (leak fix)
553
+
554
+ elif effect.action_type == "restart_partial":
555
+ # Partial: temporary relief
556
+ if svc:
557
+ svc.error_rate *= 0.5
558
+ svc.memory_pct = max(30.0, svc.memory_pct * 0.7)
559
+
560
+ elif effect.action_type == "rollback_service":
561
+ if svc:
562
+ version = effect.params.get("version", svc.previous_version)
563
+ svc.version = version
564
+ svc.previous_version = None
565
+ self._remediate_service(effect.target_service)
566
+ self.deploys.append({
567
+ "service": effect.target_service,
568
+ "version": version,
569
+ "ticks_ago": 0,
570
+ })
571
+
572
+ elif effect.action_type == "scale_service":
573
+ if svc:
574
+ svc.replicas = effect.params.get("replicas", svc.replicas)
575
+
576
+ elif effect.action_type == "tune_config_fix":
577
+ self._remediate_service(effect.target_service)
578
+ # If config_startup, also need a restart — but we apply partial fix
579
+ failure = self._get_failure_for_service(effect.target_service)
580
+ if failure and failure.failure_type == FailureType.CONFIG_STARTUP:
581
+ # Config fixed + implicit restart
582
+ if svc:
583
+ svc.error_rate = 0.02 # Near-zero while restarting
584
+
585
+ elif effect.action_type == "clear_cache":
586
+ self._remediate_service(effect.target_service)
587
+
588
+ elif effect.action_type == "rebalance_traffic":
589
+ # Reduce arrival rate in from_region, increase in to_region
590
+ from_region = effect.params.get("from_region", "")
591
+ to_region = effect.params.get("to_region", "")
592
+ pct = effect.params.get("pct", 50) / 100.0
593
+ if self.graph:
594
+ for node in self.graph.nodes:
595
+ s = self.services.get(node.id)
596
+ if not s:
597
+ continue
598
+ if node.region == from_region:
599
+ s.arrival_rate *= (1 - pct)
600
+ elif node.region == to_region:
601
+ s.arrival_rate *= (1 + pct * 0.5) # Some traffic absorbed
602
+
603
+ def _remediate_service(self, service_id: str) -> None:
604
+ """Mark a service as remediated — stop failure evolution."""
605
+ self.remediated_services[service_id] = self.tick
606
+ svc = self.services.get(service_id)
607
+ if svc:
608
+ svc.has_active_failure = False
609
+ svc.failure_ticks = 0
610
+
611
+ def _apply_config_immediately(self, svc: ServiceRuntimeState, key: str, value: Any) -> None:
612
+ """Apply a config change that takes effect immediately."""
613
+ if key == "timeout_ms":
614
+ svc.timeout_ms = int(value)
615
+ elif key == "retry_max":
616
+ svc.retry_max = int(value)
617
+ elif key == "pool_size":
618
+ svc.pool_size = int(value)
619
+ elif key == "retry_backoff":
620
+ svc.retry_backoff = bool(value)
621
+
622
+ # -------------------------------------------------------------------
623
+ # Failure evolution
624
+ # -------------------------------------------------------------------
625
+
626
+ def _evolve_failures(self) -> None:
627
+ """Evolve all active failures by one tick."""
628
+ for spec in self.failures:
629
+ sid = spec.service_id
630
+ if sid in self.remediated_services:
631
+ # Remediated — gradually recover
632
+ svc = self.services.get(sid)
633
+ if svc:
634
+ svc.error_rate = max(0.0, svc.error_rate * 0.5)
635
+ svc.latency_p99_ms = max(50.0, svc.latency_p99_ms * 0.7)
636
+ svc.cpu_pct = max(10.0, svc.cpu_pct * 0.8)
637
+ svc.memory_pct = max(25.0, svc.memory_pct * 0.9)
638
+ svc.connection_pool_usage_pct = max(5.0, svc.connection_pool_usage_pct * 0.7)
639
+ svc.status = svc.compute_status()
640
+ continue
641
+
642
+ svc = self.services.get(sid)
643
+ if not svc:
644
+ continue
645
+
646
+ onset = self.failure_onset_tick.get(sid, 0)
647
+ ticks_since = self.tick - onset
648
+
649
+ node = self.graph.node_map.get(sid) if self.graph else None
650
+ base_p99 = 100.0
651
+ base_cpu = 15.0
652
+ base_memory = 30.0
653
+ base_pool = 10.0
654
+
655
+ error_rate, p99_ms, cpu_pct, memory_pct, pool_pct = apply_failure_to_metrics(
656
+ spec, ticks_since,
657
+ base_error_rate=0.0,
658
+ base_p99_ms=base_p99,
659
+ base_cpu=base_cpu,
660
+ base_memory=base_memory,
661
+ base_pool=base_pool,
662
+ rng=self.rng,
663
+ )
664
+
665
+ svc.error_rate = error_rate
666
+ svc.update_latency_percentiles(base_p99, p99_ms / base_p99, self.rng)
667
+ svc.cpu_pct = cpu_pct
668
+ svc.memory_pct = memory_pct
669
+ svc.connection_pool_usage_pct = pool_pct
670
+ svc.failure_ticks = ticks_since
671
+ svc.status = svc.compute_status()
672
+
673
+ def _run_propagation(self) -> None:
674
+ """Run propagation engine to cascade failures through the graph."""
675
+ if not self.graph:
676
+ return
677
+
678
+ edge_activation = {}
679
+ for edge in self.graph.edges:
680
+ edge_activation[(edge.source, edge.target)] = edge.activation_probability
681
+
682
+ propagate_failures(
683
+ self.services,
684
+ self.graph.adjacency,
685
+ self.graph.reverse_adjacency,
686
+ edge_activation,
687
+ self.rng,
688
+ current_tick=self.tick,
689
+ )
690
+
691
+ # -------------------------------------------------------------------
692
+ # Metric recording
693
+ # -------------------------------------------------------------------
694
+
695
+ def _record_metrics(self) -> None:
696
+ """Record current metrics snapshot for all services."""
697
+ for sid, svc in self.services.items():
698
+ self.metric_history[sid].append({
699
+ "tick": self.tick,
700
+ "error_rate": round(svc.error_rate, 4),
701
+ "latency_p99_ms": round(svc.latency_p99_ms, 1),
702
+ "cpu_pct": round(svc.cpu_pct, 1),
703
+ "memory_pct": round(svc.memory_pct, 1),
704
+ "pool_pct": round(svc.connection_pool_usage_pct, 1),
705
+ "throughput_rps": round(svc.throughput_rps, 1),
706
+ "status": svc.status,
707
+ })
708
+
709
+ # -------------------------------------------------------------------
710
+ # Reward computation
711
+ # -------------------------------------------------------------------
712
+
713
+ def _compute_reward(
714
+ self, prev_slo: float, new_slo: float,
715
+ action_type: str, record: Dict,
716
+ ) -> float:
717
+ """Dense Δ-SLO reward with action-type penalties."""
718
+ # Base: delta SLO (positive = improvement)
719
+ delta = new_slo - prev_slo
720
+ reward = delta * 10.0 # Scale up for signal strength
721
+
722
+ # Bonus for reaching full recovery
723
+ if new_slo >= 1.0:
724
+ reward += 5.0
725
+
726
+ # Penalty for invalid/failed actions
727
+ if not record.get("success", False):
728
+ reward -= 0.5
729
+
730
+ # Small penalty for non-diagnostic actions (encourage efficiency)
731
+ if action_type not in ("inspect_logs", "inspect_metrics", "inspect_traces", "noop"):
732
+ reward -= 0.1 # Small cost for remediation actions
733
+
734
+ # Penalty for redundant noops when system is degraded
735
+ if action_type == "noop" and new_slo < 0.9:
736
+ reward -= 0.2
737
+
738
+ return round(reward, 4)
739
+
740
+ # -------------------------------------------------------------------
741
+ # Termination
742
+ # -------------------------------------------------------------------
743
+
744
+ def _check_termination(self) -> None:
745
+ """Check if the episode should end."""
746
+ slo = self.get_slo_score()
747
+
748
+ # Success: all SLOs met
749
+ if slo >= 1.0:
750
+ self.terminated = True
751
+ self.termination_reason = "resolved"
752
+ return
753
+
754
+ # Timeout: exceeded step budget
755
+ if self.tick >= self.max_steps:
756
+ self.terminated = True
757
+ self.termination_reason = "timeout"
758
+ return
759
+
760
+ # System collapse: all services down
761
+ down_count = sum(1 for s in self.services.values() if s.status == "down")
762
+ if down_count == len(self.services) and len(self.services) > 0:
763
+ self.terminated = True
764
+ self.termination_reason = "failed"
765
+
766
+ # -------------------------------------------------------------------
767
+ # Observation helpers
768
+ # -------------------------------------------------------------------
769
+
770
+ def get_slo_score(self) -> float:
771
+ """Fraction of services meeting SLO targets."""
772
+ if not self.services:
773
+ return 0.0
774
+ meeting = sum(1 for s in self.services.values() if _service_meets_slo(s, self.difficulty))
775
+ return meeting / len(self.services)
776
+
777
+ def get_observation_summary(self) -> str:
778
+ """Generate a natural-language summary of the current state."""
779
+ slo = self.get_slo_score()
780
+ total = len(self.services)
781
+ healthy = sum(1 for s in self.services.values() if s.status == "healthy")
782
+ degraded = sum(1 for s in self.services.values() if s.status == "degraded")
783
+ critical = sum(1 for s in self.services.values() if s.status == "critical")
784
+ down = sum(1 for s in self.services.values() if s.status == "down")
785
+
786
+ parts = []
787
+ if down > 0:
788
+ parts.append(f"{down} service(s) DOWN")
789
+ if critical > 0:
790
+ parts.append(f"{critical} CRITICAL")
791
+ if degraded > 0:
792
+ parts.append(f"{degraded} degraded")
793
+ if healthy > 0:
794
+ parts.append(f"{healthy} healthy")
795
+
796
+ status_str = ", ".join(parts) if parts else "all nominal"
797
+ return f"Tick {self.tick}/{self.max_steps}: SLO compliance {slo*100:.0f}% ({status_str}). {total} services total."
798
+
799
+ def get_alerts(self) -> List[Dict[str, Any]]:
800
+ """Generate active alerts from current service states."""
801
+ alerts = []
802
+ for sid, svc in self.services.items():
803
+ if svc.error_rate >= 0.50:
804
+ alerts.append({
805
+ "severity": "critical",
806
+ "service": sid,
807
+ "type": "error_rate_high",
808
+ "message": f"{sid} error rate at {svc.error_rate*100:.0f}%",
809
+ "first_seen_tick": max(0, self.tick - svc.failure_ticks),
810
+ })
811
+ elif svc.error_rate >= 0.05:
812
+ alerts.append({
813
+ "severity": "warning",
814
+ "service": sid,
815
+ "type": "error_rate_high",
816
+ "message": f"{sid} error rate elevated at {svc.error_rate*100:.1f}%",
817
+ "first_seen_tick": max(0, self.tick - svc.failure_ticks),
818
+ })
819
+
820
+ if svc.latency_p99_ms >= 5000:
821
+ alerts.append({
822
+ "severity": "critical",
823
+ "service": sid,
824
+ "type": "latency_high",
825
+ "message": f"{sid} p99 latency {svc.latency_p99_ms:.0f}ms",
826
+ "first_seen_tick": max(0, self.tick - svc.failure_ticks),
827
+ })
828
+ elif svc.latency_p99_ms >= 1000:
829
+ alerts.append({
830
+ "severity": "warning",
831
+ "service": sid,
832
+ "type": "latency_high",
833
+ "message": f"{sid} p99 latency elevated at {svc.latency_p99_ms:.0f}ms",
834
+ "first_seen_tick": max(0, self.tick - svc.failure_ticks),
835
+ })
836
+
837
+ if svc.status == "down":
838
+ alerts.append({
839
+ "severity": "critical",
840
+ "service": sid,
841
+ "type": "service_down",
842
+ "message": f"{sid} is DOWN",
843
+ "first_seen_tick": max(0, self.tick - svc.failure_ticks),
844
+ })
845
+
846
+ if svc.memory_pct >= 90:
847
+ alerts.append({
848
+ "severity": "warning",
849
+ "service": sid,
850
+ "type": "memory_high",
851
+ "message": f"{sid} memory at {svc.memory_pct:.0f}%",
852
+ "first_seen_tick": max(0, self.tick - svc.failure_ticks),
853
+ })
854
+
855
+ if svc.connection_pool_usage_pct >= 80:
856
+ alerts.append({
857
+ "severity": "warning",
858
+ "service": sid,
859
+ "type": "connection_pool_saturated",
860
+ "message": f"{sid} connection pool at {svc.connection_pool_usage_pct:.0f}%",
861
+ "first_seen_tick": max(0, self.tick - svc.failure_ticks),
862
+ })
863
+
864
+ # Circuit breaker alerts
865
+ for dep_id, breaker in svc.circuit_breakers.items():
866
+ if breaker.state.value == "OPEN":
867
+ alerts.append({
868
+ "severity": "warning",
869
+ "service": sid,
870
+ "type": "circuit_breaker_open",
871
+ "message": f"{sid} circuit breaker OPEN for {dep_id}",
872
+ "first_seen_tick": max(0, self.tick - breaker.ticks_in_current_state),
873
+ })
874
+
875
+ # Sort by severity (critical first)
876
+ severity_order = {"critical": 0, "warning": 1, "info": 2}
877
+ alerts.sort(key=lambda a: severity_order.get(a["severity"], 9))
878
+ return alerts
879
+
880
+ def get_legal_actions(self) -> List[Dict[str, Any]]:
881
+ """Return the set of currently legal actions with valid targets."""
882
+ service_ids = list(self.services.keys())
883
+ actions = [
884
+ {"action_type": "noop", "valid_targets": []},
885
+ {"action_type": "inspect_logs", "valid_targets": service_ids},
886
+ {"action_type": "inspect_metrics", "valid_targets": service_ids},
887
+ {"action_type": "inspect_traces", "valid_targets": service_ids},
888
+ {"action_type": "restart_service", "valid_targets": service_ids},
889
+ ]
890
+
891
+ # Rollback: only services with previous versions
892
+ rollback_targets = [sid for sid, s in self.services.items() if s.previous_version]
893
+ if rollback_targets:
894
+ actions.append({"action_type": "rollback_service", "valid_targets": rollback_targets})
895
+
896
+ # Scale: all services
897
+ actions.append({"action_type": "scale_service", "valid_targets": service_ids})
898
+
899
+ # Tune config: all services
900
+ actions.append({"action_type": "tune_config", "valid_targets": service_ids})
901
+
902
+ # Clear cache: only cache services
903
+ if self.graph and self.graph.cache_services:
904
+ actions.append({"action_type": "clear_cache", "valid_targets": self.graph.cache_services})
905
+
906
+ # Rebalance traffic: only in multi-region
907
+ if self.graph and self.graph.has_multiple_regions:
908
+ actions.append({
909
+ "action_type": "rebalance_traffic",
910
+ "valid_targets": self.graph.regions,
911
+ })
912
+
913
+ # Pause job: only background job services
914
+ if self.graph and self.graph.background_jobs:
915
+ actions.append({"action_type": "pause_job", "valid_targets": self.graph.background_jobs})
916
+
917
+ return actions
918
+
919
+ def get_service_observations(self) -> List[Dict[str, Any]]:
920
+ """Build per-service observation dicts."""
921
+ result = []
922
+ for sid, svc in self.services.items():
923
+ node = self.graph.node_map.get(sid) if self.graph else None
924
+ deps = self.graph.adjacency.get(sid, []) if self.graph else []
925
+ cb_states = {
926
+ dep: breaker.state.value
927
+ for dep, breaker in svc.circuit_breakers.items()
928
+ }
929
+ result.append({
930
+ "id": sid,
931
+ "layer": node.layer if node else "unknown",
932
+ "status": svc.status,
933
+ "error_rate": round(svc.error_rate, 4),
934
+ "latency_p50_ms": round(svc.latency_p50_ms, 1),
935
+ "latency_p95_ms": round(svc.latency_p95_ms, 1),
936
+ "latency_p99_ms": round(svc.latency_p99_ms, 1),
937
+ "throughput_rps": round(svc.throughput_rps, 1),
938
+ "cpu_pct": round(svc.cpu_pct, 1),
939
+ "memory_pct": round(svc.memory_pct, 1),
940
+ "connection_pool_usage_pct": round(svc.connection_pool_usage_pct, 1),
941
+ "replicas": svc.replicas,
942
+ "version": svc.version,
943
+ "previous_version": svc.previous_version,
944
+ "depends_on": deps,
945
+ "circuit_breakers": cb_states,
946
+ })
947
+ return result
948
+
949
+ # -------------------------------------------------------------------
950
+ # Internal helpers
951
+ # -------------------------------------------------------------------
952
+
953
+ def _get_failure_for_service(self, service_id: Optional[str]) -> Optional[FailureSpec]:
954
+ if not service_id:
955
+ return None
956
+ for spec in self.failures:
957
+ if spec.service_id == service_id and service_id not in self.remediated_services:
958
+ return spec
959
+ return None
960
+
961
+ def _get_primary_dependency(self, service_id: Optional[str]) -> str:
962
+ if not service_id or not self.graph:
963
+ return "unknown"
964
+ deps = self.graph.adjacency.get(service_id, [])
965
+ return deps[0] if deps else "unknown"
server/traces.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server/traces.py — Distributed trace generation for inspect_traces action.
3
+
4
+ Generates realistic Jaeger/Zipkin-style trace trees showing request flow
5
+ through the service dependency graph. Healthy services show normal latencies;
6
+ failing services show errors, timeouts, and cascading delays.
7
+
8
+ Each trace is a tree of spans rooted at the inspected service.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import random
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from server.graph import ServiceGraph
17
+
18
+
19
+ def _make_span_id(rng: random.Random) -> str:
20
+ return f"{rng.randint(0, 0xFFFFFFFF):08x}"
21
+
22
+
23
+ def _make_trace_id(rng: random.Random) -> str:
24
+ return f"{rng.randint(0, 0xFFFFFFFFFFFFFFFF):016x}"
25
+
26
+
27
+ def generate_trace(
28
+ service_id: str,
29
+ graph: ServiceGraph,
30
+ service_errors: Dict[str, float],
31
+ service_latencies: Dict[str, float],
32
+ rng: random.Random,
33
+ max_depth: int = 4,
34
+ ) -> Dict[str, Any]:
35
+ """
36
+ Generate a distributed trace tree rooted at service_id.
37
+
38
+ Returns a dict with trace_id, root_span, and flat spans list.
39
+ service_errors: service_id → error_rate (0.0–1.0)
40
+ service_latencies: service_id → p99_ms
41
+ """
42
+ trace_id = _make_trace_id(rng)
43
+ spans: List[Dict[str, Any]] = []
44
+
45
+ def _build_span(
46
+ svc_id: str,
47
+ parent_span_id: Optional[str],
48
+ depth: int,
49
+ start_offset_ms: float,
50
+ ) -> Dict[str, Any]:
51
+ span_id = _make_span_id(rng)
52
+ error_rate = service_errors.get(svc_id, 0.0)
53
+ base_latency = service_latencies.get(svc_id, rng.uniform(5, 50))
54
+ has_error = rng.random() < error_rate
55
+
56
+ # Span duration: base latency + noise
57
+ if has_error and error_rate > 0.8:
58
+ # Fast fail or timeout
59
+ duration_ms = rng.choice([
60
+ rng.uniform(0.5, 5), # Fast fail
61
+ rng.uniform(3000, 10000), # Timeout
62
+ ])
63
+ elif has_error:
64
+ duration_ms = base_latency * rng.uniform(1.5, 5.0)
65
+ else:
66
+ duration_ms = base_latency * rng.uniform(0.3, 1.2)
67
+
68
+ duration_ms = max(0.1, duration_ms)
69
+
70
+ span = {
71
+ "span_id": span_id,
72
+ "parent_span_id": parent_span_id,
73
+ "service": svc_id,
74
+ "operation": _operation_name(svc_id, rng),
75
+ "start_ms": round(start_offset_ms, 1),
76
+ "duration_ms": round(duration_ms, 1),
77
+ "status": "ERROR" if has_error else "OK",
78
+ "tags": {},
79
+ }
80
+
81
+ if has_error:
82
+ span["tags"]["error"] = True
83
+ span["tags"]["error.message"] = _error_message(svc_id, error_rate, rng)
84
+
85
+ node = graph.node_map.get(svc_id)
86
+ if node:
87
+ span["tags"]["service.layer"] = node.layer
88
+ span["tags"]["service.region"] = node.region
89
+
90
+ spans.append(span)
91
+
92
+ # Recurse into downstream dependencies
93
+ if depth < max_depth:
94
+ deps = graph.adjacency.get(svc_id, [])
95
+ child_offset = start_offset_ms + rng.uniform(0.1, 2.0)
96
+ for dep_id in deps:
97
+ # Check edge activation (probabilistic)
98
+ edge = next(
99
+ (e for e in graph.edges if e.source == svc_id and e.target == dep_id),
100
+ None,
101
+ )
102
+ if edge and rng.random() > edge.activation_probability:
103
+ continue
104
+
105
+ child_span = _build_span(dep_id, span_id, depth + 1, child_offset)
106
+ child_offset += child_span["duration_ms"] + rng.uniform(0.1, 1.0)
107
+
108
+ return span
109
+
110
+ root_span = _build_span(service_id, None, 0, 0.0)
111
+
112
+ # Compute total trace duration
113
+ if spans:
114
+ total_duration = max(s["start_ms"] + s["duration_ms"] for s in spans)
115
+ else:
116
+ total_duration = 0.0
117
+
118
+ return {
119
+ "trace_id": trace_id,
120
+ "root_service": service_id,
121
+ "span_count": len(spans),
122
+ "total_duration_ms": round(total_duration, 1),
123
+ "spans": spans,
124
+ }
125
+
126
+
127
+ def _operation_name(service_id: str, rng: random.Random) -> str:
128
+ """Generate a realistic operation name based on service type."""
129
+ if "gateway" in service_id or "bff" in service_id:
130
+ return rng.choice(["HTTP GET /api/v1/resource", "HTTP POST /api/v1/action", "HTTP GET /health"])
131
+ if "auth" in service_id or "identity" in service_id or "session" in service_id:
132
+ return rng.choice(["validateToken", "authenticate", "refreshSession"])
133
+ if "postgres" in service_id:
134
+ return rng.choice(["SELECT", "INSERT", "UPDATE", "pg_pool.checkout"])
135
+ if "redis" in service_id:
136
+ return rng.choice(["GET", "SET", "MGET", "EXPIRE"])
137
+ if "kafka" in service_id:
138
+ return rng.choice(["produce", "consume", "commitOffset"])
139
+ if "elasticsearch" in service_id:
140
+ return rng.choice(["search", "index", "bulk"])
141
+ return rng.choice(["processRequest", "handleMessage", "execute"])
142
+
143
+
144
+ def _error_message(service_id: str, error_rate: float, rng: random.Random) -> str:
145
+ """Generate a trace-level error message."""
146
+ if error_rate > 0.8:
147
+ return rng.choice([
148
+ f"{service_id}: Connection refused",
149
+ f"{service_id}: Service unavailable (HTTP 503)",
150
+ f"{service_id}: Timeout after 5000ms",
151
+ ])
152
+ return rng.choice([
153
+ f"{service_id}: Internal server error (HTTP 500)",
154
+ f"{service_id}: Upstream dependency timeout",
155
+ f"{service_id}: Rate limited (HTTP 429)",
156
+ f"{service_id}: Bad gateway (HTTP 502)",
157
+ ])