Mist-ic commited on
Commit
64d38cb
·
0 Parent(s):

Checkpoint: existing implementation (pre-cleanup)

Browse files
Files changed (8) hide show
  1. .gitignore +5 -0
  2. models.py +300 -0
  3. openenv.yaml +10 -0
  4. pyproject.toml +32 -0
  5. sdk_info.txt +24 -0
  6. server/__init__.py +1 -0
  7. server/failures.py +437 -0
  8. server/graph.py +470 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Documentation and research (not part of the submission)
2
+ Docs/
3
+
4
+ # OpenEnv preparatory course (dev reference only, not part of submission)
5
+ openenv-course/
models.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SevZero — Typed Pydantic models for Action, Observation, and State.
3
+
4
+ These are the public API contracts at the package root (OpenEnv requirement).
5
+ Every field is documented because the observation JSON must be self-explanatory
6
+ to any LLM evaluator without additional context.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Dict, List, Optional, Union
12
+
13
+ from pydantic import Field
14
+
15
+ from openenv.core.env_server import Action, Observation, State
16
+
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Sub-models: nested inside SevZeroObservation
20
+ # ---------------------------------------------------------------------------
21
+
22
+
23
+ class CircuitBreakerInfo(dict):
24
+ """Maps dependency name -> breaker state ('CLOSED' | 'OPEN' | 'HALF_OPEN')."""
25
+
26
+
27
+ class ServiceInfo(object):
28
+ """Per-service observable state — declared as plain dict in observation for
29
+ JSON-serialisability; structured via ServiceInfoModel for validation."""
30
+
31
+
32
+ class ServiceInfoModel:
33
+ """Pydantic model for a single service's metrics (used internally)."""
34
+
35
+
36
+ from pydantic import BaseModel
37
+
38
+
39
+ class ServiceInfoModel(BaseModel):
40
+ """
41
+ All observable per-service metrics, ordered by SRE triage priority:
42
+ symptoms first, traffic second, saturation third, context last.
43
+ """
44
+
45
+ # Identity
46
+ id: str = Field(description="Service identifier, e.g. 'payment-service'")
47
+ layer: str = Field(
48
+ description="Service layer: 'edge' | 'domain' | 'infra' | 'cross-cutting'"
49
+ )
50
+ status: str = Field(
51
+ description="Aggregate health: 'healthy' | 'degraded' | 'critical' | 'down'"
52
+ )
53
+
54
+ # --- Symptoms (error + latency) ---
55
+ error_rate: float = Field(
56
+ description="Fraction of requests failing this tick (0.0–1.0)"
57
+ )
58
+ latency_p50_ms: float = Field(description="Median request latency in milliseconds")
59
+ latency_p95_ms: float = Field(description="95th-percentile latency in milliseconds")
60
+ latency_p99_ms: float = Field(description="99th-percentile latency in milliseconds")
61
+
62
+ # --- Traffic ---
63
+ throughput_rps: float = Field(
64
+ description="Successful requests served per tick"
65
+ )
66
+
67
+ # --- Saturation ---
68
+ cpu_pct: float = Field(description="CPU utilisation 0–100")
69
+ memory_pct: float = Field(description="Memory utilisation 0–100")
70
+ connection_pool_usage_pct: float = Field(
71
+ description="DB connection pool saturation 0–100; high = I/O bottleneck"
72
+ )
73
+
74
+ # --- Deployment context ---
75
+ replicas: int = Field(description="Number of running replicas")
76
+ version: str = Field(description="Currently deployed version tag")
77
+ previous_version: Optional[str] = Field(
78
+ default=None,
79
+ description="Previous version available for rollback; null if never changed",
80
+ )
81
+
82
+ # --- Dependency graph ---
83
+ depends_on: List[str] = Field(
84
+ default_factory=list,
85
+ description="Direct service dependencies (downstream calls)",
86
+ )
87
+ circuit_breakers: Dict[str, str] = Field(
88
+ default_factory=dict,
89
+ description=(
90
+ "Per-dependency circuit breaker state. "
91
+ "Keys are dependency IDs; values are 'CLOSED' | 'OPEN' | 'HALF_OPEN'."
92
+ ),
93
+ )
94
+
95
+
96
+ class AlertInfo(BaseModel):
97
+ """A structured active alert, ordered by severity."""
98
+
99
+ severity: str = Field(description="'critical' | 'warning' | 'info'")
100
+ service: str = Field(description="Service ID that triggered the alert")
101
+ type: str = Field(
102
+ description=(
103
+ "Alert category: 'error_rate_high' | 'latency_high' | "
104
+ "'circuit_breaker_open' | 'connection_pool_saturated' | "
105
+ "'memory_high' | 'cpu_high' | 'service_down'"
106
+ )
107
+ )
108
+ message: str = Field(description="Human-readable alert description with metric values")
109
+ first_seen_tick: int = Field(description="Tick at which this alert first fired")
110
+
111
+
112
+ class DeployInfo(BaseModel):
113
+ """A recent deployment event visible in the observation."""
114
+
115
+ service: str = Field(description="Service that was deployed")
116
+ version: str = Field(description="New version deployed")
117
+ ticks_ago: int = Field(description="How many ticks ago the deploy happened")
118
+
119
+
120
+ class ActionRecord(BaseModel):
121
+ """A previously taken action, shown in the observation for agent context."""
122
+
123
+ tick: int = Field(description="Tick at which the action was executed")
124
+ action: str = Field(description="Action type, e.g. 'restart_service'")
125
+ target: Optional[str] = Field(default=None, description="Primary target service/resource")
126
+ success: bool = Field(description="Whether the action completed successfully")
127
+ note: Optional[str] = Field(
128
+ default=None,
129
+ description="Extra context, e.g. 'service already healthy' or error reason",
130
+ )
131
+
132
+
133
+ class LegalAction(BaseModel):
134
+ """One type of action the agent is currently allowed to take."""
135
+
136
+ action_type: str = Field(
137
+ description=(
138
+ "One of: inspect_logs | inspect_metrics | inspect_traces | "
139
+ "restart_service | rollback_service | scale_service | tune_config | "
140
+ "clear_cache | rebalance_traffic | pause_job | noop"
141
+ )
142
+ )
143
+ valid_targets: List[str] = Field(
144
+ description="Service IDs (or other resource names) this action can target right now"
145
+ )
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # Top-level OpenEnv models
150
+ # ---------------------------------------------------------------------------
151
+
152
+
153
+ class SevZeroAction(Action):
154
+ """
155
+ An action the agent takes in SevZero.
156
+
157
+ Choose exactly one action_type and provide the required params for it:
158
+
159
+ inspect_logs(service_id) -> logs: str in next observation
160
+ inspect_metrics(service_id) -> metric_history in next observation
161
+ inspect_traces(service_id) -> traces in next observation
162
+ restart_service(service_id) -> restarts pod; 1-2 tick delay
163
+ rollback_service(service_id) -> reverts to previous_version; 2-3 tick delay
164
+ scale_service(service_id, replicas=N) -> adjusts replica count; 2-4 tick delay
165
+ tune_config(service_id, key, value) -> updates config param; 1 tick delay
166
+ clear_cache(cache_name) -> flushes cache; 1 tick delay
167
+ rebalance_traffic(from_region, to_region, pct) -> shifts traffic; 2-3 tick delay
168
+ pause_job(job_name) -> pauses background job; 1 tick delay
169
+ noop() -> wait and observe; 0 ticks
170
+ """
171
+
172
+ action_type: str = Field(
173
+ description=(
174
+ "Which operation to perform. Must be one of the 11 action types. "
175
+ "Must appear in legal_actions from the previous observation."
176
+ )
177
+ )
178
+ params: Dict[str, Any] = Field(
179
+ default_factory=dict,
180
+ description=(
181
+ "Action parameters. Examples: "
182
+ "{'service_id': 'payment-service'}, "
183
+ "{'service_id': 'payment-service', 'replicas': 4}, "
184
+ "{'service_id': 'payment-service', 'key': 'timeout_ms', 'value': 2000}"
185
+ ),
186
+ )
187
+
188
+
189
+ class SevZeroObservation(Observation):
190
+ """
191
+ Full observation returned by reset() and step().
192
+
193
+ Fields are ordered by SRE triage priority: incident summary first,
194
+ then per-service metrics, then alerts, then context, then agent state.
195
+
196
+ The `done` and `reward` fields are inherited from Observation base.
197
+ """
198
+
199
+ # --- Episode context ---
200
+ tick: int = Field(default=0, description="Current simulation tick (0-indexed)")
201
+ episode_id: Optional[str] = Field(
202
+ default=None, description="Unique ID for this episode"
203
+ )
204
+ task_id: str = Field(
205
+ default="easy",
206
+ description="Which task is running: 'easy' | 'medium' | 'hard'",
207
+ )
208
+ status: str = Field(
209
+ default="playing",
210
+ description=(
211
+ "Episode status: 'playing' | 'resolved' (all SLOs met) | "
212
+ "'failed' (system collapse) | 'timeout' (max steps exceeded)"
213
+ ),
214
+ )
215
+ max_steps: int = Field(
216
+ default=10, description="Step budget for this task (Easy=10, Medium=20, Hard=50)"
217
+ )
218
+
219
+ # --- Health summary ---
220
+ global_slo_score: float = Field(
221
+ default=0.0,
222
+ description="Fraction of services currently meeting all SLO targets (0.0–1.0)",
223
+ )
224
+ observation_summary: str = Field(
225
+ default="",
226
+ description=(
227
+ "One-sentence natural-language summary of the current situation. "
228
+ "Read this first — it gives you the critical context for your next action."
229
+ ),
230
+ )
231
+
232
+ # --- Per-service state ---
233
+ services: List[Dict[str, Any]] = Field(
234
+ default_factory=list,
235
+ description=(
236
+ "Full state for every service in the cluster. "
237
+ "See ServiceInfoModel for field definitions."
238
+ ),
239
+ )
240
+
241
+ # --- Active alerts ---
242
+ alerts: List[Dict[str, Any]] = Field(
243
+ default_factory=list,
244
+ description="Active alerts sorted by severity (critical first). See AlertInfo.",
245
+ )
246
+
247
+ # --- Context ---
248
+ recent_deploys: List[Dict[str, Any]] = Field(
249
+ default_factory=list,
250
+ description="Deployments in the last 10 ticks. Correlate with error onset.",
251
+ )
252
+ actions_taken: List[Dict[str, Any]] = Field(
253
+ default_factory=list,
254
+ description="Last 10 actions taken in this episode, for agent context.",
255
+ )
256
+
257
+ # --- Action space ---
258
+ legal_actions: List[Dict[str, Any]] = Field(
259
+ default_factory=list,
260
+ description=(
261
+ "Exactly what actions are available right now with valid targets. "
262
+ "Only use actions listed here. Invalid actions return a -0.5 penalty."
263
+ ),
264
+ )
265
+
266
+ # --- Diagnostic output from inspect_* actions ---
267
+ logs: Optional[str] = Field(
268
+ default=None,
269
+ description="Log output from the most recent inspect_logs action, if any.",
270
+ )
271
+ metric_history: Optional[List[Dict[str, Any]]] = Field(
272
+ default=None,
273
+ description="Per-tick metric history from the most recent inspect_metrics action.",
274
+ )
275
+ traces: Optional[Dict[str, Any]] = Field(
276
+ default=None,
277
+ description="Distributed trace from the most recent inspect_traces action.",
278
+ )
279
+
280
+
281
+ class SevZeroState(State):
282
+ """
283
+ Episode metadata returned by the state property.
284
+ `episode_id` and `step_count` are inherited from State base.
285
+ """
286
+
287
+ task_id: str = Field(default="easy", description="Which task: 'easy' | 'medium' | 'hard'")
288
+ seed: Optional[int] = Field(
289
+ default=None, description="Seed used for this episode (for reproducibility)"
290
+ )
291
+ global_slo_score: float = Field(
292
+ default=0.0, description="Current fraction of services meeting SLO targets"
293
+ )
294
+ terminated: bool = Field(
295
+ default=False, description="Whether the episode has ended for any reason"
296
+ )
297
+ termination_reason: Optional[str] = Field(
298
+ default=None,
299
+ description="Why the episode ended: 'resolved' | 'failed' | 'timeout' | None",
300
+ )
openenv.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ name: sevzero
2
+ version: "1.0.0"
3
+ description: "SRE Incident Response Environment — an autonomous on-call SRE managing a microservice cluster undergoing cascading failures"
4
+ tags:
5
+ - openenv
6
+ - sre
7
+ - incident-response
8
+ - reinforcement-learning
9
+ - microservices
10
+ - agentic
pyproject.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "sevzero"
3
+ version = "1.0.0"
4
+ description = "SRE Incident Response Environment for OpenEnv"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "openenv-core>=0.2.2",
9
+ "fastapi>=0.104.0",
10
+ "uvicorn>=0.24.0",
11
+ "pydantic>=2.0.0",
12
+ "openai>=1.0.0",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ dev = [
17
+ "pytest>=7.0.0",
18
+ "httpx>=0.24.0",
19
+ ]
20
+
21
+ [build-system]
22
+ requires = ["hatchling"]
23
+ build-backend = "hatchling.build"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["server"]
27
+
28
+ [tool.uv]
29
+ dev-dependencies = [
30
+ "pytest>=7.0.0",
31
+ "httpx>=0.24.0",
32
+ ]
sdk_info.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === Observation fields ===
2
+ done: annotation=bool required=False default=False description='Whether the episode has terminated'
3
+ reward: annotation=Union[bool, int, float, NoneType] required=False default=None description='Reward signal from the last action'
4
+ metadata: annotation=Dict[str, Any] required=False default_factory=dict description='Additional metadata for the observation'
5
+
6
+ === State fields ===
7
+ episode_id: annotation=Union[str, NoneType] required=False default=None description='Unique identifier for the current episode'
8
+ step_count: annotation=int required=False default=0 description='Number of steps taken in the current episode' metadata=[Ge(ge=0)]
9
+
10
+ === Action fields ===
11
+ metadata: annotation=Dict[str, Any] required=False default_factory=dict description='Additional metadata for the action'
12
+
13
+ === Environment methods ===
14
+ _apply_rubric(self, action: ~ActT, observation: ~ObsT) -> float
15
+ _apply_rubric_async(self, action: ~ActT, observation: ~ObsT) -> float
16
+ _apply_transform(self, observation: ~ObsT) -> ~ObsT
17
+ _reset_rubric(self) -> None
18
+ _reset_rubric_async(self) -> None
19
+ close(self) -> None
20
+ get_metadata(self) -> openenv.core.env_server.types.EnvironmentMetadata
21
+ reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any) -> ~ObsT
22
+ reset_async(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any) -> ~ObsT
23
+ step(self, action: ~ActT, timeout_s: Optional[float] = None, **kwargs: Any) -> ~ObsT
24
+ step_async(self, action: ~ActT, timeout_s: Optional[float] = None, **kwargs: Any) -> ~ObsT
server/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """server/__init__.py — marks server/ as a Python package."""
server/failures.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server/failures.py — 8 failure types with injection logic and metric evolution patterns.
3
+
4
+ Each failure type has:
5
+ - A distinctive metric temporal shape (how metrics evolve per tick)
6
+ - Config error subtypes (startup vs runtime)
7
+ - Weighted distribution matching real-world incident data
8
+
9
+ Sources: Google SRE postmortems, Netflix Hystrix, AWS incident reports.
10
+ See Docs/DataResearch.md for full citation.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import random
16
+ from dataclasses import dataclass, field
17
+ from enum import Enum
18
+ from typing import Dict, List, Optional, Tuple
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Failure taxonomy
23
+ # ---------------------------------------------------------------------------
24
+
25
+
26
+ class FailureType(str, Enum):
27
+ CRASH = "crash"
28
+ BAD_DEPLOY = "bad_deploy"
29
+ CONFIG_STARTUP = "config_startup" # Service can't boot
30
+ CONFIG_RUNTIME = "config_runtime" # Service runs but specific paths fail
31
+ CASCADING_LATENCY = "cascading_latency"
32
+ RESOURCE_LEAK = "resource_leak"
33
+ DB_DEGRADATION = "db_degradation"
34
+ CACHE_FAILURE = "cache_failure"
35
+ NETWORK_ERROR = "network_error"
36
+
37
+
38
+ # Weighted distribution matching Google empirical incident data
39
+ # config=32%, deploy=25%, cascade=15%, crash=10%, leak=8%, DB=5%, cache=3%, network=2%
40
+ _FAILURE_WEIGHTS: Dict[FailureType, float] = {
41
+ FailureType.CONFIG_STARTUP: 0.16,
42
+ FailureType.CONFIG_RUNTIME: 0.16,
43
+ FailureType.BAD_DEPLOY: 0.25,
44
+ FailureType.CASCADING_LATENCY: 0.15,
45
+ FailureType.CRASH: 0.10,
46
+ FailureType.RESOURCE_LEAK: 0.08,
47
+ FailureType.DB_DEGRADATION: 0.05,
48
+ FailureType.CACHE_FAILURE: 0.03,
49
+ FailureType.NETWORK_ERROR: 0.02,
50
+ }
51
+
52
+ # For multi-root incidents: avoid unlikely combinations
53
+ _INCOMPATIBLE_PAIRS = {
54
+ (FailureType.NETWORK_ERROR, FailureType.NETWORK_ERROR), # Two network errors is unrealistic
55
+ (FailureType.CACHE_FAILURE, FailureType.CACHE_FAILURE), # Two cache failures is unrealistic
56
+ }
57
+
58
+
59
+ @dataclass
60
+ class FailureSpec:
61
+ """Describes a single injected failure and its evolution parameters."""
62
+
63
+ service_id: str
64
+ failure_type: FailureType
65
+
66
+ # Error rates at various stages (used by metric evolution)
67
+ base_error_rate: float = 0.0 # Healthy baseline
68
+ peak_error_rate: float = 0.0 # At full failure
69
+ onset_ticks: int = 1 # Ticks to reach peak (1=instant, 5=gradual)
70
+
71
+ # Latency impact at peak
72
+ latency_multiplier: float = 1.0 # How much p99 multiplies at peak
73
+
74
+ # Resource impact at peak
75
+ cpu_impact: float = 0.0 # CPU increase (0–1)
76
+ memory_impact: float = 0.0 # Memory increase per tick (for leaks)
77
+ pool_saturation: float = 0.0 # Connection pool impact
78
+
79
+ # Config error subtype metadata
80
+ broken_config_key: Optional[str] = None # Which config key is wrong
81
+ broken_config_value: Optional[str] = None # What the wrong value is
82
+
83
+ # Deployment metadata (for bad_deploy)
84
+ bad_version: Optional[str] = None
85
+ good_version: Optional[str] = None
86
+
87
+ # Network error metadata
88
+ affected_region: Optional[str] = None
89
+
90
+
91
+ # ---------------------------------------------------------------------------
92
+ # Failure selection
93
+ # ---------------------------------------------------------------------------
94
+
95
+
96
+ def select_failure_type(
97
+ rng: random.Random,
98
+ exclude: Optional[List[FailureType]] = None,
99
+ ) -> FailureType:
100
+ """Sample a failure type from the empirically-weighted distribution."""
101
+ population = list(_FAILURE_WEIGHTS.keys())
102
+ weights = [_FAILURE_WEIGHTS[f] for f in population]
103
+
104
+ # Remove excluded types
105
+ if exclude:
106
+ filtered = [(f, w) for f, w in zip(population, weights) if f not in exclude]
107
+ if filtered:
108
+ population, weights = zip(*filtered)
109
+ population, weights = list(population), list(weights)
110
+
111
+ return rng.choices(population, weights=weights, k=1)[0]
112
+
113
+
114
+ def select_multi_root_failures(
115
+ rng: random.Random, count: int = 2
116
+ ) -> List[FailureType]:
117
+ """Select multiple failure types with incompatibility constraints."""
118
+ selected: List[FailureType] = []
119
+ for _ in range(count):
120
+ exclude = selected[:]
121
+ # Also exclude incompatible pairs
122
+ for s in selected:
123
+ for a, b in _INCOMPATIBLE_PAIRS:
124
+ if s == a:
125
+ exclude.append(b)
126
+ elif s == b:
127
+ exclude.append(a)
128
+ ft = select_failure_type(rng, exclude=exclude)
129
+ selected.append(ft)
130
+ return selected
131
+
132
+
133
+ # ---------------------------------------------------------------------------
134
+ # Failure specification factories
135
+ # ---------------------------------------------------------------------------
136
+
137
+
138
+ def make_crash_spec(service_id: str, rng: random.Random) -> FailureSpec:
139
+ """Service Crash: sudden 5xx spike then drop (service is dead)."""
140
+ return FailureSpec(
141
+ service_id=service_id,
142
+ failure_type=FailureType.CRASH,
143
+ base_error_rate=0.0,
144
+ peak_error_rate=rng.uniform(0.85, 1.0),
145
+ onset_ticks=1, # Instant
146
+ latency_multiplier=0.1, # Latency drops (fast fails, no waiting)
147
+ cpu_impact=0.0, # CPU near zero (process dead)
148
+ memory_impact=0.0,
149
+ )
150
+
151
+
152
+ def make_bad_deploy_spec(service_id: str, rng: random.Random) -> FailureSpec:
153
+ """Bad Deployment: step-function error increase after version change."""
154
+ return FailureSpec(
155
+ service_id=service_id,
156
+ failure_type=FailureType.BAD_DEPLOY,
157
+ base_error_rate=0.0,
158
+ peak_error_rate=rng.uniform(0.30, 0.70),
159
+ onset_ticks=1, # Step function — appears at deploy tick
160
+ latency_multiplier=rng.uniform(1.5, 3.0),
161
+ cpu_impact=rng.uniform(0.1, 0.3),
162
+ memory_impact=rng.uniform(0.05, 0.15),
163
+ bad_version="v" + str(rng.randint(2, 9)) + "." + str(rng.randint(0, 9)) + "." + str(rng.randint(1, 9)),
164
+ good_version="v1.0.0",
165
+ )
166
+
167
+
168
+ def make_config_startup_spec(service_id: str, rng: random.Random) -> FailureSpec:
169
+ """Config Error (Startup): service can't boot — zero traffic, health checks fail."""
170
+ config_keys = ["db_password", "db_host", "api_endpoint", "env_var", "config_file"]
171
+ return FailureSpec(
172
+ service_id=service_id,
173
+ failure_type=FailureType.CONFIG_STARTUP,
174
+ base_error_rate=0.0,
175
+ peak_error_rate=1.0, # 100% — service is completely down
176
+ onset_ticks=1,
177
+ latency_multiplier=0.0, # No latency, no traffic
178
+ cpu_impact=-0.9, # CPU near zero (process exited immediately)
179
+ memory_impact=-0.9,
180
+ broken_config_key=rng.choice(config_keys),
181
+ broken_config_value="WRONG_VALUE",
182
+ )
183
+
184
+
185
+ def make_config_runtime_spec(service_id: str, rng: random.Random) -> FailureSpec:
186
+ """Config Error (Runtime): service runs but specific code paths fail."""
187
+ config_keys = ["api_endpoint", "feature_flag", "timeout_ms", "retry_max"]
188
+ return FailureSpec(
189
+ service_id=service_id,
190
+ failure_type=FailureType.CONFIG_RUNTIME,
191
+ base_error_rate=0.0,
192
+ peak_error_rate=rng.uniform(0.20, 0.60),
193
+ onset_ticks=1,
194
+ latency_multiplier=rng.uniform(1.2, 2.0),
195
+ cpu_impact=0.0, # Normal resource usage
196
+ memory_impact=0.0,
197
+ broken_config_key=rng.choice(config_keys),
198
+ broken_config_value="MISCONFIGURED",
199
+ )
200
+
201
+
202
+ def make_cascading_latency_spec(service_id: str, rng: random.Random) -> FailureSpec:
203
+ """
204
+ Cascading Latency: gradual latency ramp → thread pool exhaustion.
205
+ KEY signature: p99 ramps BEFORE errors appear. CPU rises from blocked threads.
206
+ """
207
+ return FailureSpec(
208
+ service_id=service_id,
209
+ failure_type=FailureType.CASCADING_LATENCY,
210
+ base_error_rate=0.0,
211
+ peak_error_rate=rng.uniform(0.40, 0.85),
212
+ onset_ticks=rng.randint(3, 6), # Gradual ramp
213
+ latency_multiplier=rng.uniform(8.0, 20.0),
214
+ cpu_impact=rng.uniform(0.30, 0.60), # Rising CPU from blocked threads
215
+ memory_impact=rng.uniform(0.10, 0.25),
216
+ )
217
+
218
+
219
+ def make_resource_leak_spec(service_id: str, rng: random.Random) -> FailureSpec:
220
+ """Resource Leak: steady memory/CPU climb; sawtooth pattern on restarts."""
221
+ return FailureSpec(
222
+ service_id=service_id,
223
+ failure_type=FailureType.RESOURCE_LEAK,
224
+ base_error_rate=0.0,
225
+ peak_error_rate=rng.uniform(0.20, 0.50),
226
+ onset_ticks=rng.randint(5, 10), # Slow burn
227
+ latency_multiplier=rng.uniform(2.0, 5.0),
228
+ cpu_impact=0.05, # Grows per tick (applied in evolution)
229
+ memory_impact=0.06, # LINEAR RAMP — key signature
230
+ )
231
+
232
+
233
+ def make_db_degradation_spec(service_id: str, rng: random.Random) -> FailureSpec:
234
+ """DB Degradation: rising DB latency, pool saturation, app CPU paradoxically LOW."""
235
+ return FailureSpec(
236
+ service_id=service_id,
237
+ failure_type=FailureType.DB_DEGRADATION,
238
+ base_error_rate=0.0,
239
+ peak_error_rate=rng.uniform(0.30, 0.70),
240
+ onset_ticks=rng.randint(2, 4),
241
+ latency_multiplier=rng.uniform(5.0, 15.0),
242
+ cpu_impact=-0.2, # PARADOXICALLY LOW (waiting on I/O)
243
+ memory_impact=0.05,
244
+ pool_saturation=0.90, # Connection pool hits 90%+
245
+ )
246
+
247
+
248
+ def make_cache_failure_spec(service_id: str, rng: random.Random) -> FailureSpec:
249
+ """Cache Failure: hit-rate cliff → backend QPS 10-50x spike → DB overload."""
250
+ return FailureSpec(
251
+ service_id=service_id,
252
+ failure_type=FailureType.CACHE_FAILURE,
253
+ base_error_rate=0.0,
254
+ peak_error_rate=rng.uniform(0.20, 0.50),
255
+ onset_ticks=1, # CLIFF — simultaneous, not gradual
256
+ latency_multiplier=rng.uniform(3.0, 8.0),
257
+ cpu_impact=0.20,
258
+ memory_impact=0.0,
259
+ )
260
+
261
+
262
+ def make_network_error_spec(service_id: str, rng: random.Random, region: str = "us-east-1") -> FailureSpec:
263
+ """Network/Routing Error: connection failures affecting all services to this region."""
264
+ return FailureSpec(
265
+ service_id=service_id,
266
+ failure_type=FailureType.NETWORK_ERROR,
267
+ base_error_rate=0.0,
268
+ peak_error_rate=rng.uniform(0.80, 1.0),
269
+ onset_ticks=1, # Simultaneous, not hop-by-hop
270
+ latency_multiplier=0.2, # Timeout values — fixed high, then drop
271
+ cpu_impact=-0.3, # Low CPU (nothing getting through)
272
+ memory_impact=0.0,
273
+ affected_region=region,
274
+ )
275
+
276
+
277
+ _SPEC_FACTORIES = {
278
+ FailureType.CRASH: make_crash_spec,
279
+ FailureType.BAD_DEPLOY: make_bad_deploy_spec,
280
+ FailureType.CONFIG_STARTUP: make_config_startup_spec,
281
+ FailureType.CONFIG_RUNTIME: make_config_runtime_spec,
282
+ FailureType.CASCADING_LATENCY: make_cascading_latency_spec,
283
+ FailureType.RESOURCE_LEAK: make_resource_leak_spec,
284
+ FailureType.DB_DEGRADATION: make_db_degradation_spec,
285
+ FailureType.CACHE_FAILURE: make_cache_failure_spec,
286
+ FailureType.NETWORK_ERROR: make_network_error_spec,
287
+ }
288
+
289
+
290
+ def make_failure_spec(
291
+ service_id: str,
292
+ failure_type: FailureType,
293
+ rng: random.Random,
294
+ **kwargs,
295
+ ) -> FailureSpec:
296
+ """Create a FailureSpec for the given service and failure type."""
297
+ factory = _SPEC_FACTORIES[failure_type]
298
+ return factory(service_id, rng, **kwargs)
299
+
300
+
301
+ # ---------------------------------------------------------------------------
302
+ # Metric evolution: per-type temporal shapes
303
+ # ---------------------------------------------------------------------------
304
+
305
+
306
+ def compute_failure_magnitude(spec: FailureSpec, ticks_since_failure: int) -> float:
307
+ """
308
+ Return a 0.0–1.0 magnitude factor for how fully the failure has manifested.
309
+ - Instant failures (onset_ticks=1): full magnitude from tick 1
310
+ - Gradual failures: linear ramp over onset_ticks
311
+ - Resource leaks: continues growing after onset (handled separately)
312
+ """
313
+ if spec.onset_ticks <= 1:
314
+ return 1.0
315
+ return min(1.0, ticks_since_failure / spec.onset_ticks)
316
+
317
+
318
+ def apply_failure_to_metrics(
319
+ spec: FailureSpec,
320
+ ticks_since_failure: int,
321
+ base_error_rate: float,
322
+ base_p99_ms: float,
323
+ base_cpu: float,
324
+ base_memory: float,
325
+ base_pool: float,
326
+ rng: random.Random,
327
+ ) -> Tuple[float, float, float, float, float]:
328
+ """
329
+ Apply failure evolution to metrics.
330
+ Returns: (error_rate, p99_ms, cpu_pct, memory_pct, pool_pct)
331
+
332
+ Each failure type produces a DISTINCTIVE temporal shape:
333
+ - crash: instant spike → drop (service dead)
334
+ - bad_deploy: step function up at deploy tick
335
+ - config_startup: 100% error, zero traffic
336
+ - config_runtime: partial errors on affected paths
337
+ - cascading_latency: p99 ramps BEFORE errors (early warning)
338
+ - resource_leak: memory linear ramp, sawtooth CPU
339
+ - db_degradation: pool saturation, CPU paradoxically LOW
340
+ - cache_failure: cliff drop simultaneous
341
+ - network_error: cliff, then fixed-high timeout values
342
+ """
343
+ mag = compute_failure_magnitude(spec, ticks_since_failure)
344
+
345
+ # Add natural stochastic variance (±5%) — Bernoulli trial model
346
+ noise = rng.uniform(-0.03, 0.03)
347
+
348
+ ft = spec.failure_type
349
+
350
+ if ft == FailureType.CRASH:
351
+ error_rate = spec.peak_error_rate * mag + noise
352
+ p99_ms = base_p99_ms * 0.1 * mag + base_p99_ms * (1 - mag) # Drops fast
353
+ cpu_pct = max(0.0, base_cpu * (1 - 0.9 * mag))
354
+ memory_pct = base_memory
355
+ pool_pct = base_pool
356
+
357
+ elif ft == FailureType.BAD_DEPLOY:
358
+ error_rate = spec.peak_error_rate * mag + noise
359
+ p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * mag)
360
+ cpu_pct = min(100.0, base_cpu * (1 + spec.cpu_impact * mag))
361
+ memory_pct = min(100.0, base_memory * (1 + spec.memory_impact * mag))
362
+ pool_pct = base_pool
363
+
364
+ elif ft == FailureType.CONFIG_STARTUP:
365
+ error_rate = 1.0 # Always 100% — service won't start
366
+ p99_ms = 0.0 # No traffic = no latency
367
+ cpu_pct = max(0.0, base_cpu * 0.02) # Near zero
368
+ memory_pct = max(0.0, base_memory * 0.02)
369
+ pool_pct = 0.0
370
+
371
+ elif ft == FailureType.CONFIG_RUNTIME:
372
+ error_rate = spec.peak_error_rate * mag + noise
373
+ p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * mag)
374
+ cpu_pct = base_cpu # Normal — only specific paths fail
375
+ memory_pct = base_memory
376
+ pool_pct = base_pool
377
+
378
+ elif ft == FailureType.CASCADING_LATENCY:
379
+ # p99 ramps BEFORE errors — the key diagnostic signature
380
+ latency_onset_fraction = min(1.0, ticks_since_failure / max(1, spec.onset_ticks - 1))
381
+ error_onset_fraction = min(1.0, max(0.0, (ticks_since_failure - 1) / spec.onset_ticks))
382
+
383
+ error_rate = spec.peak_error_rate * error_onset_fraction + noise
384
+ p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * latency_onset_fraction)
385
+ cpu_pct = min(100.0, base_cpu * (1 + spec.cpu_impact * latency_onset_fraction))
386
+ memory_pct = min(100.0, base_memory * (1 + spec.memory_impact * latency_onset_fraction))
387
+ pool_pct = base_pool
388
+
389
+ elif ft == FailureType.RESOURCE_LEAK:
390
+ # Memory: LINEAR RAMP to limit (key signature)
391
+ # CPU: Growing GC thrash
392
+ leak_fraction = min(1.0, ticks_since_failure * 0.08) # ~12 ticks to peak
393
+ error_rate = spec.peak_error_rate * min(1.0, leak_fraction * 1.5) + noise
394
+ p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * leak_fraction)
395
+ cpu_pct = min(100.0, base_cpu * (1 + leak_fraction * 0.8)) # GC pressure
396
+ memory_pct = min(100.0, base_memory + leak_fraction * (100 - base_memory))
397
+ pool_pct = base_pool
398
+
399
+ elif ft == FailureType.DB_DEGRADATION:
400
+ error_rate = spec.peak_error_rate * mag + noise
401
+ p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * mag)
402
+ # CPU paradoxically LOW — waiting on I/O, not computing
403
+ cpu_pct = max(5.0, base_cpu * (1 + spec.cpu_impact * mag))
404
+ memory_pct = min(100.0, base_memory * (1 + spec.memory_impact * mag))
405
+ pool_pct = min(100.0, base_pool + spec.pool_saturation * mag * 100)
406
+
407
+ elif ft == FailureType.CACHE_FAILURE:
408
+ # CLIFF: simultaneous, not gradual (onset_ticks=1)
409
+ error_rate = spec.peak_error_rate * mag + noise
410
+ p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * mag)
411
+ cpu_pct = min(100.0, base_cpu * (1 + spec.cpu_impact * mag))
412
+ memory_pct = base_memory
413
+ pool_pct = base_pool
414
+
415
+ elif ft == FailureType.NETWORK_ERROR:
416
+ # Cliff: all fails simultaneously; latency = timeout values then 0
417
+ error_rate = spec.peak_error_rate * mag + noise
418
+ # Latency spikes to timeout then drops (nothing gets through)
419
+ p99_ms = base_p99_ms * 10.0 * max(0.1, 1 - ticks_since_failure * 0.3)
420
+ cpu_pct = max(2.0, base_cpu * (1 + spec.cpu_impact * mag))
421
+ memory_pct = base_memory
422
+ pool_pct = base_pool
423
+
424
+ else:
425
+ error_rate = base_error_rate
426
+ p99_ms = base_p99_ms
427
+ cpu_pct = base_cpu
428
+ memory_pct = base_memory
429
+ pool_pct = base_pool
430
+
431
+ return (
432
+ max(0.0, min(1.0, error_rate)),
433
+ max(1.0, p99_ms),
434
+ max(0.0, min(100.0, cpu_pct)),
435
+ max(0.0, min(100.0, memory_pct)),
436
+ max(0.0, min(100.0, pool_pct)),
437
+ )
server/graph.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server/graph.py — Service dependency graph generation.
3
+
4
+ Builds layered tree-like DAGs matching real production microservice topologies,
5
+ grounded in Alibaba trace analysis (depth ~3, 5% hotspot services, sparse edges).
6
+
7
+ Design principles:
8
+ - Services chosen from realistic role pools (not generic names)
9
+ - Layered: edge → identity → business → infra; edge → leaf dependencies
10
+ - Dependency edges are directed (A depends_on B = A calls B)
11
+ - ~5% of services are high-in-degree hotspots (shared cache, DB, auth)
12
+ - Sparse and tree-like; most nodes have in-degree 1
13
+ - Conditional edges have activation_probability < 1.0 (Easy: all 1.0)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import random
19
+ from dataclasses import dataclass, field
20
+ from typing import Dict, List, Optional, Tuple
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Service role pools (realistic names, not generic)
25
+ # ---------------------------------------------------------------------------
26
+
27
+ _EDGE_POOL = [
28
+ "api-gateway",
29
+ "graphql-gateway",
30
+ "bff-web",
31
+ "bff-mobile",
32
+ "cdn-edge",
33
+ ]
34
+
35
+ _IDENTITY_POOL = [
36
+ "auth-service",
37
+ "identity-provider",
38
+ "session-service",
39
+ "oauth-service",
40
+ "token-service",
41
+ ]
42
+
43
+ _BUSINESS_POOL = [
44
+ "order-service",
45
+ "payment-service",
46
+ "inventory-service",
47
+ "catalog-service",
48
+ "pricing-service",
49
+ "cart-service",
50
+ "checkout-service",
51
+ "shipping-service",
52
+ "recommendation-service",
53
+ "search-service",
54
+ "review-service",
55
+ "subscription-service",
56
+ "billing-service",
57
+ "refund-service",
58
+ "notification-service",
59
+ ]
60
+
61
+ _INFRA_POOL = [
62
+ "postgres-primary",
63
+ "postgres-replica",
64
+ "redis-cache",
65
+ "redis-session",
66
+ "kafka-broker",
67
+ "elasticsearch",
68
+ "object-storage",
69
+ "config-service",
70
+ ]
71
+
72
+ _CROSS_CUTTING_POOL = [
73
+ "email-service",
74
+ "sms-service",
75
+ "metrics-collector",
76
+ "fraud-service",
77
+ "audit-service",
78
+ "feature-flags",
79
+ "rate-limiter",
80
+ ]
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Data structures
85
+ # ---------------------------------------------------------------------------
86
+
87
+
88
+ @dataclass
89
+ class ServiceNode:
90
+ """A service node in the dependency graph."""
91
+
92
+ id: str
93
+ layer: str # "edge" | "identity" | "business" | "infra" | "cross-cutting"
94
+
95
+ # Queueing theory baseline parameters (modified by failures at runtime)
96
+ base_arrival_rate: float = 100.0 # λ — requests/tick at baseline
97
+ base_service_time_local: float = 0.05 # S_local — seconds per request (local work)
98
+ thread_pool_size: int = 50 # T — max concurrent in-flight requests
99
+
100
+ # Default config (tunable by agent)
101
+ default_timeout_ms: int = 5000
102
+ default_retry_max: int = 3
103
+ default_retry_backoff: bool = False
104
+ default_circuit_breaker_threshold: float = 0.5
105
+ default_pool_size: int = 20
106
+
107
+ # Deployment defaults
108
+ default_replicas: int = 2
109
+ default_version: str = "v1.0.0"
110
+
111
+ # Whether this node is a "hotspot" (high in-degree shared infra)
112
+ is_hotspot: bool = False
113
+
114
+ # Whether this is a background-job node (can be pause_job target)
115
+ has_background_job: bool = False
116
+
117
+ # Whether this is a cache node (can be clear_cache target)
118
+ is_cache: bool = False
119
+
120
+ # Max replicas the agent can scale to
121
+ max_replicas: int = 8
122
+
123
+ # Region (for Hard mode multi-region topologies)
124
+ region: str = "us-east-1"
125
+
126
+
127
+ @dataclass
128
+ class DependencyEdge:
129
+ """A directed dependency edge: source depends on (calls) target."""
130
+
131
+ source: str # service that makes the call
132
+ target: str # service that receives the call
133
+
134
+ # Fraction of ticks this edge is active (1.0 = always; 0.2 = ~20% of ticks)
135
+ activation_probability: float = 1.0
136
+
137
+ # Edge type for documentation
138
+ edge_type: str = "sync" # "sync" | "async" | "optional"
139
+
140
+
141
+ @dataclass
142
+ class ServiceGraph:
143
+ """Complete service dependency graph for one episode."""
144
+
145
+ nodes: List[ServiceNode] = field(default_factory=list)
146
+ edges: List[DependencyEdge] = field(default_factory=list)
147
+
148
+ # Derived lookup structures (populated after build)
149
+ node_map: Dict[str, ServiceNode] = field(default_factory=dict)
150
+ adjacency: Dict[str, List[str]] = field(default_factory=dict) # source → [targets]
151
+ reverse_adjacency: Dict[str, List[str]] = field(default_factory=dict) # target → [callers]
152
+
153
+ # Metadata
154
+ difficulty: str = "easy"
155
+ has_multiple_regions: bool = False
156
+ regions: List[str] = field(default_factory=lambda: ["us-east-1"])
157
+ cache_services: List[str] = field(default_factory=list)
158
+ background_jobs: List[str] = field(default_factory=list)
159
+
160
+ def build_indices(self) -> None:
161
+ """Build lookup maps after nodes/edges are populated."""
162
+ self.node_map = {n.id: n for n in self.nodes}
163
+ self.adjacency = {n.id: [] for n in self.nodes}
164
+ self.reverse_adjacency = {n.id: [] for n in self.nodes}
165
+ for edge in self.edges:
166
+ self.adjacency[edge.source].append(edge.target)
167
+ self.reverse_adjacency[edge.target].append(edge.source)
168
+ self.cache_services = [n.id for n in self.nodes if n.is_cache]
169
+ self.background_jobs = [n.id for n in self.nodes if n.has_background_job]
170
+
171
+
172
+ # ---------------------------------------------------------------------------
173
+ # Graph generation functions
174
+ # ---------------------------------------------------------------------------
175
+
176
+
177
+ def _pick(pool: List[str], rng: random.Random, exclude: set) -> Optional[str]:
178
+ """Pick a random name from pool not already in exclude set."""
179
+ choices = [x for x in pool if x not in exclude]
180
+ if not choices:
181
+ return None
182
+ return rng.choice(choices)
183
+
184
+
185
+ def _make_node(
186
+ service_id: str,
187
+ layer: str,
188
+ is_hotspot: bool = False,
189
+ is_cache: bool = False,
190
+ has_background_job: bool = False,
191
+ arrival_rate: float = 100.0,
192
+ service_time: float = 0.05,
193
+ thread_pool: int = 50,
194
+ ) -> ServiceNode:
195
+ """Create a ServiceNode with sensible per-layer defaults."""
196
+ # Infra nodes handle more concurrency, edge nodes get more traffic
197
+ if layer == "edge":
198
+ arrival_rate = 500.0
199
+ thread_pool = 100
200
+ elif layer == "infra":
201
+ arrival_rate = 200.0
202
+ service_time = 0.02 # DBs are fast per-query
203
+ thread_pool = 30
204
+ if is_cache:
205
+ service_time = 0.001
206
+ thread_pool = 200
207
+
208
+ return ServiceNode(
209
+ id=service_id,
210
+ layer=layer,
211
+ base_arrival_rate=arrival_rate,
212
+ base_service_time_local=service_time,
213
+ thread_pool_size=thread_pool,
214
+ is_hotspot=is_hotspot,
215
+ is_cache=is_cache,
216
+ has_background_job=has_background_job,
217
+ )
218
+
219
+
220
+ def generate_easy_graph(rng: random.Random) -> ServiceGraph:
221
+ """
222
+ Easy: 3-5 services, linear chain.
223
+ api-gateway → order-service → postgres-primary
224
+ Agent must identify and fix one failing service in this simple topology.
225
+ """
226
+ graph = ServiceGraph(difficulty="easy")
227
+ used: set = set()
228
+
229
+ # Always have a gateway at the edge
230
+ gateway_id = "api-gateway"
231
+ used.add(gateway_id)
232
+
233
+ # Pick 1-2 business services
234
+ biz_count = rng.randint(1, 2)
235
+ biz_nodes = []
236
+ for _ in range(biz_count):
237
+ svc = _pick(_BUSINESS_POOL, rng, used)
238
+ if svc:
239
+ used.add(svc)
240
+ biz_nodes.append(svc)
241
+
242
+ # Always have one DB at the leaf
243
+ db_id = "postgres-primary"
244
+ used.add(db_id)
245
+
246
+ # Optionally add a cache
247
+ add_cache = rng.random() > 0.4
248
+ cache_id = "redis-cache" if add_cache else None
249
+ if cache_id:
250
+ used.add(cache_id)
251
+
252
+ # Build nodes
253
+ graph.nodes.append(_make_node(gateway_id, "edge"))
254
+ for biz in biz_nodes:
255
+ graph.nodes.append(_make_node(biz, "business"))
256
+ graph.nodes.append(
257
+ _make_node(db_id, "infra", is_hotspot=True, arrival_rate=200.0)
258
+ )
259
+ if cache_id:
260
+ graph.nodes.append(
261
+ _make_node(cache_id, "infra", is_hotspot=True, is_cache=True)
262
+ )
263
+
264
+ # Build linear dependency chain: gateway → biz[0] → biz[1]? → db
265
+ chain = [gateway_id] + biz_nodes + [db_id]
266
+ for i in range(len(chain) - 1):
267
+ graph.edges.append(DependencyEdge(source=chain[i], target=chain[i + 1]))
268
+
269
+ # If cache exists, business services call it (optional edge for realism)
270
+ if cache_id and biz_nodes:
271
+ for biz in biz_nodes:
272
+ graph.edges.append(
273
+ DependencyEdge(source=biz, target=cache_id, activation_probability=0.9)
274
+ )
275
+
276
+ graph.build_indices()
277
+ return graph
278
+
279
+
280
+ def generate_medium_graph(rng: random.Random) -> ServiceGraph:
281
+ """
282
+ Medium: 8-15 services, branching DAG.
283
+ gateway → auth + 3-4 domain services → shared DB + cache + kafka.
284
+ Agent must trace through the graph to find a root cause that's upstream
285
+ of the service showing the worst symptoms.
286
+ """
287
+ graph = ServiceGraph(difficulty="medium")
288
+ used: set = set()
289
+
290
+ # Edge layer: 1 gateway
291
+ gateway_id = "api-gateway"
292
+ used.add(gateway_id)
293
+ graph.nodes.append(_make_node(gateway_id, "edge"))
294
+
295
+ # Identity layer: auth (gateway always calls auth)
296
+ auth_id = "auth-service"
297
+ used.add(auth_id)
298
+ graph.nodes.append(_make_node(auth_id, "identity"))
299
+ graph.edges.append(DependencyEdge(source=gateway_id, target=auth_id))
300
+
301
+ # Business layer: 4-6 domain services fanning out from gateway
302
+ biz_count = rng.randint(4, 6)
303
+ biz_nodes = []
304
+ for _ in range(biz_count):
305
+ svc = _pick(_BUSINESS_POOL, rng, used)
306
+ if svc:
307
+ used.add(svc)
308
+ biz_nodes.append(svc)
309
+ graph.nodes.append(_make_node(svc, "business"))
310
+ graph.edges.append(DependencyEdge(source=gateway_id, target=svc))
311
+
312
+ # Infra layer: shared DB + cache (hotspot nodes)
313
+ db_id = "postgres-primary"
314
+ cache_id = "redis-cache"
315
+ used.update([db_id, cache_id])
316
+ graph.nodes.append(_make_node(db_id, "infra", is_hotspot=True, arrival_rate=300.0))
317
+ graph.nodes.append(_make_node(cache_id, "infra", is_hotspot=True, is_cache=True))
318
+
319
+ # Business services call the shared DB and cache
320
+ for biz in biz_nodes:
321
+ graph.edges.append(DependencyEdge(source=biz, target=db_id))
322
+ # Cache: most biz services call it, but with high-freq optional
323
+ graph.edges.append(
324
+ DependencyEdge(source=biz, target=cache_id, activation_probability=0.8)
325
+ )
326
+
327
+ # Optionally add kafka as an async edge (1-2 business services produce to it)
328
+ if rng.random() > 0.4:
329
+ kafka_id = "kafka-broker"
330
+ used.add(kafka_id)
331
+ graph.nodes.append(
332
+ _make_node(kafka_id, "infra", has_background_job=True)
333
+ )
334
+ producers = rng.sample(biz_nodes, min(2, len(biz_nodes)))
335
+ for p in producers:
336
+ graph.edges.append(
337
+ DependencyEdge(source=p, target=kafka_id, edge_type="async", activation_probability=0.6)
338
+ )
339
+
340
+ # Cross-cutting: add 1-2 optional services (fraud, notification) called by some biz
341
+ cross_count = rng.randint(1, 2)
342
+ for _ in range(cross_count):
343
+ svc = _pick(_CROSS_CUTTING_POOL, rng, used)
344
+ if svc and biz_nodes:
345
+ used.add(svc)
346
+ caller = rng.choice(biz_nodes)
347
+ graph.nodes.append(_make_node(svc, "cross-cutting"))
348
+ graph.edges.append(
349
+ DependencyEdge(source=caller, target=svc, activation_probability=0.3)
350
+ )
351
+
352
+ graph.build_indices()
353
+ return graph
354
+
355
+
356
+ def generate_hard_graph(rng: random.Random) -> ServiceGraph:
357
+ """
358
+ Hard: 15-30 services, complex multi-region DAG with hotspots,
359
+ conditional edges, multiple infra tiers, and background jobs.
360
+ Agent must manage a Sev-0 multi-root incident with conflicting mitigations.
361
+ """
362
+ graph = ServiceGraph(difficulty="hard", has_multiple_regions=True)
363
+ graph.regions = ["us-east-1", "us-west-2"]
364
+ used: set = set()
365
+
366
+ all_biz_nodes: List[str] = []
367
+
368
+ # Build per-region sub-graphs, then connect them
369
+ for region in graph.regions:
370
+ suffix = "-east" if "east" in region else "-west"
371
+
372
+ # Edge: one gateway per region
373
+ gw = f"api-gateway{suffix}"
374
+ used.add(gw)
375
+ node = _make_node(gw, "edge")
376
+ node.region = region
377
+ graph.nodes.append(node)
378
+
379
+ # Identity: auth per region
380
+ auth = f"auth-service{suffix}"
381
+ used.add(auth)
382
+ node = _make_node(auth, "identity")
383
+ node.region = region
384
+ graph.nodes.append(node)
385
+ graph.edges.append(DependencyEdge(source=gw, target=auth))
386
+
387
+ # Business: 4-6 services per region
388
+ region_biz: List[str] = []
389
+ for _ in range(rng.randint(4, 6)):
390
+ svc_base = _pick(_BUSINESS_POOL, rng, used)
391
+ if svc_base:
392
+ svc = f"{svc_base}{suffix}"
393
+ used.add(svc)
394
+ region_biz.append(svc)
395
+ node = _make_node(svc, "business")
396
+ node.region = region
397
+ graph.nodes.append(node)
398
+ graph.edges.append(DependencyEdge(source=gw, target=svc))
399
+
400
+ all_biz_nodes.extend(region_biz)
401
+
402
+ # Infra: per-region replicas (postgres-replica is a hotspot)
403
+ pg_replica = f"postgres-replica{suffix}"
404
+ redis_svc = f"redis-cache{suffix}"
405
+ used.update([pg_replica, redis_svc])
406
+ node = _make_node(pg_replica, "infra", is_hotspot=True)
407
+ node.region = region
408
+ graph.nodes.append(node)
409
+ node = _make_node(redis_svc, "infra", is_hotspot=True, is_cache=True)
410
+ node.region = region
411
+ graph.nodes.append(node)
412
+
413
+ for biz in region_biz:
414
+ graph.edges.append(DependencyEdge(source=biz, target=pg_replica))
415
+ graph.edges.append(
416
+ DependencyEdge(source=biz, target=redis_svc, activation_probability=0.85)
417
+ )
418
+
419
+ # Shared global infra (hotspots called by both regions)
420
+ pg_primary = "postgres-primary"
421
+ kafka = "kafka-broker"
422
+ config_svc = "config-service"
423
+ used.update([pg_primary, kafka, config_svc])
424
+
425
+ graph.nodes.append(_make_node(pg_primary, "infra", is_hotspot=True, arrival_rate=500.0))
426
+ graph.nodes.append(_make_node(kafka, "infra", has_background_job=True))
427
+ graph.nodes.append(_make_node(config_svc, "infra", is_hotspot=True))
428
+
429
+ # Replicas call primary (replication)
430
+ for region in graph.regions:
431
+ suffix = "-east" if "east" in region else "-west"
432
+ graph.edges.append(
433
+ DependencyEdge(source=f"postgres-replica{suffix}", target=pg_primary)
434
+ )
435
+
436
+ # Business services use kafka for async events and config-service for feature flags
437
+ for biz in all_biz_nodes:
438
+ if rng.random() > 0.5:
439
+ graph.edges.append(
440
+ DependencyEdge(source=biz, target=kafka, edge_type="async", activation_probability=0.5)
441
+ )
442
+ graph.edges.append(
443
+ DependencyEdge(source=biz, target=config_svc, activation_probability=0.2)
444
+ )
445
+
446
+ # Cross-cutting services (low-freq optional edges)
447
+ for _ in range(rng.randint(2, 3)):
448
+ svc = _pick(_CROSS_CUTTING_POOL, rng, used)
449
+ if svc and all_biz_nodes:
450
+ used.add(svc)
451
+ caller = rng.choice(all_biz_nodes)
452
+ graph.nodes.append(_make_node(svc, "cross-cutting"))
453
+ graph.edges.append(
454
+ DependencyEdge(source=caller, target=svc, activation_probability=0.25)
455
+ )
456
+
457
+ graph.build_indices()
458
+ return graph
459
+
460
+
461
+ def generate_graph(difficulty: str, rng: random.Random) -> ServiceGraph:
462
+ """Generate a service dependency graph for the given difficulty level."""
463
+ if difficulty == "easy":
464
+ return generate_easy_graph(rng)
465
+ elif difficulty == "medium":
466
+ return generate_medium_graph(rng)
467
+ elif difficulty == "hard":
468
+ return generate_hard_graph(rng)
469
+ else:
470
+ raise ValueError(f"Unknown difficulty: {difficulty!r}. Must be easy|medium|hard.")