File size: 25,940 Bytes
5fe9036
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
"""
Task: Concurrent Faults with Misleading Evidence
To add a new task, copy this file, modify the SCENARIO definition, and place it in tasks/.
The task loader will auto-discover it.
"""

from env.scenario import IncidentScenario, RequiredFix, ServiceConfig
from models import RootCauseCategory, ServiceStatus


# Scenario for the "hard" task: two independent faults are active at once.
#   1. payment-service crashes on startup after the v3.8.2 deploy (bad deploy).
#   2. cache-redis is running out of memory (memory leak); per the logs and
#      runbooks below, auth-service then falls back to slow DB lookups and
#      notification-service / api-gateway degrade behind it.
# user-service (config warnings) and db-postgres (elevated but in-limit load)
# are not flagged as root causes; presumably they are the "misleading
# evidence" of the task name.
#
# NOTE: the evidence strings contain non-ASCII characters (em dashes and
# box-drawing tree branches), so this file must be saved and read as UTF-8.
SCENARIO = IncidentScenario(
    task_id="hard",
    name="Concurrent Faults with Misleading Evidence",
    difficulty="hard",
    max_steps=35,
    incident_summary=(
        "SEV-1 incident declared at 04:00 UTC. Multiple services affected simultaneously. "
        "payment-service is completely down after a recent deploy. auth-service showing intermittent "
        "timeouts and session validation failures. notification-service queue backing up. "
        "user-service has config warnings. api-gateway showing >30% error rate across multiple "
        "endpoints. Need to identify ALL root causes and restore full system health."
    ),

    # Initial state of every service. Only payment-service and cache-redis carry
    # is_root_cause=True / fault_type.
    # NOTE(review): auth-service lists only cache-redis as a dependency although
    # its logs and traces below show a db-postgres fallback path — confirm this
    # omission is intentional.
    services={
        "api-gateway": ServiceConfig(
            status=ServiceStatus.DEGRADED, deps=["auth-service", "user-service", "payment-service"],
            version="v1.12.0", replicas=3,
        ),
        "auth-service": ServiceConfig(
            status=ServiceStatus.DEGRADED, deps=["cache-redis"],
            version="v2.14.0", replicas=2,
        ),
        "user-service": ServiceConfig(
            status=ServiceStatus.HEALTHY, deps=["db-postgres"],
            version="v4.2.1", replicas=2,
        ),
        "payment-service": ServiceConfig(
            status=ServiceStatus.DOWN, deps=["db-postgres"],
            version="v3.8.2", replicas=2, is_root_cause=True, fault_type="bad_deploy",
        ),
        "db-postgres": ServiceConfig(
            status=ServiceStatus.HEALTHY, deps=[],
            version="v15.4", replicas=1,
        ),
        "cache-redis": ServiceConfig(
            status=ServiceStatus.DEGRADED, deps=[],
            version="v7.2.4", replicas=1, is_root_cause=True, fault_type="memory_leak",
        ),
        "notification-service": ServiceConfig(
            status=ServiceStatus.DEGRADED, deps=["auth-service"],
            version="v1.5.0", replicas=1,
        ),
    },

    # Alerts listed for the start of the incident.
    initial_alerts=[
        "[ALERT SEV-1] api-gateway: error rate >30%, multiple downstream failures detected",
        "[ALERT SEV-1] payment-service: health check failing, 0/2 pods ready, CrashLoopBackOff",
        "[ALERT SEV-2] auth-service: intermittent 500 errors, session validation latency >3s",
        "[ALERT SEV-2] notification-service: email delivery queue depth >2000, processing stalled",
        "[ALERT SEV-3] user-service: config validation warning (non-critical)",
    ],

    # Per-service log lines, oldest first.
    logs={
        # Fault 1: v3.8.2 fails during bootstrap and enters CrashLoopBackOff.
        "payment-service": [
            "2026-04-06T04:00:00Z INFO  [payment-service] Deploying v3.8.2 (previous: v3.8.1)",
            "2026-04-06T04:00:01Z INFO  [payment-service] Container image pulled: registry.internal/payment-service:v3.8.2",
            "2026-04-06T04:00:02Z INFO  [payment-service] Pod payment-service-7d4f8b-xk9m2 starting...",
            "2026-04-06T04:00:03Z INFO  [payment-service] Starting health check sequence...",
            "2026-04-06T04:00:04Z INFO  [payment-service] Loading configuration from ConfigMap...",
            "2026-04-06T04:00:05Z INFO  [payment-service] Initializing payment validation module v2 (new in v3.8.2)",
            "2026-04-06T04:00:05Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
            "2026-04-06T04:00:05Z ERROR [payment-service] Stack trace:",
            "    at com.acme.payment.validator.PaymentValidatorV2.initialize(PaymentValidatorV2.java:42)",
            "    at com.acme.payment.bootstrap.ServiceBootstrap.initModules(ServiceBootstrap.java:118)",
            "    at com.acme.payment.bootstrap.ServiceBootstrap.start(ServiceBootstrap.java:55)",
            "    at com.acme.payment.Main.main(Main.java:12)",
            "2026-04-06T04:00:06Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
            "2026-04-06T04:00:06Z INFO  [payment-service] Shutdown hook triggered, cleaning up...",
            "2026-04-06T04:00:07Z INFO  [payment-service] Health check endpoint /healthz returning 503",
            "2026-04-06T04:00:10Z WARN  [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 failed readiness probe (1/3)",
            "2026-04-06T04:00:20Z WARN  [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 failed readiness probe (2/3)",
            "2026-04-06T04:00:30Z ERROR [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 marked NotReady, removed from service",
            "2026-04-06T04:00:31Z INFO  [payment-service] Kubernetes: restarting pod (CrashLoopBackOff)",
            "2026-04-06T04:00:35Z INFO  [payment-service] Starting health check sequence...",
            "2026-04-06T04:00:37Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
            "2026-04-06T04:00:37Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
            "2026-04-06T04:00:38Z INFO  [payment-service] Kubernetes: restarting pod (CrashLoopBackOff)",
            "2026-04-06T04:00:45Z INFO  [payment-service] Starting health check sequence...",
            "2026-04-06T04:00:47Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
            "2026-04-06T04:00:47Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
            "2026-04-06T04:01:00Z ERROR [payment-service] CrashLoopBackOff: backing off 60s before next restart",
            "2026-04-06T04:02:05Z INFO  [payment-service] Starting health check sequence...",
            "2026-04-06T04:02:07Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
            "2026-04-06T04:02:07Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
            "2026-04-06T04:02:10Z ERROR [payment-service] CrashLoopBackOff: backing off 120s before next restart",
        ],
        # Fault 2: memory climbs for an hour despite eviction, then writes are rejected.
        "cache-redis": [
            "2026-04-06T03:00:00Z INFO  [cache-redis] Memory usage: 2.8GB/4.0GB (70%) — within operational range",
            "2026-04-06T03:05:00Z INFO  [cache-redis] Memory usage: 2.9GB/4.0GB (72%)",
            "2026-04-06T03:10:00Z INFO  [cache-redis] Memory usage: 3.0GB/4.0GB (75%)",
            "2026-04-06T03:15:00Z INFO  [cache-redis] Memory usage: 3.1GB/4.0GB (77%)",
            "2026-04-06T03:20:00Z INFO  [cache-redis] Memory usage: 3.2GB/4.0GB (80%)",
            "2026-04-06T03:25:00Z INFO  [cache-redis] Memory usage: 3.3GB/4.0GB (82%)",
            "2026-04-06T03:30:00Z WARN  [cache-redis] Memory usage: 3.4GB/4.0GB (85%) — approaching maxmemory threshold",
            "2026-04-06T03:30:01Z INFO  [cache-redis] Eviction policy: allkeys-lru activated",
            "2026-04-06T03:30:05Z WARN  [cache-redis] Evicting 1200 keys/sec to maintain memory budget",
            "2026-04-06T03:35:00Z WARN  [cache-redis] Memory usage: 3.5GB/4.0GB (87%) despite active eviction",
            "2026-04-06T03:40:00Z WARN  [cache-redis] Memory usage: 3.6GB/4.0GB (90%)",
            "2026-04-06T03:45:00Z WARN  [cache-redis] Memory usage: 3.7GB/4.0GB (92%) despite active eviction",
            "2026-04-06T03:45:01Z WARN  [cache-redis] Eviction rate insufficient: incoming writes (2.1GB/hr) exceed eviction rate (1.5GB/hr)",
            "2026-04-06T03:45:02Z WARN  [cache-redis] Key namespace auth:session:* most affected — 60% of evictions from this prefix",
            "2026-04-06T03:50:00Z WARN  [cache-redis] Memory usage: 3.8GB/4.0GB (95%)",
            "2026-04-06T03:55:00Z ERROR [cache-redis] Memory usage: 3.82GB/4.0GB (95.5%)",
            "2026-04-06T04:00:00Z ERROR [cache-redis] Memory usage: 3.85GB/4.0GB (96%) — critical threshold",
            "2026-04-06T04:00:01Z ERROR [cache-redis] Rejecting 12% of SET commands due to memory pressure",
            "2026-04-06T04:00:02Z WARN  [cache-redis] Client auth-service reporting increased cache misses (hit ratio: 35%, normal: 90%)",
            "2026-04-06T04:00:05Z ERROR [cache-redis] Memory fragmentation ratio: 1.8 (healthy: <1.5) — possible memory leak in module",
            "2026-04-06T04:00:10Z WARN  [cache-redis] Resident memory growing despite aggressive eviction — suspect leaked allocations in Lua script engine",
            "2026-04-06T04:00:15Z ERROR [cache-redis] Rejecting 18% of SET commands due to memory pressure",
        ],
        # Symptom of fault 2: cache misses push session lookups onto the DB fallback path.
        "auth-service": [
            "2026-04-06T03:00:00Z INFO  [auth-service] Request: POST /auth/token uid=user_4421 -> cache HIT (12ms)",
            "2026-04-06T03:00:05Z INFO  [auth-service] Request: POST /auth/verify uid=user_8832 -> cache HIT (10ms)",
            "2026-04-06T03:15:00Z INFO  [auth-service] Request: POST /auth/token uid=user_3310 -> cache HIT (11ms)",
            "2026-04-06T03:30:00Z INFO  [auth-service] Request: POST /auth/token uid=user_5571 -> cache HIT (13ms)",
            "2026-04-06T03:45:00Z WARN  [auth-service] Cache miss for session sid=c9f21a — falling back to db-postgres lookup (280ms)",
            "2026-04-06T03:45:02Z INFO  [auth-service] Request: POST /auth/token uid=user_7712 -> cache HIT (14ms)",
            "2026-04-06T03:45:05Z WARN  [auth-service] Cache miss rate elevated: 45% (normal: <10%)",
            "2026-04-06T03:45:10Z WARN  [auth-service] Cache miss for session sid=d4e82b — falling back to db-postgres lookup (320ms)",
            "2026-04-06T03:50:00Z WARN  [auth-service] DB connection pool: 28/30 active (falling back to DB for most session lookups)",
            "2026-04-06T03:55:00Z WARN  [auth-service] Cache miss rate: 55% — DB fallback path overloaded",
            "2026-04-06T04:00:00Z ERROR [auth-service] Cache write rejected by redis: OOM command not allowed when used memory > maxmemory",
            "2026-04-06T04:00:01Z WARN  [auth-service] 65% of requests hitting DB fallback path — latency p99 = 3200ms",
            "2026-04-06T04:00:03Z ERROR [auth-service] Request timeout: POST /auth/verify uid=user_8832 (DB fallback overloaded)",
            "2026-04-06T04:00:05Z ERROR [auth-service] Request timeout: POST /auth/token uid=user_2209 (DB fallback overloaded)",
            "2026-04-06T04:00:08Z WARN  [auth-service] DB connection pool: 30/30 active (SATURATED)",
            "2026-04-06T04:00:10Z WARN  [auth-service] Degraded mode: session validation averaging 1800ms (SLA: 200ms)",
            "2026-04-06T04:00:15Z ERROR [auth-service] 5 request timeouts in last 60 seconds",
        ],
        # Config warnings only; every request below still returns 200.
        # NOTE(review): the deprecation warning says "will be removed in v4.0"
        # while the service runs v4.2.1 — confirm whether that is deliberate.
        "user-service": [
            "2026-04-06T03:30:00Z INFO  [user-service] Config reload triggered by configmap update",
            "2026-04-06T03:30:01Z WARN  [user-service] Config validation: feature flag 'enable_profile_v2' references unknown experiment 'profile_redesign_q2'",
            "2026-04-06T03:30:01Z WARN  [user-service] Config validation: deprecated field 'legacy_avatar_url' present — will be removed in v4.0",
            "2026-04-06T03:30:02Z INFO  [user-service] Config applied successfully (2 warnings, 0 errors)",
            "2026-04-06T03:30:03Z INFO  [user-service] All endpoints healthy, no service disruption during config reload",
            "2026-04-06T03:30:10Z INFO  [user-service] GET /users/profile uid=user_4421 -> 200 (28ms)",
            "2026-04-06T03:45:00Z INFO  [user-service] GET /users/profile uid=user_1101 -> 200 (30ms)",
            "2026-04-06T03:45:05Z INFO  [user-service] PUT /users/profile uid=user_3310 -> 200 (82ms)",
            "2026-04-06T04:00:00Z INFO  [user-service] GET /users/profile uid=user_1101 -> 200 (28ms)",
            "2026-04-06T04:00:01Z INFO  [user-service] PUT /users/profile uid=user_3310 -> 200 (95ms)",
            "2026-04-06T04:00:05Z INFO  [user-service] GET /users/settings uid=user_5571 -> 200 (26ms)",
            "2026-04-06T04:00:10Z INFO  [user-service] Health check /healthz -> 200 OK",
        ],
        # Downstream of auth-service: the queue grows while auth validation times out.
        "notification-service": [
            "2026-04-06T03:45:00Z INFO  [notification-service] Auth token validated for batch #4445 (48ms)",
            "2026-04-06T03:45:01Z INFO  [notification-service] Email batch #4445 sent successfully (15 emails)",
            "2026-04-06T04:00:00Z WARN  [notification-service] Auth token validation taking 2800ms (SLA: 500ms)",
            "2026-04-06T04:00:02Z WARN  [notification-service] Email delivery queue depth: 2400 (normal: <100)",
            "2026-04-06T04:00:05Z ERROR [notification-service] Failed to validate sender auth for notification batch #8832 — auth-service timeout",
            "2026-04-06T04:00:06Z WARN  [notification-service] Pausing email delivery until auth validation recovers",
            "2026-04-06T04:00:10Z WARN  [notification-service] Queue depth growing: 2800 pending emails",
            "2026-04-06T04:00:15Z ERROR [notification-service] Auth validation timeout for batch #8833",
            "2026-04-06T04:00:20Z WARN  [notification-service] Queue depth: 3200 — SLA breach imminent for time-sensitive notifications",
        ],
        # The gateway sees both faults: 503s from payment-service, slow/504 from auth-service.
        "api-gateway": [
            "2026-04-06T03:59:55Z INFO  [api-gateway] Route: POST /api/v2/login -> auth-service (200, 45ms)",
            "2026-04-06T03:59:58Z INFO  [api-gateway] Route: POST /api/v2/pay -> payment-service (200, 92ms)",
            "2026-04-06T04:00:01Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, connection refused)",
            "2026-04-06T04:00:02Z WARN  [api-gateway] Route: POST /api/v2/login -> auth-service (200, 1800ms) — slow",
            "2026-04-06T04:00:03Z INFO  [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 28ms)",
            "2026-04-06T04:00:05Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, connection refused)",
            "2026-04-06T04:00:06Z WARN  [api-gateway] Circuit breaker OPEN for payment-service (failures=5, threshold=5)",
            "2026-04-06T04:00:08Z ERROR [api-gateway] Route: POST /api/v2/verify -> auth-service (504, timeout after 5000ms)",
            "2026-04-06T04:00:10Z INFO  [api-gateway] Route: GET /api/v2/user/settings -> user-service (200, 25ms)",
            "2026-04-06T04:00:12Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, circuit breaker open)",
            "2026-04-06T04:00:15Z WARN  [api-gateway] Route: POST /api/v2/login -> auth-service (200, 3200ms) — very slow",
            "2026-04-06T04:00:18Z ERROR [api-gateway] Route: POST /api/v2/verify -> auth-service (504, timeout after 5000ms)",
            "2026-04-06T04:00:20Z INFO  [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 30ms)",
        ],
        # Elevated read load from auth-service fallback queries, but all within limits.
        "db-postgres": [
            "2026-04-06T03:55:00Z INFO  [db-postgres] Active connections: 42/100",
            "2026-04-06T04:00:00Z INFO  [db-postgres] Active connections: 58/100",
            "2026-04-06T04:00:01Z INFO  [db-postgres] Checkpoint starting: time-based",
            "2026-04-06T04:00:03Z INFO  [db-postgres] Checkpoint complete: wrote 1450 buffers (9.8%)",
            "2026-04-06T04:00:05Z INFO  [db-postgres] Higher than normal read load — auth-service fallback queries detected",
            "2026-04-06T04:00:10Z INFO  [db-postgres] Active connections: 62/100 — elevated but within limits",
            "2026-04-06T04:00:15Z INFO  [db-postgres] No deadlocks detected. Lock wait queue empty.",
            "2026-04-06T04:00:20Z INFO  [db-postgres] Autovacuum: processing table sessions (dead tuples: 850)",
        ],
    },

    # Per-service metric samples; each entry is one timestamped snapshot.
    metrics={
        "payment-service": [
            {"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 18, "mem_pct": 40, "latency_p50": 88, "latency_p99": 155, "error_rate": 0.001, "pods_ready": 2, "pods_total": 2},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 0, "mem_pct": 0, "latency_p50": 0, "latency_p99": 0, "error_rate": 1.0, "pods_ready": 0, "pods_total": 2},
        ],
        "cache-redis": [
            {"timestamp": "2026-04-06T02:00:00Z", "mem_gb": 2.4, "mem_pct": 60, "hit_ratio": 0.92, "evictions_per_s": 0, "connections": 45, "fragmentation_ratio": 1.1},
            {"timestamp": "2026-04-06T02:30:00Z", "mem_gb": 2.6, "mem_pct": 65, "hit_ratio": 0.91, "evictions_per_s": 0, "connections": 46, "fragmentation_ratio": 1.2},
            {"timestamp": "2026-04-06T03:00:00Z", "mem_gb": 2.8, "mem_pct": 70, "hit_ratio": 0.90, "evictions_per_s": 5, "connections": 47, "fragmentation_ratio": 1.3},
            {"timestamp": "2026-04-06T03:30:00Z", "mem_gb": 3.4, "mem_pct": 85, "hit_ratio": 0.72, "evictions_per_s": 1200, "connections": 48, "fragmentation_ratio": 1.5},
            {"timestamp": "2026-04-06T03:45:00Z", "mem_gb": 3.7, "mem_pct": 92, "hit_ratio": 0.55, "evictions_per_s": 1800, "connections": 48, "fragmentation_ratio": 1.7},
            {"timestamp": "2026-04-06T04:00:00Z", "mem_gb": 3.85, "mem_pct": 96, "hit_ratio": 0.35, "evictions_per_s": 2200, "connections": 47, "fragmentation_ratio": 1.8},
        ],
        "auth-service": [
            {"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 22, "mem_pct": 58, "latency_p50": 12, "latency_p99": 45, "error_rate": 0.001, "cache_hit_ratio": 0.90, "db_fallback_pct": 0.10},
            {"timestamp": "2026-04-06T03:30:00Z", "cpu_pct": 28, "mem_pct": 60, "latency_p50": 25, "latency_p99": 180, "error_rate": 0.005, "cache_hit_ratio": 0.72, "db_fallback_pct": 0.28},
            {"timestamp": "2026-04-06T03:45:00Z", "cpu_pct": 35, "mem_pct": 62, "latency_p50": 120, "latency_p99": 1200, "error_rate": 0.05, "cache_hit_ratio": 0.55, "db_fallback_pct": 0.45},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 42, "mem_pct": 65, "latency_p50": 800, "latency_p99": 3200, "error_rate": 0.15, "cache_hit_ratio": 0.35, "db_fallback_pct": 0.65},
        ],
        "user-service": [
            {"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 28, "latency_p99": 75, "error_rate": 0.001},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 30, "latency_p99": 82, "error_rate": 0.001},
        ],
        "notification-service": [
            {"timestamp": "2026-04-06T03:45:00Z", "cpu_pct": 12, "mem_pct": 30, "queue_depth": 15, "auth_validation_ms": 48, "emails_sent_per_min": 120},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 14, "mem_pct": 32, "queue_depth": 2400, "auth_validation_ms": 2800, "emails_sent_per_min": 5},
        ],
        "api-gateway": [
            {"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 20, "mem_pct": 45, "latency_p50": 35, "latency_p99": 95, "error_rate": 0.002, "5xx_rate": 0.001},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 28, "mem_pct": 48, "latency_p50": 120, "latency_p99": 5200, "error_rate": 0.35, "5xx_rate": 0.32},
        ],
        "db-postgres": [
            {"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 35, "mem_pct": 55, "connections": 42, "active_locks": 2, "deadlocks": 0, "write_iops": 1200, "read_iops": 3500},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 45, "mem_pct": 58, "connections": 62, "active_locks": 3, "deadlocks": 0, "write_iops": 1100, "read_iops": 4800},
        ],
    },

    # Request traces rendered as text trees (one span per line).
    traces={
        "payment-service": [
            "No recent traces — service is down (CrashLoopBackOff). Last successful trace (before deploy):",
            "Trace: POST /api/v2/pay (txn=pay_9901, total=92ms) — v3.8.1",
            "  ├─ payment-service.validateRequest()      8ms",
            "  ├─ payment-service.checkBalance()         25ms  (SELECT -> db-postgres)",
            "  ├─ payment-service.insertTransaction()    40ms  (INSERT -> db-postgres)",
            "  └─ payment-service.sendConfirmation()     19ms",
        ],
        "auth-service": [
            "Trace: POST /auth/verify (uid=user_8832, total=3200ms)",
            "  ├─ auth-service.checkSessionCache()       8ms    (cache-redis MISS)",
            "  ├─ auth-service.fallbackDBLookup()        2900ms (db-postgres — under load from fallback traffic)",
            "  ├─ auth-service.validateToken()            45ms",
            "  └─ auth-service.writeBackToCache()         FAILED (redis OOM rejected write)",
        ],
        "notification-service": [
            "Trace: POST /notifications/send (batch=#8832, total=5200ms) — TIMEOUT",
            "  ├─ notification-service.prepareBatch()     12ms",
            "  ├─ notification-service.validateAuth()     5000ms (-> auth-service TIMEOUT)",
            "  └─ notification-service.sendEmails()       never reached",
        ],
    },

    # Deploy records per service, newest first. v3.8.1 is the version named in
    # required_fixes as the rollback target.
    deploy_history={
        "payment-service": [
            "v3.8.2  deployed 2026-04-06T04:00:00Z  status=CrashLoopBackOff  (deployed 15 min ago)",
            "v3.8.1  deployed 2026-04-03T14:00:00Z  status=superseded  (was stable for 3 days)",
            "v3.8.0  deployed 2026-03-28T10:00:00Z  status=superseded",
        ],
        "auth-service": [
            "v2.14.0  deployed 2026-04-01T10:00:00Z  status=stable  (running 5 days, no issues)",
        ],
        "cache-redis": [
            "v7.2.4  deployed 2026-03-20T09:00:00Z  status=stable  (running 17 days)",
        ],
        "user-service": [
            "v4.2.1  deployed 2026-04-05T16:00:00Z  status=stable  (running 12 hours)",
        ],
    },

    # Runbook text per service (markdown-style strings).
    runbooks={
        "payment-service": (
            "## payment-service Runbook\n"
            "- Crash on startup / CrashLoopBackOff: Check recent deploys. If the latest deploy\n"
            "  introduced the crash, rollback to previous known-good version:\n"
            "  rollback_deploy(service='payment-service', target_version='<previous_version>')\n"
            "  Check deploy history for the last stable version.\n"
            "- Transaction timeouts: Check db-postgres connection pool and lock status.\n"
            "- High latency: Check downstream service health (db-postgres)."
        ),
        "cache-redis": (
            "## cache-redis Runbook\n"
            "- Memory pressure / approaching maxmemory: Check memory trend in metrics.\n"
            "  If memory grows despite eviction, likely a memory leak.\n"
            "  Short-term fix: restart_service to clear leaked memory.\n"
            "  Alternative: scale_up to add more replicas and distribute load.\n"
            "- Elevated miss ratio: If caused by memory pressure/eviction storm, fix memory issue first.\n"
            "  If caused by TTL expiry batch, wait for cache to warm back up."
        ),
        "auth-service": (
            "## auth-service Runbook\n"
            "- High latency / DB fallback: Check cache-redis health. If redis is degraded,\n"
            "  auth-service falls back to DB lookups which are 10-50x slower.\n"
            "  Fix redis first — auth-service will recover automatically.\n"
            "- Cache write failures: Redis may be rejecting writes due to OOM. Check redis memory."
        ),
        "notification-service": (
            "## notification-service Runbook\n"
            "- Queue backing up: Usually caused by auth-service degradation. Notification-service\n"
            "  validates sender auth before sending. If auth is slow, queue grows.\n"
            "  Fix auth-service first — queue will drain automatically."
        ),
    },

    # Current vs. previous configuration per service, plus a human-readable diff.
    configs={
        "payment-service": {
            "current": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nVALIDATOR_VERSION=v2\nFEATURE_NEW_VALIDATION=true",
            "previous": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nVALIDATOR_VERSION=v1\nFEATURE_NEW_VALIDATION=false",
            "diff": "Changed VALIDATOR_VERSION from v1 to v2, enabled FEATURE_NEW_VALIDATION (part of v3.8.2 deploy)",
        },
        "user-service": {
            "current": "FEATURE_PROFILE_V2=true\nLEGACY_AVATAR_URL=https://cdn.example.com/avatars\nDB_POOL_SIZE=30",
            "previous": "FEATURE_PROFILE_V2=false\nDB_POOL_SIZE=30",
            "diff": "Added FEATURE_PROFILE_V2=true and LEGACY_AVATAR_URL (config change 30 min ago). 2 validation warnings but applied successfully.",
        },
        "cache-redis": {
            "current": "maxmemory=4gb\nmaxmemory-policy=allkeys-lru\ntimeout=300\ntcp-keepalive=60",
            "previous": "maxmemory=4gb\nmaxmemory-policy=allkeys-lru\ntimeout=300\ntcp-keepalive=60",
            "diff": "No changes — config has not been modified recently.",
        },
    },

    # Service -> direct dependencies. Mirrors the `deps` lists in `services`
    # above; keep the two in sync when editing either.
    dependencies={
        "api-gateway": ["auth-service", "user-service", "payment-service"],
        "auth-service": ["cache-redis"],
        "user-service": ["db-postgres"],
        "payment-service": ["db-postgres"],
        "db-postgres": [],
        "cache-redis": [],
        "notification-service": ["auth-service"],
    },

    # Expected answer: both root-cause services, their categories, and one fix each.
    root_cause_services=["payment-service", "cache-redis"],
    root_cause_categories=[RootCauseCategory.BAD_DEPLOY, RootCauseCategory.MEMORY_LEAK],
    required_fixes=[
        RequiredFix(action="rollback_deploy", service="payment-service", target_version="v3.8.1"),
        RequiredFix(action="restart_service", service="cache-redis"),
    ],
    diagnosis_keywords=[
        "payment-service", "deploy", "rollback", "v3.8.2", "v3.8.1", "NullPointerException", "crash",
        "cache-redis", "memory", "leak", "eviction", "auth-service", "fallback",
    ],

    # Scoring weights per grading component.
    # NOTE(review): these values sum to 0.90 rather than 1.0 — confirm how the
    # grader combines them (e.g. whether wrong_penalty is subtracted).
    weights={
        "correct_service": 0.15,
        "correct_category": 0.10,
        "correct_fix": 0.15,
        "secondary_fix": 0.20,
        "diagnosis_text": 0.15,
        "investigation": 0.10,
        "wrong_penalty": 0.05,
    },
)