"""
Task: Concurrent Faults with Misleading Evidence
To add a new task, copy this file, modify the SCENARIO definition, and place it in tasks/.
The task loader will auto-discover it.
"""
from env.scenario import IncidentScenario, RequiredFix, ServiceConfig
from models import RootCauseCategory, ServiceStatus
# Scenario definition for the "hard" task.
#
# Two independent root causes are active at the same time (see
# `root_cause_services` / `root_cause_categories` below):
#   * payment-service -- bad deploy of v3.8.2 (crashes on startup)
#   * cache-redis     -- memory leak (memory keeps growing despite eviction)
# The other non-healthy services (auth-service, notification-service,
# api-gateway) sit downstream of those two in the dependency map.
# user-service is HEALTHY; its config warnings are logged as non-critical and
# are presumably the "misleading evidence" the task name refers to.
SCENARIO = IncidentScenario(
    task_id="hard",
    name="Concurrent Faults with Misleading Evidence",
    difficulty="hard",
    max_steps=35,
    incident_summary=(
        "SEV-1 incident declared at 04:00 UTC. Multiple services affected simultaneously. "
        "payment-service is completely down after a recent deploy. auth-service showing intermittent "
        "timeouts and session validation failures. notification-service queue backing up. "
        "user-service has config warnings. api-gateway showing >30% error rate across multiple "
        "endpoints. Need to identify ALL root causes and restore full system health."
    ),
    # Per-service state at the start of the episode. `deps` here mirrors the
    # `dependencies` mapping further down; keep the two in sync when editing.
    # NOTE(review): auth-service logs show a db-postgres fallback path, but its
    # deps list only cache-redis -- confirm whether that omission is intended.
    services={
        "api-gateway": ServiceConfig(
            status=ServiceStatus.DEGRADED,
            deps=["auth-service", "user-service", "payment-service"],
            version="v1.12.0",
            replicas=3,
        ),
        "auth-service": ServiceConfig(
            status=ServiceStatus.DEGRADED,
            deps=["cache-redis"],
            version="v2.14.0",
            replicas=2,
        ),
        "user-service": ServiceConfig(
            status=ServiceStatus.HEALTHY,
            deps=["db-postgres"],
            version="v4.2.1",
            replicas=2,
        ),
        # Root cause #1.
        "payment-service": ServiceConfig(
            status=ServiceStatus.DOWN,
            deps=["db-postgres"],
            version="v3.8.2",
            replicas=2,
            is_root_cause=True,
            fault_type="bad_deploy",
        ),
        "db-postgres": ServiceConfig(
            status=ServiceStatus.HEALTHY,
            deps=[],
            version="v15.4",
            replicas=1,
        ),
        # Root cause #2.
        "cache-redis": ServiceConfig(
            status=ServiceStatus.DEGRADED,
            deps=[],
            version="v7.2.4",
            replicas=1,
            is_root_cause=True,
            fault_type="memory_leak",
        ),
        "notification-service": ServiceConfig(
            status=ServiceStatus.DEGRADED,
            deps=["auth-service"],
            version="v1.5.0",
            replicas=1,
        ),
    },
    initial_alerts=[
        "[ALERT SEV-1] api-gateway: error rate >30%, multiple downstream failures detected",
        "[ALERT SEV-1] payment-service: health check failing, 0/2 pods ready, CrashLoopBackOff",
        "[ALERT SEV-2] auth-service: intermittent 500 errors, session validation latency >3s",
        "[ALERT SEV-2] notification-service: email delivery queue depth >2000, processing stalled",
        "[ALERT SEV-3] user-service: config validation warning (non-critical)",
    ],
    # Log lines per service, in chronological order.
    logs={
        "payment-service": [
            "2026-04-06T04:00:00Z INFO [payment-service] Deploying v3.8.2 (previous: v3.8.1)",
            "2026-04-06T04:00:01Z INFO [payment-service] Container image pulled: registry.internal/payment-service:v3.8.2",
            "2026-04-06T04:00:02Z INFO [payment-service] Pod payment-service-7d4f8b-xk9m2 starting...",
            "2026-04-06T04:00:03Z INFO [payment-service] Starting health check sequence...",
            "2026-04-06T04:00:04Z INFO [payment-service] Loading configuration from ConfigMap...",
            "2026-04-06T04:00:05Z INFO [payment-service] Initializing payment validation module v2 (new in v3.8.2)",
            "2026-04-06T04:00:05Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
            "2026-04-06T04:00:05Z ERROR [payment-service] Stack trace:",
            " at com.acme.payment.validator.PaymentValidatorV2.initialize(PaymentValidatorV2.java:42)",
            " at com.acme.payment.bootstrap.ServiceBootstrap.initModules(ServiceBootstrap.java:118)",
            " at com.acme.payment.bootstrap.ServiceBootstrap.start(ServiceBootstrap.java:55)",
            " at com.acme.payment.Main.main(Main.java:12)",
            "2026-04-06T04:00:06Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
            "2026-04-06T04:00:06Z INFO [payment-service] Shutdown hook triggered, cleaning up...",
            "2026-04-06T04:00:07Z INFO [payment-service] Health check endpoint /healthz returning 503",
            "2026-04-06T04:00:10Z WARN [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 failed readiness probe (1/3)",
            "2026-04-06T04:00:20Z WARN [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 failed readiness probe (2/3)",
            "2026-04-06T04:00:30Z ERROR [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 marked NotReady, removed from service",
            "2026-04-06T04:00:31Z INFO [payment-service] Kubernetes: restarting pod (CrashLoopBackOff)",
            "2026-04-06T04:00:35Z INFO [payment-service] Starting health check sequence...",
            "2026-04-06T04:00:37Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
            "2026-04-06T04:00:37Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
            "2026-04-06T04:00:38Z INFO [payment-service] Kubernetes: restarting pod (CrashLoopBackOff)",
            "2026-04-06T04:00:45Z INFO [payment-service] Starting health check sequence...",
            "2026-04-06T04:00:47Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
            "2026-04-06T04:00:47Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
            "2026-04-06T04:01:00Z ERROR [payment-service] CrashLoopBackOff: backing off 60s before next restart",
            "2026-04-06T04:02:05Z INFO [payment-service] Starting health check sequence...",
            "2026-04-06T04:02:07Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
            "2026-04-06T04:02:07Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
            "2026-04-06T04:02:10Z ERROR [payment-service] CrashLoopBackOff: backing off 120s before next restart",
        ],
        "cache-redis": [
            "2026-04-06T03:00:00Z INFO [cache-redis] Memory usage: 2.8GB/4.0GB (70%) — within operational range",
            "2026-04-06T03:05:00Z INFO [cache-redis] Memory usage: 2.9GB/4.0GB (72%)",
            "2026-04-06T03:10:00Z INFO [cache-redis] Memory usage: 3.0GB/4.0GB (75%)",
            "2026-04-06T03:15:00Z INFO [cache-redis] Memory usage: 3.1GB/4.0GB (77%)",
            "2026-04-06T03:20:00Z INFO [cache-redis] Memory usage: 3.2GB/4.0GB (80%)",
            "2026-04-06T03:25:00Z INFO [cache-redis] Memory usage: 3.3GB/4.0GB (82%)",
            "2026-04-06T03:30:00Z WARN [cache-redis] Memory usage: 3.4GB/4.0GB (85%) — approaching maxmemory threshold",
            "2026-04-06T03:30:01Z INFO [cache-redis] Eviction policy: allkeys-lru activated",
            "2026-04-06T03:30:05Z WARN [cache-redis] Evicting 1200 keys/sec to maintain memory budget",
            "2026-04-06T03:35:00Z WARN [cache-redis] Memory usage: 3.5GB/4.0GB (87%) despite active eviction",
            "2026-04-06T03:40:00Z WARN [cache-redis] Memory usage: 3.6GB/4.0GB (90%)",
            "2026-04-06T03:45:00Z WARN [cache-redis] Memory usage: 3.7GB/4.0GB (92%) despite active eviction",
            "2026-04-06T03:45:01Z WARN [cache-redis] Eviction rate insufficient: incoming writes (2.1GB/hr) exceed eviction rate (1.5GB/hr)",
            "2026-04-06T03:45:02Z WARN [cache-redis] Key namespace auth:session:* most affected — 60% of evictions from this prefix",
            "2026-04-06T03:50:00Z WARN [cache-redis] Memory usage: 3.8GB/4.0GB (95%)",
            "2026-04-06T03:55:00Z ERROR [cache-redis] Memory usage: 3.82GB/4.0GB (95.5%)",
            "2026-04-06T04:00:00Z ERROR [cache-redis] Memory usage: 3.85GB/4.0GB (96%) — critical threshold",
            "2026-04-06T04:00:01Z ERROR [cache-redis] Rejecting 12% of SET commands due to memory pressure",
            "2026-04-06T04:00:02Z WARN [cache-redis] Client auth-service reporting increased cache misses (hit ratio: 35%, normal: 90%)",
            "2026-04-06T04:00:05Z ERROR [cache-redis] Memory fragmentation ratio: 1.8 (healthy: <1.5) — possible memory leak in module",
            "2026-04-06T04:00:10Z WARN [cache-redis] Resident memory growing despite aggressive eviction — suspect leaked allocations in Lua script engine",
            "2026-04-06T04:00:15Z ERROR [cache-redis] Rejecting 18% of SET commands due to memory pressure",
        ],
        "auth-service": [
            "2026-04-06T03:00:00Z INFO [auth-service] Request: POST /auth/token uid=user_4421 -> cache HIT (12ms)",
            "2026-04-06T03:00:05Z INFO [auth-service] Request: POST /auth/verify uid=user_8832 -> cache HIT (10ms)",
            "2026-04-06T03:15:00Z INFO [auth-service] Request: POST /auth/token uid=user_3310 -> cache HIT (11ms)",
            "2026-04-06T03:30:00Z INFO [auth-service] Request: POST /auth/token uid=user_5571 -> cache HIT (13ms)",
            "2026-04-06T03:45:00Z WARN [auth-service] Cache miss for session sid=c9f21a — falling back to db-postgres lookup (280ms)",
            "2026-04-06T03:45:02Z INFO [auth-service] Request: POST /auth/token uid=user_7712 -> cache HIT (14ms)",
            "2026-04-06T03:45:05Z WARN [auth-service] Cache miss rate elevated: 45% (normal: <10%)",
            "2026-04-06T03:45:10Z WARN [auth-service] Cache miss for session sid=d4e82b — falling back to db-postgres lookup (320ms)",
            "2026-04-06T03:50:00Z WARN [auth-service] DB connection pool: 28/30 active (falling back to DB for most session lookups)",
            "2026-04-06T03:55:00Z WARN [auth-service] Cache miss rate: 55% — DB fallback path overloaded",
            "2026-04-06T04:00:00Z ERROR [auth-service] Cache write rejected by redis: OOM command not allowed when used memory > maxmemory",
            "2026-04-06T04:00:01Z WARN [auth-service] 65% of requests hitting DB fallback path — latency p99 = 3200ms",
            "2026-04-06T04:00:03Z ERROR [auth-service] Request timeout: POST /auth/verify uid=user_8832 (DB fallback overloaded)",
            "2026-04-06T04:00:05Z ERROR [auth-service] Request timeout: POST /auth/token uid=user_2209 (DB fallback overloaded)",
            "2026-04-06T04:00:08Z WARN [auth-service] DB connection pool: 30/30 active (SATURATED)",
            "2026-04-06T04:00:10Z WARN [auth-service] Degraded mode: session validation averaging 1800ms (SLA: 200ms)",
            "2026-04-06T04:00:15Z ERROR [auth-service] 5 request timeouts in last 60 seconds",
        ],
        # user-service: warnings only, every request returns 200.
        "user-service": [
            "2026-04-06T03:30:00Z INFO [user-service] Config reload triggered by configmap update",
            "2026-04-06T03:30:01Z WARN [user-service] Config validation: feature flag 'enable_profile_v2' references unknown experiment 'profile_redesign_q2'",
            "2026-04-06T03:30:01Z WARN [user-service] Config validation: deprecated field 'legacy_avatar_url' present — will be removed in v4.0",
            "2026-04-06T03:30:02Z INFO [user-service] Config applied successfully (2 warnings, 0 errors)",
            "2026-04-06T03:30:03Z INFO [user-service] All endpoints healthy, no service disruption during config reload",
            "2026-04-06T03:30:10Z INFO [user-service] GET /users/profile uid=user_4421 -> 200 (28ms)",
            "2026-04-06T03:45:00Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (30ms)",
            "2026-04-06T03:45:05Z INFO [user-service] PUT /users/profile uid=user_3310 -> 200 (82ms)",
            "2026-04-06T04:00:00Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (28ms)",
            "2026-04-06T04:00:01Z INFO [user-service] PUT /users/profile uid=user_3310 -> 200 (95ms)",
            "2026-04-06T04:00:05Z INFO [user-service] GET /users/settings uid=user_5571 -> 200 (26ms)",
            "2026-04-06T04:00:10Z INFO [user-service] Health check /healthz -> 200 OK",
        ],
        "notification-service": [
            "2026-04-06T03:45:00Z INFO [notification-service] Auth token validated for batch #4445 (48ms)",
            "2026-04-06T03:45:01Z INFO [notification-service] Email batch #4445 sent successfully (15 emails)",
            "2026-04-06T04:00:00Z WARN [notification-service] Auth token validation taking 2800ms (SLA: 500ms)",
            "2026-04-06T04:00:02Z WARN [notification-service] Email delivery queue depth: 2400 (normal: <100)",
            "2026-04-06T04:00:05Z ERROR [notification-service] Failed to validate sender auth for notification batch #8832 — auth-service timeout",
            "2026-04-06T04:00:06Z WARN [notification-service] Pausing email delivery until auth validation recovers",
            "2026-04-06T04:00:10Z WARN [notification-service] Queue depth growing: 2800 pending emails",
            "2026-04-06T04:00:15Z ERROR [notification-service] Auth validation timeout for batch #8833",
            "2026-04-06T04:00:20Z WARN [notification-service] Queue depth: 3200 — SLA breach imminent for time-sensitive notifications",
        ],
        "api-gateway": [
            "2026-04-06T03:59:55Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 45ms)",
            "2026-04-06T03:59:58Z INFO [api-gateway] Route: POST /api/v2/pay -> payment-service (200, 92ms)",
            "2026-04-06T04:00:01Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, connection refused)",
            "2026-04-06T04:00:02Z WARN [api-gateway] Route: POST /api/v2/login -> auth-service (200, 1800ms) — slow",
            "2026-04-06T04:00:03Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 28ms)",
            "2026-04-06T04:00:05Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, connection refused)",
            "2026-04-06T04:00:06Z WARN [api-gateway] Circuit breaker OPEN for payment-service (failures=5, threshold=5)",
            "2026-04-06T04:00:08Z ERROR [api-gateway] Route: POST /api/v2/verify -> auth-service (504, timeout after 5000ms)",
            "2026-04-06T04:00:10Z INFO [api-gateway] Route: GET /api/v2/user/settings -> user-service (200, 25ms)",
            "2026-04-06T04:00:12Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, circuit breaker open)",
            "2026-04-06T04:00:15Z WARN [api-gateway] Route: POST /api/v2/login -> auth-service (200, 3200ms) — very slow",
            "2026-04-06T04:00:18Z ERROR [api-gateway] Route: POST /api/v2/verify -> auth-service (504, timeout after 5000ms)",
            "2026-04-06T04:00:20Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 30ms)",
        ],
        "db-postgres": [
            "2026-04-06T03:55:00Z INFO [db-postgres] Active connections: 42/100",
            "2026-04-06T04:00:00Z INFO [db-postgres] Active connections: 58/100",
            "2026-04-06T04:00:01Z INFO [db-postgres] Checkpoint starting: time-based",
            "2026-04-06T04:00:03Z INFO [db-postgres] Checkpoint complete: wrote 1450 buffers (9.8%)",
            "2026-04-06T04:00:05Z INFO [db-postgres] Higher than normal read load — auth-service fallback queries detected",
            "2026-04-06T04:00:10Z INFO [db-postgres] Active connections: 62/100 — elevated but within limits",
            "2026-04-06T04:00:15Z INFO [db-postgres] No deadlocks detected. Lock wait queue empty.",
            "2026-04-06T04:00:20Z INFO [db-postgres] Autovacuum: processing table sessions (dead tuples: 850)",
        ],
    },
    # Time-series samples per service; the set of keys differs by service.
    metrics={
        "payment-service": [
            {"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 18, "mem_pct": 40, "latency_p50": 88, "latency_p99": 155, "error_rate": 0.001, "pods_ready": 2, "pods_total": 2},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 0, "mem_pct": 0, "latency_p50": 0, "latency_p99": 0, "error_rate": 1.0, "pods_ready": 0, "pods_total": 2},
        ],
        "cache-redis": [
            {"timestamp": "2026-04-06T02:00:00Z", "mem_gb": 2.4, "mem_pct": 60, "hit_ratio": 0.92, "evictions_per_s": 0, "connections": 45, "fragmentation_ratio": 1.1},
            {"timestamp": "2026-04-06T02:30:00Z", "mem_gb": 2.6, "mem_pct": 65, "hit_ratio": 0.91, "evictions_per_s": 0, "connections": 46, "fragmentation_ratio": 1.2},
            {"timestamp": "2026-04-06T03:00:00Z", "mem_gb": 2.8, "mem_pct": 70, "hit_ratio": 0.90, "evictions_per_s": 5, "connections": 47, "fragmentation_ratio": 1.3},
            {"timestamp": "2026-04-06T03:30:00Z", "mem_gb": 3.4, "mem_pct": 85, "hit_ratio": 0.72, "evictions_per_s": 1200, "connections": 48, "fragmentation_ratio": 1.5},
            {"timestamp": "2026-04-06T03:45:00Z", "mem_gb": 3.7, "mem_pct": 92, "hit_ratio": 0.55, "evictions_per_s": 1800, "connections": 48, "fragmentation_ratio": 1.7},
            {"timestamp": "2026-04-06T04:00:00Z", "mem_gb": 3.85, "mem_pct": 96, "hit_ratio": 0.35, "evictions_per_s": 2200, "connections": 47, "fragmentation_ratio": 1.8},
        ],
        "auth-service": [
            {"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 22, "mem_pct": 58, "latency_p50": 12, "latency_p99": 45, "error_rate": 0.001, "cache_hit_ratio": 0.90, "db_fallback_pct": 0.10},
            {"timestamp": "2026-04-06T03:30:00Z", "cpu_pct": 28, "mem_pct": 60, "latency_p50": 25, "latency_p99": 180, "error_rate": 0.005, "cache_hit_ratio": 0.72, "db_fallback_pct": 0.28},
            {"timestamp": "2026-04-06T03:45:00Z", "cpu_pct": 35, "mem_pct": 62, "latency_p50": 120, "latency_p99": 1200, "error_rate": 0.05, "cache_hit_ratio": 0.55, "db_fallback_pct": 0.45},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 42, "mem_pct": 65, "latency_p50": 800, "latency_p99": 3200, "error_rate": 0.15, "cache_hit_ratio": 0.35, "db_fallback_pct": 0.65},
        ],
        "user-service": [
            {"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 28, "latency_p99": 75, "error_rate": 0.001},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 30, "latency_p99": 82, "error_rate": 0.001},
        ],
        "notification-service": [
            {"timestamp": "2026-04-06T03:45:00Z", "cpu_pct": 12, "mem_pct": 30, "queue_depth": 15, "auth_validation_ms": 48, "emails_sent_per_min": 120},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 14, "mem_pct": 32, "queue_depth": 2400, "auth_validation_ms": 2800, "emails_sent_per_min": 5},
        ],
        "api-gateway": [
            {"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 20, "mem_pct": 45, "latency_p50": 35, "latency_p99": 95, "error_rate": 0.002, "5xx_rate": 0.001},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 28, "mem_pct": 48, "latency_p50": 120, "latency_p99": 5200, "error_rate": 0.35, "5xx_rate": 0.32},
        ],
        "db-postgres": [
            {"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 35, "mem_pct": 55, "connections": 42, "active_locks": 2, "deadlocks": 0, "write_iops": 1200, "read_iops": 3500},
            {"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 45, "mem_pct": 58, "connections": 62, "active_locks": 3, "deadlocks": 0, "write_iops": 1100, "read_iops": 4800},
        ],
    },
    # Pre-rendered trace trees; each child span is one list entry.
    traces={
        "payment-service": [
            "No recent traces — service is down (CrashLoopBackOff). Last successful trace (before deploy):",
            "Trace: POST /api/v2/pay (txn=pay_9901, total=92ms) — v3.8.1",
            " ├─ payment-service.validateRequest() 8ms",
            " ├─ payment-service.checkBalance() 25ms (SELECT -> db-postgres)",
            " ├─ payment-service.insertTransaction() 40ms (INSERT -> db-postgres)",
            " └─ payment-service.sendConfirmation() 19ms",
        ],
        "auth-service": [
            "Trace: POST /auth/verify (uid=user_8832, total=3200ms)",
            " ├─ auth-service.checkSessionCache() 8ms (cache-redis MISS)",
            " ├─ auth-service.fallbackDBLookup() 2900ms (db-postgres — under load from fallback traffic)",
            " ├─ auth-service.validateToken() 45ms",
            " └─ auth-service.writeBackToCache() FAILED (redis OOM rejected write)",
        ],
        "notification-service": [
            "Trace: POST /notifications/send (batch=#8832, total=5200ms) — TIMEOUT",
            " ├─ notification-service.prepareBatch() 12ms",
            " ├─ notification-service.validateAuth() 5000ms (-> auth-service TIMEOUT)",
            " └─ notification-service.sendEmails() never reached",
        ],
    },
    # Newest deploy first for each service.
    deploy_history={
        "payment-service": [
            "v3.8.2 deployed 2026-04-06T04:00:00Z status=CrashLoopBackOff (deployed 15 min ago)",
            "v3.8.1 deployed 2026-04-03T14:00:00Z status=superseded (was stable for 3 days)",
            "v3.8.0 deployed 2026-03-28T10:00:00Z status=superseded",
        ],
        "auth-service": [
            "v2.14.0 deployed 2026-04-01T10:00:00Z status=stable (running 5 days, no issues)",
        ],
        "cache-redis": [
            "v7.2.4 deployed 2026-03-20T09:00:00Z status=stable (running 17 days)",
        ],
        "user-service": [
            "v4.2.1 deployed 2026-04-05T16:00:00Z status=stable (running 12 hours)",
        ],
    },
    # Markdown runbook text per service.
    runbooks={
        "payment-service": (
            "## payment-service Runbook\n"
            "- Crash on startup / CrashLoopBackOff: Check recent deploys. If the latest deploy\n"
            " introduced the crash, rollback to previous known-good version:\n"
            " rollback_deploy(service='payment-service', target_version='<previous_version>')\n"
            " Check deploy history for the last stable version.\n"
            "- Transaction timeouts: Check db-postgres connection pool and lock status.\n"
            "- High latency: Check downstream service health (db-postgres)."
        ),
        "cache-redis": (
            "## cache-redis Runbook\n"
            "- Memory pressure / approaching maxmemory: Check memory trend in metrics.\n"
            " If memory grows despite eviction, likely a memory leak.\n"
            " Short-term fix: restart_service to clear leaked memory.\n"
            " Alternative: scale_up to add more replicas and distribute load.\n"
            "- Elevated miss ratio: If caused by memory pressure/eviction storm, fix memory issue first.\n"
            " If caused by TTL expiry batch, wait for cache to warm back up."
        ),
        "auth-service": (
            "## auth-service Runbook\n"
            "- High latency / DB fallback: Check cache-redis health. If redis is degraded,\n"
            " auth-service falls back to DB lookups which are 10-50x slower.\n"
            " Fix redis first — auth-service will recover automatically.\n"
            "- Cache write failures: Redis may be rejecting writes due to OOM. Check redis memory."
        ),
        "notification-service": (
            "## notification-service Runbook\n"
            "- Queue backing up: Usually caused by auth-service degradation. Notification-service\n"
            " validates sender auth before sending. If auth is slow, queue grows.\n"
            " Fix auth-service first — queue will drain automatically."
        ),
    },
    # Current vs. previous config per service, plus a human-readable diff.
    configs={
        "payment-service": {
            "current": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nVALIDATOR_VERSION=v2\nFEATURE_NEW_VALIDATION=true",
            "previous": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nVALIDATOR_VERSION=v1\nFEATURE_NEW_VALIDATION=false",
            "diff": "Changed VALIDATOR_VERSION from v1 to v2, enabled FEATURE_NEW_VALIDATION (part of v3.8.2 deploy)",
        },
        "user-service": {
            "current": "FEATURE_PROFILE_V2=true\nLEGACY_AVATAR_URL=https://cdn.example.com/avatars\nDB_POOL_SIZE=30",
            "previous": "FEATURE_PROFILE_V2=false\nDB_POOL_SIZE=30",
            "diff": "Added FEATURE_PROFILE_V2=true and LEGACY_AVATAR_URL (config change 30 min ago). 2 validation warnings but applied successfully.",
        },
        "cache-redis": {
            "current": "maxmemory=4gb\nmaxmemory-policy=allkeys-lru\ntimeout=300\ntcp-keepalive=60",
            "previous": "maxmemory=4gb\nmaxmemory-policy=allkeys-lru\ntimeout=300\ntcp-keepalive=60",
            "diff": "No changes — config has not been modified recently.",
        },
    },
    # Service -> direct dependencies; same data as each ServiceConfig.deps above.
    dependencies={
        "api-gateway": ["auth-service", "user-service", "payment-service"],
        "auth-service": ["cache-redis"],
        "user-service": ["db-postgres"],
        "payment-service": ["db-postgres"],
        "db-postgres": [],
        "cache-redis": [],
        "notification-service": ["auth-service"],
    },
    # Expected answer: both services/categories, listed in matching order.
    root_cause_services=["payment-service", "cache-redis"],
    root_cause_categories=[RootCauseCategory.BAD_DEPLOY, RootCauseCategory.MEMORY_LEAK],
    # v3.8.1 is the version the deploy history records as previously stable.
    required_fixes=[
        RequiredFix(action="rollback_deploy", service="payment-service", target_version="v3.8.1"),
        RequiredFix(action="restart_service", service="cache-redis"),
    ],
    diagnosis_keywords=[
        "payment-service", "deploy", "rollback", "v3.8.2", "v3.8.1", "NullPointerException", "crash",
        "cache-redis", "memory", "leak", "eviction", "auth-service", "fallback",
    ],
    # NOTE(review): these values sum to 0.90 (0.85 without wrong_penalty).
    # Confirm against the grader whether the weights are expected to total 1.0.
    weights={
        "correct_service": 0.15,
        "correct_category": 0.10,
        "correct_fix": 0.15,
        "secondary_fix": 0.20,
        "diagnosis_text": 0.15,
        "investigation": 0.10,
        "wrong_penalty": 0.05,
    },
)