Spaces:
Sleeping
Sleeping
File size: 21,183 Bytes
5fe9036 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 | """
Task: Cascading Database Deadlock
To add a new task, copy this file, modify the SCENARIO definition, and place it in tasks/.
The task loader will auto-discover it.
"""
from env.scenario import IncidentScenario, RequiredFix, ServiceConfig
from models import RootCauseCategory, ServiceStatus
SCENARIO = IncidentScenario(
task_id="medium",
name="Cascading Database Deadlock",
difficulty="medium",
max_steps=25,
incident_summary=(
"Multiple alerts fired at 03:05 UTC. payment-service and user-service both showing elevated "
"error rates and latency. Transaction timeouts increasing. cache-redis also flagged with "
"elevated miss ratio. Need to identify root cause and restore write path."
),
services={
"api-gateway": ServiceConfig(
status=ServiceStatus.DEGRADED, deps=["auth-service", "user-service", "payment-service"],
version="v1.12.0", replicas=3,
),
"auth-service": ServiceConfig(
status=ServiceStatus.HEALTHY, deps=["cache-redis"],
version="v2.14.0", replicas=2,
),
"user-service": ServiceConfig(
status=ServiceStatus.DEGRADED, deps=["db-postgres"],
version="v4.2.1", replicas=2,
),
"payment-service": ServiceConfig(
status=ServiceStatus.DEGRADED, deps=["db-postgres"],
version="v3.8.1", replicas=2,
),
"db-postgres": ServiceConfig(
status=ServiceStatus.DEGRADED, deps=[],
version="v15.4", replicas=1, is_root_cause=True, fault_type="db_deadlock",
),
"cache-redis": ServiceConfig(
status=ServiceStatus.HEALTHY, deps=[],
version="v7.2.4", replicas=1,
),
"notification-service": ServiceConfig(
status=ServiceStatus.HEALTHY, deps=["auth-service"],
version="v1.5.0", replicas=1,
),
},
initial_alerts=[
"[ALERT SEV-2] payment-service: transaction timeouts >15%, p99 latency >2s",
"[ALERT SEV-2] user-service: elevated error rate on profile updates",
"[ALERT SEV-3] cache-redis: cache miss ratio elevated (informational)",
],
logs={
"payment-service": [
"2026-04-06T03:00:01Z INFO [payment-service] Processing payment txn=pay_8832 amount=$45.00 -> db-postgres",
"2026-04-06T03:00:02Z INFO [payment-service] Payment completed txn=pay_8832 latency=85ms",
"2026-04-06T03:00:10Z INFO [payment-service] Processing payment txn=pay_1120 amount=$12.99 -> db-postgres",
"2026-04-06T03:00:11Z INFO [payment-service] Payment completed txn=pay_1120 latency=92ms",
"2026-04-06T03:00:20Z INFO [payment-service] Processing payment txn=pay_3341 amount=$199.00 -> db-postgres",
"2026-04-06T03:00:21Z INFO [payment-service] Payment completed txn=pay_3341 latency=78ms",
"2026-04-06T03:01:00Z INFO [payment-service] Health check /healthz -> 200 OK",
"2026-04-06T03:02:00Z INFO [payment-service] Processing payment txn=pay_5590 amount=$25.00 -> db-postgres",
"2026-04-06T03:02:01Z INFO [payment-service] Payment completed txn=pay_5590 latency=95ms",
"2026-04-06T03:03:00Z INFO [payment-service] Processing payment txn=pay_6612 amount=$150.00 -> db-postgres",
"2026-04-06T03:03:01Z INFO [payment-service] Payment completed txn=pay_6612 latency=88ms",
"2026-04-06T03:04:00Z INFO [payment-service] Health check /healthz -> 200 OK",
"2026-04-06T03:05:00Z INFO [payment-service] Processing payment txn=pay_7789 amount=$55.00 -> db-postgres",
"2026-04-06T03:05:12Z WARN [payment-service] Slow query: INSERT INTO transactions (...) took 3200ms (threshold: 500ms)",
"2026-04-06T03:05:15Z INFO [payment-service] Payment completed txn=pay_7789 latency=3250ms",
"2026-04-06T03:05:16Z INFO [payment-service] Processing payment txn=pay_1120 amount=$67.00 -> db-postgres",
"2026-04-06T03:05:18Z WARN [payment-service] DB connection pool: 48/50 active (96% utilized)",
"2026-04-06T03:05:20Z ERROR [payment-service] Transaction timeout: txn=pay_4455 exceeded 5000ms deadline",
"2026-04-06T03:05:20Z ERROR [payment-service] Retrying txn=pay_4455 (attempt 2/3)",
"2026-04-06T03:05:25Z ERROR [payment-service] Transaction timeout: txn=pay_4455 exceeded 5000ms deadline (retry 2)",
"2026-04-06T03:05:25Z ERROR [payment-service] Transaction failed permanently: txn=pay_4455 after 3 retries",
"2026-04-06T03:05:26Z WARN [payment-service] DB connection pool: 50/50 active (SATURATED) β new requests queuing",
"2026-04-06T03:05:28Z ERROR [payment-service] Connection acquisition timeout: waited 10s for available connection",
"2026-04-06T03:05:30Z INFO [payment-service] Read query SELECT balance WHERE user_id=... completed in 45ms",
"2026-04-06T03:05:32Z ERROR [payment-service] Transaction timeout: txn=pay_6691 exceeded 5000ms deadline",
"2026-04-06T03:05:33Z WARN [payment-service] Circuit breaker WARNING for db-postgres writes (failures=8/10 threshold)",
"2026-04-06T03:05:35Z ERROR [payment-service] Transaction timeout: txn=pay_7801 exceeded 5000ms deadline",
"2026-04-06T03:05:40Z ERROR [payment-service] Transaction timeout: txn=pay_8912 exceeded 5000ms deadline",
"2026-04-06T03:06:00Z ERROR [payment-service] Connection acquisition timeout: waited 15s for available connection",
"2026-04-06T03:07:00Z ERROR [payment-service] 12 transactions failed in last 5 minutes. Write path severely degraded.",
"2026-04-06T03:08:00Z ERROR [payment-service] 15 transactions failed in last 5 minutes. Write path severely degraded.",
],
"user-service": [
"2026-04-06T03:00:01Z INFO [user-service] GET /users/profile uid=user_4421 -> 200 (32ms)",
"2026-04-06T03:00:05Z INFO [user-service] GET /users/settings uid=user_8832 -> 200 (28ms)",
"2026-04-06T03:00:10Z INFO [user-service] PUT /users/profile uid=user_3310 -> 200 (85ms)",
"2026-04-06T03:01:00Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (30ms)",
"2026-04-06T03:02:00Z INFO [user-service] PUT /users/settings uid=user_5571 -> 200 (78ms)",
"2026-04-06T03:03:00Z INFO [user-service] GET /users/profile uid=user_7712 -> 200 (27ms)",
"2026-04-06T03:04:00Z INFO [user-service] GET /users/profile uid=user_2209 -> 200 (31ms)",
"2026-04-06T03:05:10Z INFO [user-service] GET /users/profile uid=user_9901 -> 200 (29ms)",
"2026-04-06T03:05:15Z INFO [user-service] GET /users/profile uid=user_6633 -> 200 (26ms)",
"2026-04-06T03:05:18Z WARN [user-service] Slow mutation: UPDATE users SET email=... took 4100ms",
"2026-04-06T03:05:20Z ERROR [user-service] Profile update failed: uid=user_8832 β database lock acquisition timeout",
"2026-04-06T03:05:22Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (28ms)",
"2026-04-06T03:05:25Z ERROR [user-service] Profile update failed: uid=user_3310 β database lock acquisition timeout",
"2026-04-06T03:05:26Z WARN [user-service] Write operations failing at 60% rate, reads unaffected",
"2026-04-06T03:05:30Z INFO [user-service] GET /users/profile uid=user_4482 -> 200 (30ms)",
"2026-04-06T03:06:00Z ERROR [user-service] Profile update failed: uid=user_5510 β database lock acquisition timeout",
"2026-04-06T03:06:05Z INFO [user-service] GET /users/settings uid=user_7781 -> 200 (25ms)",
"2026-04-06T03:07:00Z WARN [user-service] Write operations failing at 75% rate, reads unaffected",
],
"db-postgres": [
"2026-04-06T02:55:00Z INFO [db-postgres] Connection from analytics-cron@10.0.3.42: BEGIN; SELECT ... FROM transactions JOIN users ... (full table scan)",
"2026-04-06T02:55:01Z INFO [db-postgres] Query plan: Seq Scan on transactions (rows=2.4M, cost=45000..89000)",
"2026-04-06T02:55:01Z WARN [db-postgres] Long-running transaction txid=8830012 holding RowExclusiveLock on transactions table",
"2026-04-06T02:56:00Z INFO [db-postgres] Active connections: 55/100",
"2026-04-06T02:58:00Z INFO [db-postgres] Active connections: 68/100",
"2026-04-06T03:00:00Z INFO [db-postgres] Checkpoint starting: time-based",
"2026-04-06T03:00:02Z INFO [db-postgres] Checkpoint complete: wrote 1204 buffers (8.2%)",
"2026-04-06T03:00:05Z INFO [db-postgres] Active connections: 70/100",
"2026-04-06T03:02:00Z INFO [db-postgres] Active connections: 78/100",
"2026-04-06T03:04:00Z INFO [db-postgres] Active connections: 88/100",
"2026-04-06T03:05:10Z WARN [db-postgres] Deadlock detected: process 4821 (payment-service) waiting for RowExclusiveLock on transactions, blocked by process 4455 (analytics-cron)",
"2026-04-06T03:05:10Z WARN [db-postgres] Deadlock detected: process 4830 (user-service) waiting for RowExclusiveLock on users, blocked by process 4455 (analytics-cron)",
"2026-04-06T03:05:11Z INFO [db-postgres] Active connections: 95/100 (analytics-cron holding 1, payment-service pool 50, user-service pool 30, other 14)",
"2026-04-06T03:05:15Z WARN [db-postgres] Long-running transaction txid=8830012 has been active for 10m15s β consider terminating",
"2026-04-06T03:05:20Z WARN [db-postgres] Lock wait queue depth: 12 processes waiting",
"2026-04-06T03:06:00Z INFO [db-postgres] SELECT queries completing normally (read path unaffected)",
"2026-04-06T03:06:30Z WARN [db-postgres] Connection pool nearing limit: 98/100 active",
"2026-04-06T03:07:00Z WARN [db-postgres] Lock wait queue depth: 18 processes waiting β growing",
"2026-04-06T03:08:00Z ERROR [db-postgres] Connection limit reached: 100/100 β rejecting new connections",
],
"auth-service": [
"2026-04-06T03:00:00Z INFO [auth-service] Request processed: POST /auth/token uid=user_8832 latency=42ms",
"2026-04-06T03:00:05Z INFO [auth-service] Cache hit for session sid=a8f32c, returning cached token",
"2026-04-06T03:05:00Z INFO [auth-service] Request processed: POST /auth/verify uid=user_3310 latency=38ms",
"2026-04-06T03:05:10Z INFO [auth-service] Request processed: POST /auth/token uid=user_5571 latency=45ms",
"2026-04-06T03:05:30Z INFO [auth-service] Health check /healthz -> 200 OK",
"2026-04-06T03:08:00Z INFO [auth-service] Request processed: POST /auth/verify uid=user_1101 latency=40ms",
],
"cache-redis": [
"2026-04-06T03:00:00Z INFO [cache-redis] Memory usage: 1.2GB/4.0GB (30%)",
"2026-04-06T03:00:01Z INFO [cache-redis] Cache hit ratio: 82% (normal: 85-95%)",
"2026-04-06T03:02:00Z INFO [cache-redis] Cache hit ratio: 80%",
"2026-04-06T03:05:00Z INFO [cache-redis] Cache hit ratio: 78% β slight decrease",
"2026-04-06T03:05:01Z INFO [cache-redis] Key evictions: 45 in last 5m (within normal range)",
"2026-04-06T03:05:02Z WARN [cache-redis] Cache miss ratio elevated for prefix auth:session:* β possible cache warming after TTL expiry batch",
"2026-04-06T03:05:10Z INFO [cache-redis] Memory usage: 1.3GB/4.0GB (32%) β stable",
"2026-04-06T03:06:00Z INFO [cache-redis] Cache hit ratio recovering: 84%",
"2026-04-06T03:08:00Z INFO [cache-redis] Cache hit ratio: 88% β back to normal",
],
"api-gateway": [
"2026-04-06T03:00:01Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 45ms)",
"2026-04-06T03:00:02Z INFO [api-gateway] Route: POST /api/v2/pay -> payment-service (200, 88ms)",
"2026-04-06T03:05:20Z WARN [api-gateway] Route: POST /api/v2/pay -> payment-service (504, 5200ms)",
"2026-04-06T03:05:22Z WARN [api-gateway] Route: PUT /api/v2/user/profile -> user-service (504, 4800ms)",
"2026-04-06T03:05:25Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 30ms)",
"2026-04-06T03:05:30Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 42ms)",
"2026-04-06T03:06:00Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (504, timeout)",
],
"notification-service": [
"2026-04-06T03:00:00Z INFO [notification-service] Email batch #4430 sent successfully (10 emails)",
"2026-04-06T03:05:00Z INFO [notification-service] Email batch #4435 sent successfully (7 emails)",
"2026-04-06T03:08:00Z INFO [notification-service] Health check /healthz -> 200 OK",
],
},
metrics={
"payment-service": [
{"timestamp": "2026-04-06T02:50:00Z", "cpu_pct": 20, "mem_pct": 40, "latency_p50": 80, "latency_p99": 150, "error_rate": 0.001, "db_pool_active": 15, "db_pool_max": 50},
{"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 20, "mem_pct": 40, "latency_p50": 85, "latency_p99": 160, "error_rate": 0.002, "db_pool_active": 18, "db_pool_max": 50},
{"timestamp": "2026-04-06T03:05:00Z", "cpu_pct": 22, "mem_pct": 41, "latency_p50": 3200, "latency_p99": 8500, "error_rate": 0.35, "db_pool_active": 50, "db_pool_max": 50},
{"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 18, "mem_pct": 40, "latency_p50": 4500, "latency_p99": "timeout", "error_rate": 0.52, "db_pool_active": 50, "db_pool_max": 50},
],
"user-service": [
{"timestamp": "2026-04-06T02:50:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 28, "latency_p99": 75, "error_rate": 0.001, "write_error_rate": 0.001},
{"timestamp": "2026-04-06T03:05:00Z", "cpu_pct": 16, "mem_pct": 35, "latency_p50": 30, "latency_p99": 4100, "error_rate": 0.18, "write_error_rate": 0.60},
{"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 28, "latency_p99": "timeout", "error_rate": 0.25, "write_error_rate": 0.75},
],
"db-postgres": [
{"timestamp": "2026-04-06T02:50:00Z", "cpu_pct": 35, "mem_pct": 60, "connections": 45, "active_locks": 3, "lock_wait_ms_p99": 5, "write_iops": 1200, "read_iops": 3500, "deadlocks": 0},
{"timestamp": "2026-04-06T02:55:00Z", "cpu_pct": 55, "mem_pct": 62, "connections": 55, "active_locks": 8, "lock_wait_ms_p99": 15, "write_iops": 1200, "read_iops": 4200, "deadlocks": 0},
{"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 65, "mem_pct": 64, "connections": 70, "active_locks": 15, "lock_wait_ms_p99": 250, "write_iops": 800, "read_iops": 4000, "deadlocks": 0},
{"timestamp": "2026-04-06T03:05:00Z", "cpu_pct": 78, "mem_pct": 65, "connections": 95, "active_locks": 28, "lock_wait_ms_p99": 8500, "write_iops": 200, "read_iops": 3800, "deadlocks": 4},
{"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 80, "mem_pct": 66, "connections": 100, "active_locks": 32, "lock_wait_ms_p99": 12000, "write_iops": 50, "read_iops": 3600, "deadlocks": 12},
],
"auth-service": [
{"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 22, "mem_pct": 58, "latency_p50": 42, "latency_p99": 110, "error_rate": 0.001},
{"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 23, "mem_pct": 58, "latency_p50": 44, "latency_p99": 115, "error_rate": 0.001},
],
"cache-redis": [
{"timestamp": "2026-04-06T03:00:00Z", "mem_gb": 1.2, "mem_pct": 30, "hit_ratio": 0.82, "evictions_per_s": 8, "connections": 46},
{"timestamp": "2026-04-06T03:05:00Z", "mem_gb": 1.3, "mem_pct": 32, "hit_ratio": 0.78, "evictions_per_s": 12, "connections": 46},
{"timestamp": "2026-04-06T03:08:00Z", "mem_gb": 1.2, "mem_pct": 30, "hit_ratio": 0.88, "evictions_per_s": 2, "connections": 45},
],
"api-gateway": [
{"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 20, "mem_pct": 45, "latency_p50": 35, "latency_p99": 90, "error_rate": 0.002, "5xx_rate": 0.001},
{"timestamp": "2026-04-06T03:05:00Z", "cpu_pct": 22, "mem_pct": 46, "latency_p50": 45, "latency_p99": 5500, "error_rate": 0.18, "5xx_rate": 0.15},
{"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 23, "mem_pct": 46, "latency_p50": 50, "latency_p99": "timeout", "error_rate": 0.25, "5xx_rate": 0.22},
],
},
traces={
"payment-service": [
"Trace: POST /api/v2/pay (txn=pay_6691, total=8500ms) β TIMEOUT",
" ββ payment-service.validateRequest() 12ms",
" ββ payment-service.checkBalance() 45ms (SELECT -> db-postgres, fast)",
" ββ payment-service.insertTransaction() 8400ms (INSERT -> db-postgres, BLOCKED ON LOCK)",
" ββ payment-service.sendConfirmation() never reached (timeout)",
],
"user-service": [
"Trace: PUT /api/v2/user/profile (uid=user_8832, total=4800ms) β TIMEOUT",
" ββ user-service.validateInput() 5ms",
" ββ user-service.updateProfile() 4780ms (UPDATE -> db-postgres, BLOCKED ON LOCK)",
" ββ user-service.invalidateCache() never reached (timeout)",
],
},
deploy_history={
"payment-service": [
"v3.8.1 deployed 2026-04-03T14:00:00Z status=stable (running 3 days)",
],
"user-service": [
"v4.2.1 deployed 2026-04-05T16:00:00Z status=stable (running 11 hours)",
],
"db-postgres": [
"v15.4 deployed 2026-03-15T08:00:00Z status=stable (running 22 days)",
],
},
runbooks={
"payment-service": (
"## payment-service Runbook\n"
"- Transaction timeouts: Check db-postgres connection pool and lock status.\n"
" If db connection pool is saturated but CPU/memory are normal, likely a DB-side issue.\n"
"- High latency: Check downstream service health (db-postgres).\n"
"- Crash on startup: Check recent deploys and rollback if needed."
),
"db-postgres": (
"## db-postgres Runbook\n"
"- Deadlocks: Identify the blocking transaction using pg_stat_activity.\n"
" Kill long-running queries or restart postgres to clear all locks.\n"
"- Connection exhaustion: Check for connection leaks. Consider increasing max_connections\n"
" or terminating idle connections.\n"
"- High CPU: Check for expensive queries in pg_stat_statements. Consider adding indexes.\n"
"- Replication lag: Check network connectivity to replicas and WAL sender status."
),
"cache-redis": (
"## cache-redis Runbook\n"
"- Elevated miss ratio: Often caused by TTL expiry batches. Wait 5-10 minutes for cache\n"
" to warm back up. If miss ratio doesn't recover, check maxmemory and eviction policy.\n"
"- Memory pressure: Check for memory leaks. Scale up replicas or increase maxmemory.\n"
"- Connection issues: Check network connectivity and client pool configuration."
),
},
configs={
"db-postgres": {
"current": "max_connections=100\nshared_buffers=4GB\nwork_mem=256MB\nlock_timeout=30s\ndeadlock_timeout=1s",
"previous": "max_connections=100\nshared_buffers=4GB\nwork_mem=256MB\nlock_timeout=30s\ndeadlock_timeout=1s",
"diff": "No changes β config has not been modified recently.",
},
"payment-service": {
"current": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nCIRCUIT_BREAKER_THRESHOLD=10",
"previous": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nCIRCUIT_BREAKER_THRESHOLD=10",
"diff": "No changes β config has not been modified recently.",
},
},
dependencies={
"api-gateway": ["auth-service", "user-service", "payment-service"],
"auth-service": ["cache-redis"],
"user-service": ["db-postgres"],
"payment-service": ["db-postgres"],
"db-postgres": [],
"cache-redis": [],
"notification-service": ["auth-service"],
},
root_cause_services=["db-postgres"],
root_cause_categories=[RootCauseCategory.DB_DEADLOCK],
required_fixes=[
RequiredFix(action="restart_service", service="db-postgres"),
],
diagnosis_keywords=["db-postgres", "deadlock", "lock", "analytics-cron", "long-running", "transaction", "blocking"],
weights={
"correct_service": 0.25,
"correct_category": 0.20,
"correct_fix": 0.25,
"secondary_fix": 0.00,
"diagnosis_text": 0.10,
"investigation": 0.10,
"wrong_penalty": 0.05,
},
)
|