# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""
Three fixed incident scenarios for the PostMortem env.

Each scenario is a self-contained dict with:
  - task_id:     short unique identifier of the scenario
  - difficulty:  "easy", "medium" or "hard"
  - description: the brief an on-call engineer would receive
  - services:    list of service names the agent may query
  - logs:        {service: [log lines]}
  - metrics:     {service: {metric: value}}
  - traces:      {trace_id: [span dicts]}
  - gold:        the oracle answers for grading
"""

import copy
from typing import Dict, List, Any


SCENARIOS: List[Dict[str, Any]] = [
    # ---------- EASY ----------
    {
        "task_id": "easy_oom",
        "difficulty": "easy",
        "description": (
            "ALERT: Service `api` is returning HTTP 500 on ~80% of requests. "
            "On-call has paged you. Investigate, scope the blast radius, "
            "identify the root cause, mitigate, and write a status-page update."
        ),
        "services": ["api", "db", "auth"],
        "logs": {
            "api": [
                "2026-04-08T19:55:01Z INFO  request id=req-001 path=/v1/users",
                "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
                "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
                "2026-04-08T19:55:03Z WARN  GC overhead limit exceeded",
                "2026-04-08T19:55:03Z ERROR pod api-7f8b restarting OOMKilled",
            ],
            "db": [
                "2026-04-08T19:55:01Z INFO  connection accepted from api",
                "2026-04-08T19:55:02Z INFO  query completed in 12ms",
                "2026-04-08T19:55:05Z INFO  idle connections=45",
            ],
            "auth": [
                "2026-04-08T19:55:01Z INFO  token issued",
                "2026-04-08T19:55:03Z INFO  token validated",
            ],
        },
        "metrics": {
            "api": {
                "cpu_pct": 98,
                "mem_pct": 99,
                "p99_latency_ms": 12000,
                "error_rate": 0.82,
            },
            "db": {
                "cpu_pct": 22,
                "mem_pct": 34,
                "p99_latency_ms": 18,
                "error_rate": 0.0,
            },
            "auth": {
                "cpu_pct": 8,
                "mem_pct": 14,
                "p99_latency_ms": 6,
                "error_rate": 0.0,
            },
        },
        "traces": {
            "trace_abc": [
                {
                    "service": "api",
                    "op": "GET /v1/users",
                    "duration_ms": 11800,
                    "error": True,
                },
                {
                    "service": "db",
                    "op": "SELECT users",
                    "duration_ms": 14,
                    "error": False,
                },
            ],
        },
        "gold": {
            "scope": ["api"],
            "hypothesis_keywords": ["oom", "memory", "heap"],
            "mitigation_keywords": [
                "restart",
                "rollback",
                "scale",
                "increase heap",
                "increase memory",
            ],
            "writeup_keywords": ["api", "memory", "restart", "resolved"],
        },
    },
    # ---------- MEDIUM ----------
    {
        "task_id": "medium_cascade",
        "difficulty": "medium",
        "description": (
            "ALERT: Checkout latency p99 has crossed 5 seconds in the last 10 minutes. "
            "Three services are involved: `checkout`, `payments`, `inventory`. "
            "Correlate across logs, metrics and traces to find the root service, "
            "then scope / hypothesise / mitigate / communicate."
        ),
        "services": ["checkout", "payments", "inventory"],
        "logs": {
            "checkout": [
                "2026-04-08T20:10:01Z INFO  POST /checkout trace_id=trace_xyz",
                "2026-04-08T20:10:08Z WARN  downstream payments slow (7s)",
                "2026-04-08T20:10:08Z INFO  returning 200 to client",
            ],
            "payments": [
                "2026-04-08T20:10:01Z INFO  charge_card trace_id=trace_xyz",
                "2026-04-08T20:10:06Z WARN  inventory check blocking",
                "2026-04-08T20:10:07Z INFO  charge_card success",
            ],
            "inventory": [
                "2026-04-08T20:10:01Z INFO  reserve_items trace_id=trace_xyz",
                "2026-04-08T20:10:05Z ERROR connection pool exhausted (max=20)",
                "2026-04-08T20:10:06Z ERROR connection pool exhausted (max=20)",
                "2026-04-08T20:10:06Z WARN  request queued for 5400ms",
            ],
        },
        "metrics": {
            "checkout": {
                "cpu_pct": 40,
                "mem_pct": 55,
                "p99_latency_ms": 7800,
                "error_rate": 0.01,
            },
            "payments": {
                "cpu_pct": 35,
                "mem_pct": 42,
                "p99_latency_ms": 6900,
                "error_rate": 0.0,
            },
            "inventory": {
                "cpu_pct": 12,
                "mem_pct": 28,
                "p99_latency_ms": 5600,
                "error_rate": 0.0,
                "conn_pool_waiting": 44,
            },
        },
        "traces": {
            "trace_xyz": [
                {
                    "service": "checkout",
                    "op": "POST /checkout",
                    "duration_ms": 7800,
                    "error": False,
                },
                {
                    "service": "payments",
                    "op": "charge_card",
                    "duration_ms": 6900,
                    "error": False,
                },
                {
                    "service": "inventory",
                    "op": "reserve_items",
                    "duration_ms": 5500,
                    "error": False,
                },
            ],
        },
        "gold": {
            "scope": ["checkout", "payments", "inventory"],
            "hypothesis_keywords": [
                "inventory",
                "connection pool",
                "pool exhaust",
                "conn",
            ],
            "mitigation_keywords": [
                "increase pool",
                "pool size",
                "restart inventory",
                "scale inventory",
            ],
            "writeup_keywords": ["inventory", "connection", "pool", "latency"],
        },
    },
    # ---------- HARD ----------
    {
        "task_id": "hard_dns",
        "difficulty": "hard",
        "description": (
            "ALERT: Intermittent 503s across multiple services (`web`, `api`, `worker`). "
            "A deploy of `api` went out 10 minutes ago and is the obvious suspect. "
            "Correlate carefully — the real root cause may be upstream. "
            "Scope, hypothesise, mitigate, and write a customer-facing status update."
        ),
        "services": ["web", "api", "worker"],
        "logs": {
            "web": [
                "2026-04-08T21:00:01Z INFO  GET /home 200",
                "2026-04-08T21:00:07Z ERROR getaddrinfo ENOTFOUND api.internal",
                "2026-04-08T21:00:08Z ERROR getaddrinfo ENOTFOUND api.internal",
                "2026-04-08T21:00:09Z ERROR upstream connect timeout",
            ],
            "api": [
                "2026-04-08T20:50:00Z INFO  deploy v2.31.0 started",
                "2026-04-08T20:51:10Z INFO  deploy v2.31.0 complete, healthy",
                "2026-04-08T21:00:07Z INFO  process healthy, listening on :8080",
                "2026-04-08T21:00:07Z INFO  request handled 200",
            ],
            "worker": [
                "2026-04-08T21:00:01Z INFO  picked up job id=42",
                "2026-04-08T21:00:06Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
                "2026-04-08T21:00:08Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
            ],
        },
        "metrics": {
            "web": {
                "cpu_pct": 20,
                "mem_pct": 35,
                "p99_latency_ms": 9000,
                "error_rate": 0.45,
            },
            "api": {
                "cpu_pct": 14,
                "mem_pct": 31,
                "p99_latency_ms": 42,
                "error_rate": 0.0,
            },
            "worker": {
                "cpu_pct": 18,
                "mem_pct": 27,
                "p99_latency_ms": 8800,
                "error_rate": 0.55,
            },
        },
        "traces": {
            "trace_qqq": [
                {
                    "service": "web",
                    "op": "GET /home",
                    "duration_ms": 9000,
                    "error": True,
                    "note": "dns resolution failed",
                },
            ],
        },
        "gold": {
            "scope": ["web", "worker"],
            "hypothesis_keywords": [
                "dns",
                "resolver",
                "10.0.0.2",
                "enotfound",
                "no such host",
            ],
            "mitigation_keywords": [
                "restart dns",
                "restart resolver",
                "failover dns",
                "point to backup resolver",
                "flush dns",
            ],
            "writeup_keywords": [
                "dns",
                "resolution",
                "intermittent",
                "503",
                "restored",
            ],
        },
    },
]


def get_scenario(index: int) -> Dict[str, Any]:
    """Return an independent copy of the scenario selected by ``index``.

    Args:
        index: Any integer. It is reduced modulo the number of scenarios,
            so values past the end wrap around and negative values count
            from the end.

    Returns:
        A deep copy of the selected scenario dict. Copying keeps the
        module-level ``SCENARIOS`` fixture pristine even if a caller
        mutates the nested lists/dicts of the result.
    """
    return copy.deepcopy(SCENARIOS[index % len(SCENARIOS)])


def num_scenarios() -> int:
    """Return how many scenarios are defined in ``SCENARIOS``."""
    return len(SCENARIOS)