Spaces:
Sleeping
Sleeping
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # All rights reserved. | |
| """ | |
| Three fixed incident scenarios for the PostMortem env. | |
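| Each scenario is a self-contained dict with: | |
| - task_id: unique identifier for the scenario (e.g. "easy_oom") | |
| - difficulty: "easy", "medium" or "hard" | |
| - description: the brief an on-call engineer would receive | |
| - services: list of service names the agent may query | |
| - logs: {service: [log lines]} | |
| - metrics: {service: {metric: value}} | |
| - traces: {trace_id: [span dicts]} | |
| - gold: the oracle answers for grading (scope plus hypothesis, mitigation and writeup keywords) | |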
| """ | |
| from typing import Dict, List, Any | |
# Fixed catalogue of incident scenarios, ordered easy -> medium -> hard.
# Every entry carries the same keys: task_id, difficulty, description,
# services, logs, metrics, traces and gold (the grading oracle).
SCENARIOS: List[Dict[str, Any]] = [
    # ---------- EASY ----------
    # Single failing service: `api` runs out of Java heap and is OOMKilled,
    # while `db` and `auth` stay healthy.
    {
        "task_id": "easy_oom",
        "difficulty": "easy",
        "description": (
            "ALERT: Service `api` is returning HTTP 500 on ~80% of requests. "
            "On-call has paged you. Investigate, scope the blast radius, "
            "identify the root cause, mitigate, and write a status-page update."
        ),
        "services": ["api", "db", "auth"],
        "logs": {
            "api": [
                "2026-04-08T19:55:01Z INFO request id=req-001 path=/v1/users",
                "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
                "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
                "2026-04-08T19:55:03Z WARN GC overhead limit exceeded",
                "2026-04-08T19:55:03Z ERROR pod api-7f8b restarting OOMKilled",
            ],
            "db": [
                "2026-04-08T19:55:01Z INFO connection accepted from api",
                "2026-04-08T19:55:02Z INFO query completed in 12ms",
                "2026-04-08T19:55:05Z INFO idle connections=45",
            ],
            "auth": [
                "2026-04-08T19:55:01Z INFO token issued",
                "2026-04-08T19:55:03Z INFO token validated",
            ],
        },
        "metrics": {
            "api": {"cpu_pct": 98, "mem_pct": 99, "p99_latency_ms": 12000, "error_rate": 0.82},
            "db": {"cpu_pct": 22, "mem_pct": 34, "p99_latency_ms": 18, "error_rate": 0.0},
            "auth": {"cpu_pct": 8, "mem_pct": 14, "p99_latency_ms": 6, "error_rate": 0.0},
        },
        "traces": {
            "trace_abc": [
                {"service": "api", "op": "GET /v1/users", "duration_ms": 11800, "error": True},
                {"service": "db", "op": "SELECT users", "duration_ms": 14, "error": False},
            ],
        },
        "gold": {
            "scope": ["api"],
            "hypothesis_keywords": ["oom", "memory", "heap"],
            "mitigation_keywords": ["restart", "rollback", "scale", "increase heap", "increase memory"],
            "writeup_keywords": ["api", "memory", "restart", "resolved"],
        },
    },
    # ---------- MEDIUM ----------
    # Latency cascade: `inventory` exhausts its connection pool, which slows
    # `payments` and in turn `checkout`; all three are in the gold scope.
    {
        "task_id": "medium_cascade",
        "difficulty": "medium",
        "description": (
            "ALERT: Checkout latency p99 has crossed 5 seconds in the last 10 minutes. "
            "Three services are involved: `checkout`, `payments`, `inventory`. "
            "Correlate across logs, metrics and traces to find the root service, "
            "then scope / hypothesise / mitigate / communicate."
        ),
        "services": ["checkout", "payments", "inventory"],
        "logs": {
            "checkout": [
                "2026-04-08T20:10:01Z INFO POST /checkout trace_id=trace_xyz",
                "2026-04-08T20:10:08Z WARN downstream payments slow (7s)",
                "2026-04-08T20:10:08Z INFO returning 200 to client",
            ],
            "payments": [
                "2026-04-08T20:10:01Z INFO charge_card trace_id=trace_xyz",
                "2026-04-08T20:10:06Z WARN inventory check blocking",
                "2026-04-08T20:10:07Z INFO charge_card success",
            ],
            "inventory": [
                "2026-04-08T20:10:01Z INFO reserve_items trace_id=trace_xyz",
                "2026-04-08T20:10:05Z ERROR connection pool exhausted (max=20)",
                "2026-04-08T20:10:06Z ERROR connection pool exhausted (max=20)",
                "2026-04-08T20:10:06Z WARN request queued for 5400ms",
            ],
        },
        "metrics": {
            "checkout": {"cpu_pct": 40, "mem_pct": 55, "p99_latency_ms": 7800, "error_rate": 0.01},
            "payments": {"cpu_pct": 35, "mem_pct": 42, "p99_latency_ms": 6900, "error_rate": 0.0},
            # `inventory` is the only service that reports conn_pool_waiting.
            "inventory": {
                "cpu_pct": 12,
                "mem_pct": 28,
                "p99_latency_ms": 5600,
                "error_rate": 0.0,
                "conn_pool_waiting": 44,
            },
        },
        "traces": {
            "trace_xyz": [
                {"service": "checkout", "op": "POST /checkout", "duration_ms": 7800, "error": False},
                {"service": "payments", "op": "charge_card", "duration_ms": 6900, "error": False},
                {"service": "inventory", "op": "reserve_items", "duration_ms": 5500, "error": False},
            ],
        },
        "gold": {
            "scope": ["checkout", "payments", "inventory"],
            "hypothesis_keywords": ["inventory", "connection pool", "pool exhaust", "conn"],
            "mitigation_keywords": ["increase pool", "pool size", "restart inventory", "scale inventory"],
            "writeup_keywords": ["inventory", "connection", "pool", "latency"],
        },
    },
    # ---------- HARD ----------
    # Red herring: the recent `api` deploy is healthy. `web` and `worker`
    # fail to resolve `api.internal`, so the gold scope excludes `api`.
    {
        "task_id": "hard_dns",
        "difficulty": "hard",
        "description": (
            "ALERT: Intermittent 503s across multiple services (`web`, `api`, `worker`). "
            "A deploy of `api` went out 10 minutes ago and is the obvious suspect. "
            "Correlate carefully — the real root cause may be upstream. "
            "Scope, hypothesise, mitigate, and write a customer-facing status update."
        ),
        "services": ["web", "api", "worker"],
        "logs": {
            "web": [
                "2026-04-08T21:00:01Z INFO GET /home 200",
                "2026-04-08T21:00:07Z ERROR getaddrinfo ENOTFOUND api.internal",
                "2026-04-08T21:00:08Z ERROR getaddrinfo ENOTFOUND api.internal",
                "2026-04-08T21:00:09Z ERROR upstream connect timeout",
            ],
            "api": [
                "2026-04-08T20:50:00Z INFO deploy v2.31.0 started",
                "2026-04-08T20:51:10Z INFO deploy v2.31.0 complete, healthy",
                "2026-04-08T21:00:07Z INFO process healthy, listening on :8080",
                "2026-04-08T21:00:07Z INFO request handled 200",
            ],
            "worker": [
                "2026-04-08T21:00:01Z INFO picked up job id=42",
                "2026-04-08T21:00:06Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
                "2026-04-08T21:00:08Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
            ],
        },
        "metrics": {
            "web": {"cpu_pct": 20, "mem_pct": 35, "p99_latency_ms": 9000, "error_rate": 0.45},
            "api": {"cpu_pct": 14, "mem_pct": 31, "p99_latency_ms": 42, "error_rate": 0.0},
            "worker": {"cpu_pct": 18, "mem_pct": 27, "p99_latency_ms": 8800, "error_rate": 0.55},
        },
        "traces": {
            "trace_qqq": [
                {
                    "service": "web",
                    "op": "GET /home",
                    "duration_ms": 9000,
                    "error": True,
                    "note": "dns resolution failed",
                },
            ],
        },
        "gold": {
            "scope": ["web", "worker"],
            "hypothesis_keywords": ["dns", "resolver", "10.0.0.2", "enotfound", "no such host"],
            "mitigation_keywords": [
                "restart dns",
                "restart resolver",
                "failover dns",
                "point to backup resolver",
                "flush dns",
            ],
            "writeup_keywords": ["dns", "resolution", "intermittent", "503", "restored"],
        },
    },
]
def get_scenario(index: int) -> Dict[str, Any]:
    """Return the scenario selected by ``index``, wrapping around the catalogue.

    The index is reduced modulo ``len(SCENARIOS)``, so values past the end
    (and negative values) still select an entry instead of raising
    ``IndexError``.

    Args:
        index: Integer position; any value is accepted thanks to the wrap.

    Returns:
        The scenario dict stored in ``SCENARIOS``. This is the shared
        module-level object, not a copy, so mutating it changes what later
        calls return.
    """
    return SCENARIOS[index % len(SCENARIOS)]
def num_scenarios() -> int:
    """Return how many scenarios are defined in ``SCENARIOS``."""
    return len(SCENARIOS)