postmortem_env / server /scenarios.py
yashppawar's picture
Upload folder using huggingface_hub
b29893e verified
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
"""
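Three fixed incident scenarios for the PostMortem env.
Each scenario is a self-contained dict with:
- task_id: unique identifier of the scenario (e.g. `easy_oom`)
- difficulty: one of `easy`, `medium` or `hard`
- description: the brief an on-call engineer would receive
- services: list of service names the agent may query
- logs: {service: [log lines]}
- metrics: {service: {metric: value}}
- traces: {trace_id: [span dicts]}
- gold: the oracle answers for grading (the affected-service scope plus
  keyword lists for the hypothesis, mitigation and write-up)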
"""
from typing import Dict, List, Any
# ---------- EASY ----------
# Single failing service: `api` logs OutOfMemoryError and gets OOMKilled,
# while `db` and `auth` stay healthy.
_EASY_OOM: Dict[str, Any] = {
    "task_id": "easy_oom",
    "difficulty": "easy",
    "description": (
        "ALERT: Service `api` is returning HTTP 500 on ~80% of requests. "
        "On-call has paged you. Investigate, scope the blast radius, "
        "identify the root cause, mitigate, and write a status-page update."
    ),
    "services": ["api", "db", "auth"],
    "logs": {
        "api": [
            "2026-04-08T19:55:01Z INFO request id=req-001 path=/v1/users",
            "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
            "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
            "2026-04-08T19:55:03Z WARN GC overhead limit exceeded",
            "2026-04-08T19:55:03Z ERROR pod api-7f8b restarting OOMKilled",
        ],
        "db": [
            "2026-04-08T19:55:01Z INFO connection accepted from api",
            "2026-04-08T19:55:02Z INFO query completed in 12ms",
            "2026-04-08T19:55:05Z INFO idle connections=45",
        ],
        "auth": [
            "2026-04-08T19:55:01Z INFO token issued",
            "2026-04-08T19:55:03Z INFO token validated",
        ],
    },
    "metrics": {
        "api": {
            "cpu_pct": 98,
            "mem_pct": 99,
            "p99_latency_ms": 12000,
            "error_rate": 0.82,
        },
        "db": {
            "cpu_pct": 22,
            "mem_pct": 34,
            "p99_latency_ms": 18,
            "error_rate": 0.0,
        },
        "auth": {
            "cpu_pct": 8,
            "mem_pct": 14,
            "p99_latency_ms": 6,
            "error_rate": 0.0,
        },
    },
    "traces": {
        "trace_abc": [
            {
                "service": "api",
                "op": "GET /v1/users",
                "duration_ms": 11800,
                "error": True,
            },
            {
                "service": "db",
                "op": "SELECT users",
                "duration_ms": 14,
                "error": False,
            },
        ],
    },
    "gold": {
        "scope": ["api"],
        "hypothesis_keywords": ["oom", "memory", "heap"],
        "mitigation_keywords": [
            "restart",
            "rollback",
            "scale",
            "increase heap",
            "increase memory",
        ],
        "writeup_keywords": ["api", "memory", "restart", "resolved"],
    },
}

# ---------- MEDIUM ----------
# Latency cascade: `inventory` logs connection-pool exhaustion, and the
# delay shows up in `payments` and `checkout` as well.
_MEDIUM_CASCADE: Dict[str, Any] = {
    "task_id": "medium_cascade",
    "difficulty": "medium",
    "description": (
        "ALERT: Checkout latency p99 has crossed 5 seconds in the last 10 minutes. "
        "Three services are involved: `checkout`, `payments`, `inventory`. "
        "Correlate across logs, metrics and traces to find the root service, "
        "then scope / hypothesise / mitigate / communicate."
    ),
    "services": ["checkout", "payments", "inventory"],
    "logs": {
        "checkout": [
            "2026-04-08T20:10:01Z INFO POST /checkout trace_id=trace_xyz",
            "2026-04-08T20:10:08Z WARN downstream payments slow (7s)",
            "2026-04-08T20:10:08Z INFO returning 200 to client",
        ],
        "payments": [
            "2026-04-08T20:10:01Z INFO charge_card trace_id=trace_xyz",
            "2026-04-08T20:10:06Z WARN inventory check blocking",
            "2026-04-08T20:10:07Z INFO charge_card success",
        ],
        "inventory": [
            "2026-04-08T20:10:01Z INFO reserve_items trace_id=trace_xyz",
            "2026-04-08T20:10:05Z ERROR connection pool exhausted (max=20)",
            "2026-04-08T20:10:06Z ERROR connection pool exhausted (max=20)",
            "2026-04-08T20:10:06Z WARN request queued for 5400ms",
        ],
    },
    "metrics": {
        "checkout": {
            "cpu_pct": 40,
            "mem_pct": 55,
            "p99_latency_ms": 7800,
            "error_rate": 0.01,
        },
        "payments": {
            "cpu_pct": 35,
            "mem_pct": 42,
            "p99_latency_ms": 6900,
            "error_rate": 0.0,
        },
        "inventory": {
            "cpu_pct": 12,
            "mem_pct": 28,
            "p99_latency_ms": 5600,
            "error_rate": 0.0,
            "conn_pool_waiting": 44,
        },
    },
    "traces": {
        "trace_xyz": [
            {
                "service": "checkout",
                "op": "POST /checkout",
                "duration_ms": 7800,
                "error": False,
            },
            {
                "service": "payments",
                "op": "charge_card",
                "duration_ms": 6900,
                "error": False,
            },
            {
                "service": "inventory",
                "op": "reserve_items",
                "duration_ms": 5500,
                "error": False,
            },
        ],
    },
    "gold": {
        "scope": ["checkout", "payments", "inventory"],
        "hypothesis_keywords": [
            "inventory",
            "connection pool",
            "pool exhaust",
            "conn",
        ],
        "mitigation_keywords": [
            "increase pool",
            "pool size",
            "restart inventory",
            "scale inventory",
        ],
        "writeup_keywords": ["inventory", "connection", "pool", "latency"],
    },
}

# ---------- HARD ----------
# Red herring: the brief points at a recent `api` deploy, but `api` logs and
# metrics are healthy while `web` and `worker` log DNS lookup failures.
_HARD_DNS: Dict[str, Any] = {
    "task_id": "hard_dns",
    "difficulty": "hard",
    "description": (
        "ALERT: Intermittent 503s across multiple services (`web`, `api`, `worker`). "
        "A deploy of `api` went out 10 minutes ago and is the obvious suspect. "
        "Correlate carefully — the real root cause may be upstream. "
        "Scope, hypothesise, mitigate, and write a customer-facing status update."
    ),
    "services": ["web", "api", "worker"],
    "logs": {
        "web": [
            "2026-04-08T21:00:01Z INFO GET /home 200",
            "2026-04-08T21:00:07Z ERROR getaddrinfo ENOTFOUND api.internal",
            "2026-04-08T21:00:08Z ERROR getaddrinfo ENOTFOUND api.internal",
            "2026-04-08T21:00:09Z ERROR upstream connect timeout",
        ],
        "api": [
            "2026-04-08T20:50:00Z INFO deploy v2.31.0 started",
            "2026-04-08T20:51:10Z INFO deploy v2.31.0 complete, healthy",
            "2026-04-08T21:00:07Z INFO process healthy, listening on :8080",
            "2026-04-08T21:00:07Z INFO request handled 200",
        ],
        "worker": [
            "2026-04-08T21:00:01Z INFO picked up job id=42",
            "2026-04-08T21:00:06Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
            "2026-04-08T21:00:08Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
        ],
    },
    "metrics": {
        "web": {
            "cpu_pct": 20,
            "mem_pct": 35,
            "p99_latency_ms": 9000,
            "error_rate": 0.45,
        },
        "api": {
            "cpu_pct": 14,
            "mem_pct": 31,
            "p99_latency_ms": 42,
            "error_rate": 0.0,
        },
        "worker": {
            "cpu_pct": 18,
            "mem_pct": 27,
            "p99_latency_ms": 8800,
            "error_rate": 0.55,
        },
    },
    "traces": {
        "trace_qqq": [
            {
                "service": "web",
                "op": "GET /home",
                "duration_ms": 9000,
                "error": True,
                "note": "dns resolution failed",
            },
        ],
    },
    "gold": {
        "scope": ["web", "worker"],
        "hypothesis_keywords": [
            "dns",
            "resolver",
            "10.0.0.2",
            "enotfound",
            "no such host",
        ],
        "mitigation_keywords": [
            "restart dns",
            "restart resolver",
            "failover dns",
            "point to backup resolver",
            "flush dns",
        ],
        "writeup_keywords": ["dns", "resolution", "intermittent", "503", "restored"],
    },
}

# Ordered easy -> medium -> hard; the list position is the scenario index.
SCENARIOS: List[Dict[str, Any]] = [
    _EASY_OOM,
    _MEDIUM_CASCADE,
    _HARD_DNS,
]
def get_scenario(index: int) -> Dict[str, Any]:
    """Return the scenario selected by ``index``, wrapping around the list.

    The index is reduced modulo the number of scenarios, so values past the
    end cycle back to the start and negative values count from the end.

    Args:
        index: Position of the wanted scenario; any integer is accepted.

    Returns:
        The scenario dict stored in ``SCENARIOS`` (the same object, not a
        copy).
    """
    total = len(SCENARIOS)
    slot = index % total
    return SCENARIOS[slot]
def num_scenarios() -> int:
    """Return how many scenarios are defined in ``SCENARIOS``."""
    count = len(SCENARIOS)
    return count