Spaces:

yashppawar
/

postmortem_env

Sleeping

App Files Files Community

postmortem_env / server /scenarios.py

yashppawar

Upload folder using huggingface_hub

b29893e verified 8 days ago

raw

history blame contribute delete

7.88 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.

	"""
	Three fixed incident scenarios for the PostMortem env.

	Each scenario is a self-contained dict with:
	- task_id: unique identifier for the scenario
	- difficulty: "easy", "medium" or "hard"
	- description: the brief an on-call engineer would receive
	- services: list of service names the agent may query
	- logs: {service: [log lines]}
	- metrics: {service: {metric: value}}
	- traces: {trace_id: [span dicts]}
	- gold: the oracle answers for grading
	"""

	from typing import Dict, List, Any

	# Each incident is declared as its own private constant and then collected,
	# in order of increasing difficulty, into the public SCENARIOS list below.

	# ---------- EASY ----------
	# Only `api` looks unhealthy here; `db` and `auth` report zero errors.
	_EASY_OOM: Dict[str, Any] = {
	    "task_id": "easy_oom",
	    "difficulty": "easy",
	    "description": (
	        "ALERT: Service `api` is returning HTTP 500 on ~80% of requests. "
	        "On-call has paged you. Investigate, scope the blast radius, "
	        "identify the root cause, mitigate, and write a status-page update."
	    ),
	    "services": ["api", "db", "auth"],
	    "logs": {
	        "api": [
	            "2026-04-08T19:55:01Z INFO request id=req-001 path=/v1/users",
	            "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
	            "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
	            "2026-04-08T19:55:03Z WARN GC overhead limit exceeded",
	            "2026-04-08T19:55:03Z ERROR pod api-7f8b restarting OOMKilled",
	        ],
	        "db": [
	            "2026-04-08T19:55:01Z INFO connection accepted from api",
	            "2026-04-08T19:55:02Z INFO query completed in 12ms",
	            "2026-04-08T19:55:05Z INFO idle connections=45",
	        ],
	        "auth": [
	            "2026-04-08T19:55:01Z INFO token issued",
	            "2026-04-08T19:55:03Z INFO token validated",
	        ],
	    },
	    "metrics": {
	        "api": {
	            "cpu_pct": 98,
	            "mem_pct": 99,
	            "p99_latency_ms": 12000,
	            "error_rate": 0.82,
	        },
	        "db": {
	            "cpu_pct": 22,
	            "mem_pct": 34,
	            "p99_latency_ms": 18,
	            "error_rate": 0.0,
	        },
	        "auth": {
	            "cpu_pct": 8,
	            "mem_pct": 14,
	            "p99_latency_ms": 6,
	            "error_rate": 0.0,
	        },
	    },
	    "traces": {
	        "trace_abc": [
	            {
	                "service": "api",
	                "op": "GET /v1/users",
	                "duration_ms": 11800,
	                "error": True,
	            },
	            {
	                "service": "db",
	                "op": "SELECT users",
	                "duration_ms": 14,
	                "error": False,
	            },
	        ],
	    },
	    "gold": {
	        "scope": ["api"],
	        "hypothesis_keywords": ["oom", "memory", "heap"],
	        "mitigation_keywords": [
	            "restart",
	            "rollback",
	            "scale",
	            "increase heap",
	            "increase memory",
	        ],
	        "writeup_keywords": ["api", "memory", "restart", "resolved"],
	    },
	}

	# ---------- MEDIUM ----------
	# All three services show elevated p99 latency; the gold hypothesis points
	# at the `inventory` connection pool.
	_MEDIUM_CASCADE: Dict[str, Any] = {
	    "task_id": "medium_cascade",
	    "difficulty": "medium",
	    "description": (
	        "ALERT: Checkout latency p99 has crossed 5 seconds in the last 10 minutes. "
	        "Three services are involved: `checkout`, `payments`, `inventory`. "
	        "Correlate across logs, metrics and traces to find the root service, "
	        "then scope / hypothesise / mitigate / communicate."
	    ),
	    "services": ["checkout", "payments", "inventory"],
	    "logs": {
	        "checkout": [
	            "2026-04-08T20:10:01Z INFO POST /checkout trace_id=trace_xyz",
	            "2026-04-08T20:10:08Z WARN downstream payments slow (7s)",
	            "2026-04-08T20:10:08Z INFO returning 200 to client",
	        ],
	        "payments": [
	            "2026-04-08T20:10:01Z INFO charge_card trace_id=trace_xyz",
	            "2026-04-08T20:10:06Z WARN inventory check blocking",
	            "2026-04-08T20:10:07Z INFO charge_card success",
	        ],
	        "inventory": [
	            "2026-04-08T20:10:01Z INFO reserve_items trace_id=trace_xyz",
	            "2026-04-08T20:10:05Z ERROR connection pool exhausted (max=20)",
	            "2026-04-08T20:10:06Z ERROR connection pool exhausted (max=20)",
	            "2026-04-08T20:10:06Z WARN request queued for 5400ms",
	        ],
	    },
	    "metrics": {
	        "checkout": {
	            "cpu_pct": 40,
	            "mem_pct": 55,
	            "p99_latency_ms": 7800,
	            "error_rate": 0.01,
	        },
	        "payments": {
	            "cpu_pct": 35,
	            "mem_pct": 42,
	            "p99_latency_ms": 6900,
	            "error_rate": 0.0,
	        },
	        "inventory": {
	            "cpu_pct": 12,
	            "mem_pct": 28,
	            "p99_latency_ms": 5600,
	            "error_rate": 0.0,
	            "conn_pool_waiting": 44,
	        },
	    },
	    "traces": {
	        "trace_xyz": [
	            {
	                "service": "checkout",
	                "op": "POST /checkout",
	                "duration_ms": 7800,
	                "error": False,
	            },
	            {
	                "service": "payments",
	                "op": "charge_card",
	                "duration_ms": 6900,
	                "error": False,
	            },
	            {
	                "service": "inventory",
	                "op": "reserve_items",
	                "duration_ms": 5500,
	                "error": False,
	            },
	        ],
	    },
	    "gold": {
	        "scope": ["checkout", "payments", "inventory"],
	        "hypothesis_keywords": [
	            "inventory",
	            "connection pool",
	            "pool exhaust",
	            "conn",
	        ],
	        "mitigation_keywords": [
	            "increase pool",
	            "pool size",
	            "restart inventory",
	            "scale inventory",
	        ],
	        "writeup_keywords": ["inventory", "connection", "pool", "latency"],
	    },
	}

	# ---------- HARD ----------
	# The recent `api` deploy is a decoy: `api` itself reports healthy, while
	# `web` and `worker` fail to resolve `api.internal`.
	_HARD_DNS: Dict[str, Any] = {
	    "task_id": "hard_dns",
	    "difficulty": "hard",
	    "description": (
	        "ALERT: Intermittent 503s across multiple services (`web`, `api`, `worker`). "
	        "A deploy of `api` went out 10 minutes ago and is the obvious suspect. "
	        "Correlate carefully — the real root cause may be upstream. "
	        "Scope, hypothesise, mitigate, and write a customer-facing status update."
	    ),
	    "services": ["web", "api", "worker"],
	    "logs": {
	        "web": [
	            "2026-04-08T21:00:01Z INFO GET /home 200",
	            "2026-04-08T21:00:07Z ERROR getaddrinfo ENOTFOUND api.internal",
	            "2026-04-08T21:00:08Z ERROR getaddrinfo ENOTFOUND api.internal",
	            "2026-04-08T21:00:09Z ERROR upstream connect timeout",
	        ],
	        "api": [
	            "2026-04-08T20:50:00Z INFO deploy v2.31.0 started",
	            "2026-04-08T20:51:10Z INFO deploy v2.31.0 complete, healthy",
	            "2026-04-08T21:00:07Z INFO process healthy, listening on :8080",
	            "2026-04-08T21:00:07Z INFO request handled 200",
	        ],
	        "worker": [
	            "2026-04-08T21:00:01Z INFO picked up job id=42",
	            "2026-04-08T21:00:06Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
	            "2026-04-08T21:00:08Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
	        ],
	    },
	    "metrics": {
	        "web": {
	            "cpu_pct": 20,
	            "mem_pct": 35,
	            "p99_latency_ms": 9000,
	            "error_rate": 0.45,
	        },
	        "api": {
	            "cpu_pct": 14,
	            "mem_pct": 31,
	            "p99_latency_ms": 42,
	            "error_rate": 0.0,
	        },
	        "worker": {
	            "cpu_pct": 18,
	            "mem_pct": 27,
	            "p99_latency_ms": 8800,
	            "error_rate": 0.55,
	        },
	    },
	    "traces": {
	        "trace_qqq": [
	            {
	                "service": "web",
	                "op": "GET /home",
	                "duration_ms": 9000,
	                "error": True,
	                "note": "dns resolution failed",
	            },
	        ],
	    },
	    "gold": {
	        "scope": ["web", "worker"],
	        "hypothesis_keywords": [
	            "dns",
	            "resolver",
	            "10.0.0.2",
	            "enotfound",
	            "no such host",
	        ],
	        "mitigation_keywords": [
	            "restart dns",
	            "restart resolver",
	            "failover dns",
	            "point to backup resolver",
	            "flush dns",
	        ],
	        "writeup_keywords": ["dns", "resolution", "intermittent", "503", "restored"],
	    },
	}

	# Public catalogue of scenarios, ordered easy -> medium -> hard.
	SCENARIOS: List[Dict[str, Any]] = [
	    _EASY_OOM,
	    _MEDIUM_CASCADE,
	    _HARD_DNS,
	]


	def get_scenario(index: int) -> Dict[str, Any]:
	    """Return the scenario selected by ``index``, wrapping around the catalogue.

	    The index is reduced modulo the number of scenarios, so values past the
	    end (and negative values) still map onto a valid entry. The returned
	    dict is the entry stored in ``SCENARIOS`` itself, not a copy.
	    """
	    catalogue_size = len(SCENARIOS)
	    wrapped_index = index % catalogue_size
	    return SCENARIOS[wrapped_index]


	def num_scenarios() -> int:
	    """Return how many scenarios are currently held in ``SCENARIOS``."""
	    scenario_count = len(SCENARIOS)
	    return scenario_count