# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""
Three fixed incident scenarios for the PostMortem env.

Each scenario is a self-contained dict with:
  - task_id:        short unique identifier for the scenario
  - difficulty:     "easy", "medium" or "hard"
  - description:    the brief an on-call engineer would receive
  - services:       list of service names the agent may query
  - logs:           {service: [log lines]}
  - metrics:        {service: {metric: value}}
  - traces:         {trace_id: [span dicts]}
  - gold:           the oracle answers for grading
"""

from typing import Dict, List, Any

# Scenario catalogue, ordered easy -> medium -> hard. get_scenario() indexes
# into this list modulo its length, so the order here is the order of use.
# NOTE(review): the "gold" keyword lists are all lowercase while some log
# lines are upper-case (e.g. ENOTFOUND); presumably the grader matches
# case-insensitively against the agent's free text — confirm in the grader.
SCENARIOS: List[Dict[str, Any]] = [
    # ---------- EASY ----------
    # Single-service failure: only `api` shows errors, and its own logs name
    # the cause (OutOfMemoryError / OOMKilled).
    {
        "task_id": "easy_oom",
        "difficulty": "easy",
        "description": (
            "ALERT: Service `api` is returning HTTP 500 on ~80% of requests. "
            "On-call has paged you. Investigate, scope the blast radius, "
            "identify the root cause, mitigate, and write a status-page update."
        ),
        "services": ["api", "db", "auth"],
        "logs": {
            "api": [
                "2026-04-08T19:55:01Z INFO  request id=req-001 path=/v1/users",
                "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
                "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
                "2026-04-08T19:55:03Z WARN  GC overhead limit exceeded",
                "2026-04-08T19:55:03Z ERROR pod api-7f8b restarting OOMKilled",
            ],
            # db and auth logs are INFO-only: they are the healthy bystanders.
            "db": [
                "2026-04-08T19:55:01Z INFO  connection accepted from api",
                "2026-04-08T19:55:02Z INFO  query completed in 12ms",
                "2026-04-08T19:55:05Z INFO  idle connections=45",
            ],
            "auth": [
                "2026-04-08T19:55:01Z INFO  token issued",
                "2026-04-08T19:55:03Z INFO  token validated",
            ],
        },
        # api is saturated (mem 99%, error_rate 0.82, matching the ~80% in the
        # alert); db and auth report error_rate 0.0.
        "metrics": {
            "api": {"cpu_pct": 98, "mem_pct": 99, "p99_latency_ms": 12000, "error_rate": 0.82},
            "db": {"cpu_pct": 22, "mem_pct": 34, "p99_latency_ms": 18, "error_rate": 0.0},
            "auth": {"cpu_pct": 8, "mem_pct": 14, "p99_latency_ms": 6, "error_rate": 0.0},
        },
        # The db span is fast (14 ms) and error-free; the time is spent in api.
        "traces": {
            "trace_abc": [
                {"service": "api", "op": "GET /v1/users", "duration_ms": 11800, "error": True},
                {"service": "db", "op": "SELECT users", "duration_ms": 14, "error": False},
            ],
        },
        # Oracle answers: blast radius is api alone.
        "gold": {
            "scope": ["api"],
            "hypothesis_keywords": ["oom", "memory", "heap"],
            "mitigation_keywords": ["restart", "rollback", "scale", "increase heap", "increase memory"],
            "writeup_keywords": ["api", "memory", "restart", "resolved"],
        },
    },

    # ---------- MEDIUM ----------
    # Latency cascade along checkout -> payments -> inventory. Nothing is
    # erroring at the edge; the root is inventory's exhausted connection pool.
    {
        "task_id": "medium_cascade",
        "difficulty": "medium",
        "description": (
            "ALERT: Checkout latency p99 has crossed 5 seconds in the last 10 minutes. "
            "Three services are involved: `checkout`, `payments`, `inventory`. "
            "Correlate across logs, metrics and traces to find the root service, "
            "then scope / hypothesise / mitigate / communicate."
        ),
        "services": ["checkout", "payments", "inventory"],
        # All three services log the same trace_id (trace_xyz), which is the
        # key for correlating them with the trace below.
        "logs": {
            "checkout": [
                "2026-04-08T20:10:01Z INFO  POST /checkout trace_id=trace_xyz",
                "2026-04-08T20:10:08Z WARN  downstream payments slow (7s)",
                "2026-04-08T20:10:08Z INFO  returning 200 to client",
            ],
            "payments": [
                "2026-04-08T20:10:01Z INFO  charge_card trace_id=trace_xyz",
                "2026-04-08T20:10:06Z WARN  inventory check blocking",
                "2026-04-08T20:10:07Z INFO  charge_card success",
            ],
            # Only inventory logs ERROR lines; the others merely WARN about
            # their downstream dependency.
            "inventory": [
                "2026-04-08T20:10:01Z INFO  reserve_items trace_id=trace_xyz",
                "2026-04-08T20:10:05Z ERROR connection pool exhausted (max=20)",
                "2026-04-08T20:10:06Z ERROR connection pool exhausted (max=20)",
                "2026-04-08T20:10:06Z WARN  request queued for 5400ms",
            ],
        },
        # CPU/memory are unremarkable everywhere; inventory is the only
        # service carrying the extra "conn_pool_waiting" metric.
        "metrics": {
            "checkout": {"cpu_pct": 40, "mem_pct": 55, "p99_latency_ms": 7800, "error_rate": 0.01},
            "payments": {"cpu_pct": 35, "mem_pct": 42, "p99_latency_ms": 6900, "error_rate": 0.0},
            "inventory": {"cpu_pct": 12, "mem_pct": 28, "p99_latency_ms": 5600, "error_rate": 0.0, "conn_pool_waiting": 44},
        },
        # Span durations shrink going down the chain (7800 -> 6900 -> 5500 ms),
        # so most of the time is attributable to the inventory span.
        "traces": {
            "trace_xyz": [
                {"service": "checkout", "op": "POST /checkout", "duration_ms": 7800, "error": False},
                {"service": "payments", "op": "charge_card", "duration_ms": 6900, "error": False},
                {"service": "inventory", "op": "reserve_items", "duration_ms": 5500, "error": False},
            ],
        },
        # Oracle answers: all three services are in scope (all are slow), but
        # the hypothesis/mitigation keywords point at inventory's pool.
        "gold": {
            "scope": ["checkout", "payments", "inventory"],
            "hypothesis_keywords": ["inventory", "connection pool", "pool exhaust", "conn"],
            "mitigation_keywords": ["increase pool", "pool size", "restart inventory", "scale inventory"],
            "writeup_keywords": ["inventory", "connection", "pool", "latency"],
        },
    },

    # ---------- HARD ----------
    # Red-herring scenario: the description points at a recent `api` deploy,
    # but the evidence (and the gold answers) point at DNS resolution.
    {
        "task_id": "hard_dns",
        "difficulty": "hard",
        "description": (
            "ALERT: Intermittent 503s across multiple services (`web`, `api`, `worker`). "
            "A deploy of `api` went out 10 minutes ago and is the obvious suspect. "
            "Correlate carefully — the real root cause may be upstream. "
            "Scope, hypothesise, mitigate, and write a customer-facing status update."
        ),
        "services": ["web", "api", "worker"],
        "logs": {
            # web and worker both fail to resolve api.internal.
            "web": [
                "2026-04-08T21:00:01Z INFO  GET /home 200",
                "2026-04-08T21:00:07Z ERROR getaddrinfo ENOTFOUND api.internal",
                "2026-04-08T21:00:08Z ERROR getaddrinfo ENOTFOUND api.internal",
                "2026-04-08T21:00:09Z ERROR upstream connect timeout",
            ],
            # api logs the deploy at 20:50-20:51 and is still healthy and
            # serving 200s at 21:00:07, the same second web fails to resolve it.
            "api": [
                "2026-04-08T20:50:00Z INFO  deploy v2.31.0 started",
                "2026-04-08T20:51:10Z INFO  deploy v2.31.0 complete, healthy",
                "2026-04-08T21:00:07Z INFO  process healthy, listening on :8080",
                "2026-04-08T21:00:07Z INFO  request handled 200",
            ],
            # The worker errors name the resolver being queried (10.0.0.2:53),
            # which is also one of the gold hypothesis keywords.
            "worker": [
                "2026-04-08T21:00:01Z INFO  picked up job id=42",
                "2026-04-08T21:00:06Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
                "2026-04-08T21:00:08Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
            ],
        },
        # api itself is fine (42 ms p99, error_rate 0.0); the callers are not.
        "metrics": {
            "web":    {"cpu_pct": 20, "mem_pct": 35, "p99_latency_ms": 9000, "error_rate": 0.45},
            "api":    {"cpu_pct": 14, "mem_pct": 31, "p99_latency_ms": 42, "error_rate": 0.0},
            "worker": {"cpu_pct": 18, "mem_pct": 27, "p99_latency_ms": 8800, "error_rate": 0.55},
        },
        # The only trace has a single web span (no api span) and carries an
        # extra "note" field that the spans in the other scenarios do not have.
        "traces": {
            "trace_qqq": [
                {"service": "web", "op": "GET /home", "duration_ms": 9000, "error": True, "note": "dns resolution failed"},
            ],
        },
        # Oracle answers: api is deliberately absent from the scope even
        # though the alert text names it.
        "gold": {
            "scope": ["web", "worker"],
            "hypothesis_keywords": ["dns", "resolver", "10.0.0.2", "enotfound", "no such host"],
            "mitigation_keywords": ["restart dns", "restart resolver", "failover dns", "point to backup resolver", "flush dns"],
            "writeup_keywords": ["dns", "resolution", "intermittent", "503", "restored"],
        },
    },
]


def get_scenario(index: int) -> Dict[str, Any]:
    """Return the scenario selected by ``index``, wrapping around the list.

    The index is reduced modulo the number of scenarios, so any integer
    (negative values and values past the end included) selects a valid entry.

    Args:
        index: Position of the desired scenario; wrapped into range.

    Returns:
        The scenario dict stored in ``SCENARIOS`` at the wrapped position.
        This is the shared module-level object, not a copy.

    Raises:
        ZeroDivisionError: If ``SCENARIOS`` is empty.
    """
    total = len(SCENARIOS)
    # With a positive divisor, % always yields a value in [0, total).
    wrapped = index % total
    return SCENARIOS[wrapped]


def num_scenarios() -> int:
    """Return the number of scenarios defined in ``SCENARIOS``."""
    count = len(SCENARIOS)
    return count