{ "name": "Incident response", "objective": "Turn a fresh alert into a triaged, evidence-backed incident with a clear owner and a postmortem trail, without an engineer doing the mechanical correlation by hand.", "trigger": { "type": "event", "cadence_or_event": "A pager alert fires or an error budget burn crosses a threshold; then refresh every 2-5 minutes while the incident is open." }, "intake": { "sources": ["alert payload", "dashboards and logs", "traces", "recent deploys and config changes", "pager and ownership state"], "selection_rule": "Triage the alerting service only; correlate signals and assign severity from impact evidence, not the alert text alone." }, "workspace": { "isolation": "Read-only access to observability systems, change logs, and pager state; writes are limited to the incident record, status posts, and paging the on-call owner; no remediation surface.", "allowed_actions": ["read metrics, logs, and traces", "open or update an incident record", "post status", "page the on-call owner"], "disallowed_actions": ["roll back", "restart services", "change config", "run remediation"] }, "context": { "required_files": ["incident runbook", "service ownership map", "severity policy"], "runtime_sources": ["live metrics", "log queries", "trace samples", "recent change log", "pager state"] }, "agents": [ { "role": "Observer", "responsibility": "Gather metrics, logs, traces, and the recent change timeline." }, { "role": "Correlator", "responsibility": "Link the alert to likely causes such as a deploy, dependency, or saturation." }, { "role": "Reporter", "responsibility": "Write a concise incident summary with severity, impact, and hypothesis." }, { "role": "Escalator", "responsibility": "Page the right owner and hand off when mitigation is required." 
} ], "verification": { "gates": ["severity is backed by concrete impact evidence", "the correlated cause cites specific deploys, logs, traces, or metrics", "missing telemetry is reported as unknown, never healthy"], "receipts": ["incident ID and severity", "signal timeline", "correlated cause with evidence links", "owner handoff record"] }, "state": { "artifacts": ["incident record", "postmortem seed timeline"], "update_rule": "Append signals, hypotheses, and handoffs each interval so the timeline can seed the postmortem." }, "budget": { "max_retries": 2, "max_runtime_minutes": 120 }, "escalation": { "conditions": ["customer-facing outage", "data loss risk", "security signal", "missing telemetry", "mitigation requires production action"], "destination": "On-call owner via pager with the evidence timeline" }, "exit": { "success": "A human owner accepts the incident, or it auto-resolves with a recorded evidence trail.", "stop_without_success": "Mitigation is required, telemetry is missing, or correlation cannot be grounded in evidence." } }