{
"name": "Incident response",
"objective": "Turn a fresh alert into a triaged, evidence-backed incident with a clear owner and a postmortem trail, without an engineer doing the mechanical correlation by hand.",
"trigger": {
"type": "event",
"cadence_or_event": "A pager alert fires or an error-budget burn rate crosses a threshold; then refresh every 2-5 minutes while the incident is open."
},
"intake": {
"sources": ["alert payload", "dashboards and logs", "traces", "recent deploys and config changes", "pager and ownership state"],
"selection_rule": "Triage the alerting service only; correlate signals and assign severity from impact evidence, not the alert text alone."
},
"workspace": {
"isolation": "Read-only access to observability systems, change logs, and pager state; writes are limited to the incident record, status posts, and paging the on-call owner; no remediation surface.",
"allowed_actions": ["read metrics, logs, and traces", "open or update an incident record", "post status", "page the on-call owner"],
"disallowed_actions": ["roll back", "restart services", "change config", "run remediation"]
},
"context": {
"required_files": ["incident runbook", "service ownership map", "severity policy"],
"runtime_sources": ["live metrics", "log queries", "trace samples", "recent change log", "pager state"]
},
"agents": [
{
"role": "Observer",
"responsibility": "Gather metrics, logs, traces, and the recent change timeline."
},
{
"role": "Correlator",
"responsibility": "Link the alert to likely causes such as a deploy, dependency, or saturation."
},
{
"role": "Reporter",
"responsibility": "Write a concise incident summary with severity, impact, and hypothesis."
},
{
"role": "Escalator",
"responsibility": "Page the right owner and hand off when mitigation is required."
}
],
"verification": {
"gates": ["severity is backed by concrete impact evidence", "the correlated cause cites specific deploys, logs, traces, or metrics", "missing telemetry is reported as unknown, never as healthy"],
"receipts": ["incident ID and severity", "signal timeline", "correlated cause with evidence links", "owner handoff record"]
},
"state": {
"artifacts": ["incident record", "postmortem seed timeline"],
"update_rule": "Append signals, hypotheses, and handoffs each interval so the timeline can seed the postmortem."
},
"budget": {
"max_retries": 2,
"max_runtime_minutes": 120
},
"escalation": {
"conditions": ["customer-facing outage", "data loss risk", "security signal", "missing telemetry", "mitigation requires production action"],
"destination": "On-call owner via pager with the evidence timeline"
},
"exit": {
"success": "A human owner accepts the incident, or it auto-resolves with a recorded evidence trail.",
"stop_without_success": "Mitigation is required, telemetry is missing, or correlation cannot be grounded in evidence."
}
}