File size: 8,401 Bytes
7e9a520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""Communication tool wrappers — Slack updates + postmortem generation."""

import json
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from jinja2 import Template


# Webhook endpoints are read from the environment once, at import time.
# An empty string leaves the corresponding delivery channel switched off.
SLACK_WEBHOOK = os.environ.get("SLACK_WEBHOOK_URL", "")
DISCORD_WEBHOOK = os.environ.get("DISCORD_WEBHOOK_URL", "")
# Directory that receives rendered postmortems (override via POSTMORTEM_DIR).
POSTMORTEM_DIR = Path(os.environ.get("POSTMORTEM_DIR", "docs/postmortems"))

# Severity label -> RRGGBB hex color, stored without the leading "#".
_SEV_COLOR_HEX = {
    "P0": "ff0000",
    "P1": "ff8800",
    "P2": "ffcc00",
}
# JSON-lines file to which every built incident payload is appended.
_LOG_PATH = Path("data") / "slack_posts.jsonl"


def _build_slack_payload(channel: str, severity: str, title: str,
                         summary: str, action_items: list[str]) -> dict:
    """Assemble the Slack webhook message for one incident update.

    The message carries a single attachment colored by ``severity`` (gray
    when the severity is not in the color table). An "Action Items" field,
    one bullet per entry, is attached only when ``action_items`` is
    non-empty.
    """
    # P0/P1 get the siren emoji; everything else gets the plain warning sign.
    is_urgent = severity in ("P0", "P1")
    emoji = ":rotating_light:" if is_urgent else ":warning:"

    fields = []
    if action_items:
        bullets = "\n".join(f"• {item}" for item in action_items)
        fields.append({"title": "Action Items", "value": bullets})

    attachment = {
        "color": "#" + _SEV_COLOR_HEX.get(severity, "888888"),
        "title": f"[{severity}] {title}",
        "text": summary,
        "fields": fields,
        # Epoch seconds (UTC) at the moment the payload is built.
        "ts": int(datetime.now(timezone.utc).timestamp()),
    }
    return {
        "channel": channel,
        "username": "atlasops-bot",
        "icon_emoji": emoji,
        "attachments": [attachment],
    }


def _post_to_discord(slack_payload: dict) -> None:
    """Convert a Slack payload to a Discord embed and POST it to DISCORD_WEBHOOK.

    Only the first Slack attachment is converted. Text is clipped to the
    embed size limits published in the Discord developer documentation
    (title 256, description 4096, field name 256, field value 1024, at most
    25 fields, username 80) so that an oversized summary or action-item
    list is shortened instead of making the whole request fail.

    Args:
        slack_payload: A payload shaped like the output of
            ``_build_slack_payload`` (needs ``attachments[0]["title"]``).

    Raises:
        requests.RequestException: On transport errors, or (via
            ``raise_for_status``) when Discord answers with an error status.
        KeyError / IndexError: If the payload has no first attachment or the
            attachment has no title.
    """
    def _clip(text, limit: int):
        # Non-string values are passed through unchanged; only over-long
        # strings are shortened, ending in an ellipsis to signal truncation.
        if isinstance(text, str) and len(text) > limit:
            return text[: limit - 1] + "…"
        return text

    att = slack_payload["attachments"][0]
    title = att["title"]

    # Prefer the color already stored on the attachment ("#rrggbb"); this
    # avoids re-parsing the severity out of the human-readable title.
    try:
        color_int = int(str(att.get("color", "")).lstrip("#"), 16)
    except ValueError:
        color_int = -1
    if not 0 <= color_int <= 0xFFFFFF:
        # Fallback: recover the severity from the "[SEV] title" prefix.
        severity = title.split("]")[0].lstrip("[")
        color_int = int(_SEV_COLOR_HEX.get(severity, "888888"), 16)

    # Fields with an empty value are dropped, as before.
    fields = [
        {
            "name": _clip(f["title"], 256),
            "value": _clip(f["value"], 1024),
            "inline": False,
        }
        for f in att.get("fields", [])
        if f.get("value")
    ][:25]

    discord_payload = {
        "username": _clip(slack_payload.get("username", "atlasops-bot"), 80),
        "embeds": [{
            "title": _clip(title, 256),
            "description": _clip(att.get("text", ""), 4096),
            "color": color_int,
            "fields": fields,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "footer": {"text": "AtlasOps · AMD MI300X"},
        }],
    }
    requests.post(DISCORD_WEBHOOK, json=discord_payload, timeout=10).raise_for_status()


def _redact_webhook(message: str, webhook_url: str) -> str:
    """Return *message* with *webhook_url* and its path component masked.

    Webhook URLs carry their secret token in the path, and the text of
    ``requests`` exceptions quotes the request URL (HTTP errors) or its path
    (connection errors). Masking both keeps the token out of anything built
    from ``str(exc)``.
    """
    from urllib.parse import urlsplit

    try:
        path = urlsplit(webhook_url).path
    except ValueError:
        path = ""
    # Replace the full URL first so the path substitution cannot split it.
    for secret in (webhook_url, path):
        if len(secret) > 1:  # skip "" and a bare "/"
            message = message.replace(secret, "<redacted>")
    return message


def slack_post_update(channel: str, severity: str, title: str, summary: str,
                      action_items: list[str] | None = None) -> dict[str, Any]:
    """Post an incident update.

    Always writes to local log (powers the UI feed).
    Also delivers to Slack if SLACK_WEBHOOK_URL is set.
    Also delivers to Discord if DISCORD_WEBHOOK_URL is set.

    Args:
        channel: Slack channel name placed in the payload.
        severity: Severity label (e.g. "P0"); selects emoji and color.
        title: Short incident title.
        summary: Body text of the update.
        action_items: Optional bullet items; ``None`` is treated as empty.

    Returns:
        A dict with ``"success": True`` (the local log write is what counts
        as success), ``"mode"`` — ``"logged_locally"`` optionally followed by
        ``"+slack"`` / ``"+discord"`` for each webhook that accepted the
        post — and, only when a webhook delivery failed, ``"errors"`` with
        one message per failed channel. Webhook URLs are redacted from those
        messages.
    """
    payload = _build_slack_payload(channel, severity, title, summary, action_items or [])

    # Always persist locally — powers /slack/feed in the UI
    _LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    with _LOG_PATH.open("a", encoding="utf-8") as f:
        f.write(json.dumps(payload) + "\n")

    # Preserve a stable mode label for downstream tests/integrations that
    # differentiate "logged locally only" from external webhook delivery.
    modes: list[str] = ["logged_locally"]
    errors: list[str] = []

    if SLACK_WEBHOOK:
        try:
            r = requests.post(SLACK_WEBHOOK, json=payload, timeout=10)
            r.raise_for_status()
            modes.append("slack")
        except requests.RequestException as e:
            # str(e) may quote the webhook URL; never hand the token back.
            errors.append("slack: " + _redact_webhook(str(e), SLACK_WEBHOOK))

    if DISCORD_WEBHOOK:
        try:
            _post_to_discord(payload)
            modes.append("discord")
        except requests.RequestException as e:
            errors.append("discord: " + _redact_webhook(str(e), DISCORD_WEBHOOK))

    return {
        "success": True,
        "mode": "+".join(modes),
        **({"errors": errors} if errors else {}),
    }


# Jinja2 source for the Markdown postmortem rendered by postmortem_draft().
# Render context: title, date, severity, duration, authors, summary, impact,
# root_cause, detection, resolution (scalars); timeline (items exposing
# time/event); went_well and went_wrong (iterables of strings); action_items
# (items exposing action/owner/priority/due).
# The "-%}" closing each {% for %} tag strips the whitespace that follows
# the tag, so every list/table row starts directly at the beginning of a line.
POSTMORTEM_TEMPLATE = """# Postmortem: {{ title }}

**Date:** {{ date }}
**Severity:** {{ severity }}
**Duration:** {{ duration }}
**Authors:** {{ authors }}

## Summary
{{ summary }}

## Impact
{{ impact }}

## Timeline (UTC)
{% for entry in timeline -%}
- **{{ entry.time }}** — {{ entry.event }}
{% endfor %}

## Root Cause
{{ root_cause }}

## Detection
{{ detection }}

## Resolution
{{ resolution }}

## What Went Well
{% for item in went_well -%}
- {{ item }}
{% endfor %}

## What Went Wrong
{% for item in went_wrong -%}
- {{ item }}
{% endfor %}

## Action Items
| # | Action | Owner | Priority | Due |
|---|---|---|---|---|
{% for ai in action_items -%}
| {{ loop.index }} | {{ ai.action }} | {{ ai.owner }} | {{ ai.priority }} | {{ ai.due }} |
{% endfor %}
"""


def postmortem_draft(incident: dict[str, Any], output_path: str = "") -> dict[str, Any]:
    """Render a Markdown postmortem for *incident* and write it to disk.

    incident dict shape (all optional — auto-filled from available data):
      title, severity, duration, authors, summary, impact,
      timeline: [{time, event}], root_cause, detection, resolution,
      went_well: [str], went_wrong: [str],
      action_items: [{action, owner, priority, due}]
    Missing top-level values fall back to the nested ``triage``,
    ``diagnosis`` and ``remediation`` dicts, then to generic defaults.

    Args:
        incident: Incident data as described above. It is not modified.
        output_path: Destination file. When empty, a file named
            ``<date>-<slug-of-title>.md`` is created under POSTMORTEM_DIR.

    Returns:
        ``{"success": True, "path": ..., "postmortem_path": ..., "bytes": ...}``
        where both path keys hold the destination path and ``bytes`` is the
        UTF-8 encoded size of the rendered document.
    """
    import re
    from datetime import timedelta

    incident = incident or {}
    now = datetime.now(timezone.utc)

    def _section(name: str) -> dict:
        # Nested sections are optional; anything that is not a dict
        # (None, a string, ...) is treated as absent instead of crashing.
        value = incident.get(name)
        return value if isinstance(value, dict) else {}

    def _as_text(value: Any) -> str:
        # Structured values are serialized so slicing/rendering is safe.
        return value if isinstance(value, str) else json.dumps(value)

    triage = _section("triage")
    diagnosis = _section("diagnosis")
    remediation = _section("remediation")

    # str() so a non-string title cannot break slug generation below.
    title = str(incident.get("title") or triage.get("title") or "Incident")
    severity = incident.get("severity") or triage.get("severity") or "Unknown"
    root_cause = _as_text(
        incident.get("root_cause")
        or diagnosis.get("root_cause")
        or diagnosis.get("specific")
        or "Under investigation"
    )
    resolution = _as_text(
        incident.get("resolution")
        or remediation.get("outcome")
        or "Resolved by on-call team"
    )

    # Accept a single action given as a bare value as well as a list.
    raw_actions = remediation.get("actions_taken") or []
    actions_taken = (
        list(raw_actions) if isinstance(raw_actions, (list, tuple)) else [raw_actions]
    )

    stamp = now.strftime("%H:%M UTC")
    timeline = incident.get("timeline") or [
        {"time": stamp, "event": f"Alert fired: {title}"},
        {"time": stamp, "event": "Triage agent acknowledged"},
        {"time": stamp, "event": f"Root cause identified: {root_cause[:80]}"},
        {"time": stamp, "event": "Remediation applied"},
    ]
    went_well = incident.get("went_well") or [
        "Automated detection by Prometheus/Alertmanager",
        "AtlasOps multi-agent response < 5 min",
    ]
    went_wrong = incident.get("went_wrong") or [
        "Alert was not suppressed during maintenance window",
    ]

    # Copy so the insert() below never mutates the caller's list. Default
    # due dates are relative to today rather than fixed calendar dates,
    # which would silently end up in the past.
    action_items = list(incident.get("action_items") or [
        {"action": f"Add runbook for {title}", "owner": "@sre-team",
         "priority": "P2", "due": (now + timedelta(days=30)).strftime("%Y-%m-%d")},
        {"action": "Review alert thresholds", "owner": "@observability",
         "priority": "P3", "due": (now + timedelta(days=45)).strftime("%Y-%m-%d")},
    ])
    if actions_taken:
        action_items.insert(0, {
            "action": f"Verify fix stability: {str(actions_taken[0])[:80]}",
            "owner": "@sre-oncall", "priority": "P1",
            "due": now.strftime("%Y-%m-%d"),
        })

    blast = triage.get("blast_radius")
    blast = blast if isinstance(blast, dict) else {}
    services = blast.get("services") or ["unknown"]
    # Render service names as readable text instead of a Python list repr.
    if isinstance(services, (list, tuple, set)):
        services_text = ", ".join(str(s) for s in services)
    else:
        services_text = str(services)
    user_impact_pct = blast.get("user_impact_pct", 0)

    data = {
        "title": title, "severity": severity,
        "duration": incident.get("duration") or "< 10 min",
        "authors": incident.get("authors") or "AtlasOps automated response",
        "summary": incident.get("summary") or f"{severity} incident: {title}. Root cause: {root_cause[:120]}. Resolution: {resolution[:120]}.",
        "impact": incident.get("impact") or f"Services affected: {services_text}. User impact: {user_impact_pct}%.",
        "timeline": timeline,
        "root_cause": root_cause,
        "detection": incident.get("detection") or "Prometheus alert fired → Alertmanager forwarded to AtlasOps webhook.",
        "resolution": resolution,
        "went_well": went_well,
        "went_wrong": went_wrong,
        "action_items": action_items,
    }

    rendered = Template(POSTMORTEM_TEMPLATE).render(date=now.date().isoformat(), **data)

    POSTMORTEM_DIR.mkdir(parents=True, exist_ok=True)
    if not output_path:
        # Collapse every run of non-word characters (spaces, "/", ":", ...)
        # into one dash so the title can never add path components.
        slug = re.sub(r"\W+", "-", title.lower()).strip("-")[:60].rstrip("-") or "incident"
        output_path = str(POSTMORTEM_DIR / f"{now.date()}-{slug}.md")

    target = Path(output_path)
    # An explicit output_path may point outside POSTMORTEM_DIR.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(rendered, encoding="utf-8")
    return {
        "success": True,
        "path": output_path,
        "postmortem_path": output_path,
        # Encoded size: the template contains non-ASCII characters, so the
        # character count would under-report.
        "bytes": len(rendered.encode("utf-8")),
    }