"""Communication tool wrappers — Slack/Discord updates + postmortem generation."""
|
|
import json
import os
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any

import requests
from jinja2 import Template
|
|
|
|
# Webhook endpoints are read from the environment once, at import time.
# An empty string means "channel not configured"; delivery code checks
# truthiness before posting.
SLACK_WEBHOOK: str = os.environ.get("SLACK_WEBHOOK_URL", "")
DISCORD_WEBHOOK: str = os.environ.get("DISCORD_WEBHOOK_URL", "")

# Directory that receives generated postmortem Markdown files.
POSTMORTEM_DIR: Path = Path(os.environ.get("POSTMORTEM_DIR", "docs/postmortems"))

# Severity label -> RGB colour as six hex digits (no leading "#"), so the same
# value can be prefixed for Slack or parsed as an integer for Discord.
_SEV_COLOR_HEX: dict[str, str] = {
    "P0": "ff0000",
    "P1": "ff8800",
    "P2": "ffcc00",
}

# JSON-lines file to which every posted update is appended.
_LOG_PATH: Path = Path("data/slack_posts.jsonl")
|
|
|
|
def _build_slack_payload(channel: str, severity: str, title: str,
                         summary: str, action_items: list[str]) -> dict:
    """Assemble the Slack webhook message for one incident update.

    Args:
        channel: Value placed in the payload's ``channel`` key.
        severity: Severity label; ``P0``/``P1`` get the rotating-light emoji,
            anything else the warning emoji. Labels missing from
            ``_SEV_COLOR_HEX`` are coloured grey (``#888888``).
        title: Incident title; rendered as ``"[<severity>] <title>"``.
        summary: Attachment body text.
        action_items: Items rendered as a bulleted "Action Items" field;
            when empty the attachment carries no fields.

    Returns:
        A dict with ``channel``, ``username``, ``icon_emoji`` and a single
        entry in ``attachments`` stamped with the current UTC epoch second.
    """
    is_urgent = severity in ("P0", "P1")
    emoji = ":rotating_light:" if is_urgent else ":warning:"

    fields = []
    if action_items:
        bullets = "\n".join(f"• {item}" for item in action_items)
        fields.append({"title": "Action Items", "value": bullets})

    attachment = {
        "color": "#" + _SEV_COLOR_HEX.get(severity, "888888"),
        "title": f"[{severity}] {title}",
        "text": summary,
        "fields": fields,
        "ts": int(datetime.now(timezone.utc).timestamp()),
    }
    return {
        "channel": channel,
        "username": "atlasops-bot",
        "icon_emoji": emoji,
        "attachments": [attachment],
    }
|
|
|
|
def _post_to_discord(slack_payload: dict) -> None:
    """Convert a Slack payload to a Discord embed and POST it.

    Only the first Slack attachment is converted. Strings are clipped to
    Discord's documented embed limits (title 256, description 4096, field
    name 256, field value 1024 characters; at most 25 fields). Discord
    answers an oversized embed with a Bad Request, which would drop the
    whole incident update rather than deliver a truncated one.

    Args:
        slack_payload: Message dict in the shape built by
            ``_build_slack_payload``; ``attachments[0]`` must carry a
            ``title`` of the form ``"[<severity>] <title>"``.

    Raises:
        requests.RequestException: If the POST fails or Discord responds
            with an error status.
        KeyError, IndexError: If the payload has no attachment or the
            attachment has no ``title``.
    """
    def _clip(value: object, limit: int) -> str:
        # Keep the text within ``limit`` characters; an ellipsis marks the cut.
        text = str(value)
        return text if len(text) <= limit else text[:limit - 1] + "…"

    att = slack_payload["attachments"][0]

    # Discord wants the colour as an integer, so recover the severity label
    # from the "[<severity>]" prefix of the title and look its hex value up.
    severity = att["title"].partition("]")[0].lstrip("[")
    color_int = int(_SEV_COLOR_HEX.get(severity, "888888"), 16)

    fields = [
        {
            "name": _clip(f["title"], 256),
            "value": _clip(f["value"], 1024),
            "inline": False,
        }
        for f in att.get("fields", []) if f.get("value")
    ][:25]

    discord_payload = {
        "username": slack_payload.get("username", "atlasops-bot"),
        "embeds": [{
            "title": _clip(att["title"], 256),
            # ``or ""`` keeps a missing/None summary from rendering as "None".
            "description": _clip(att.get("text") or "", 4096),
            "color": color_int,
            "fields": fields,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "footer": {"text": "AtlasOps · AMD MI300X"},
        }],
    }
    requests.post(DISCORD_WEBHOOK, json=discord_payload, timeout=10).raise_for_status()
|
|
|
|
def _redact_webhook(message: str, webhook_url: str) -> str:
    """Return *message* with *webhook_url* and its path replaced by ``<redacted>``.

    Webhook URLs act as credentials, and the text of ``requests`` exceptions
    typically embeds either the full request URL or its path, so error
    strings are scrubbed before they are handed back to callers.
    """
    # Everything after the host: "/services/..." for "https://host/services/...".
    path = "/" + webhook_url.partition("://")[2].partition("/")[2]
    for secret in (webhook_url, path):
        if len(secret) > 1:  # skip "" and a bare "/"
            message = message.replace(secret, "<redacted>")
    return message


def slack_post_update(channel: str, severity: str, title: str, summary: str,
                      action_items: list[str] | None = None) -> dict[str, Any]:
    """Post an incident update.

    Appends the payload to the local JSON-lines log (powers the UI feed).
    Also delivers to Slack if SLACK_WEBHOOK_URL is set.
    Also delivers to Discord if DISCORD_WEBHOOK_URL is set.

    Each sink is attempted independently: a failure of one (including an
    unwritable local log) is recorded in ``errors`` and does not prevent
    the others from being tried.

    Args:
        channel: Channel name placed in the Slack payload.
        severity: Severity label such as ``"P0"``.
        title: Incident title.
        summary: Body text of the update.
        action_items: Optional bullet items; ``None`` is treated as empty.

    Returns:
        A dict with ``success`` (true when at least one sink accepted the
        update), ``mode`` (``"+"``-joined names of the sinks that succeeded)
        and, only when something failed, ``errors`` (one string per failed
        sink, with webhook URLs redacted).
    """
    payload = _build_slack_payload(channel, severity, title, summary, action_items or [])

    modes: list[str] = []
    errors: list[str] = []

    # Local log first, but best-effort: a disk problem must not block the
    # outbound notifications below.
    try:
        _LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with _LOG_PATH.open("a", encoding="utf-8") as f:
            f.write(json.dumps(payload) + "\n")
    except OSError as e:
        errors.append(f"local_log: {e}")
    else:
        modes.append("logged_locally")

    if SLACK_WEBHOOK:
        try:
            r = requests.post(SLACK_WEBHOOK, json=payload, timeout=10)
            r.raise_for_status()
        except requests.RequestException as e:
            errors.append(f"slack: {_redact_webhook(str(e), SLACK_WEBHOOK)}")
        else:
            modes.append("slack")

    if DISCORD_WEBHOOK:
        try:
            _post_to_discord(payload)
        except requests.RequestException as e:
            errors.append(f"discord: {_redact_webhook(str(e), DISCORD_WEBHOOK)}")
        else:
            modes.append("discord")

    return {
        "success": bool(modes),
        "mode": "+".join(modes),
        **({"errors": errors} if errors else {}),
    }
|
|
|
|
# Jinja2 source for the postmortem Markdown document.
#
# Scalar placeholders: title, date, severity, duration, authors, summary,
# impact, root_cause, detection, resolution.
# Iterated placeholders: timeline (entries with .time / .event), went_well and
# went_wrong (plain items), action_items (entries with .action / .owner /
# .priority / .due, numbered via loop.index).
#
# Each ``{% for ... -%}`` uses Jinja's trailing "-" whitespace control so the
# newline after the tag is not emitted in front of every list item/table row.
POSTMORTEM_TEMPLATE = """# Postmortem: {{ title }}

**Date:** {{ date }}
**Severity:** {{ severity }}
**Duration:** {{ duration }}
**Authors:** {{ authors }}

## Summary
{{ summary }}

## Impact
{{ impact }}

## Timeline (UTC)
{% for entry in timeline -%}
- **{{ entry.time }}** — {{ entry.event }}
{% endfor %}

## Root Cause
{{ root_cause }}

## Detection
{{ detection }}

## Resolution
{{ resolution }}

## What Went Well
{% for item in went_well -%}
- {{ item }}
{% endfor %}

## What Went Wrong
{% for item in went_wrong -%}
- {{ item }}
{% endfor %}

## Action Items
| # | Action | Owner | Priority | Due |
|---|---|---|---|---|
{% for ai in action_items -%}
| {{ loop.index }} | {{ ai.action }} | {{ ai.owner }} | {{ ai.priority }} | {{ ai.due }} |
{% endfor %}
"""
|
|
|
|
def _postmortem_slug(title: str, max_len: int = 60) -> str:
    """Return a lower-case, filesystem-safe slug for *title*.

    Every run of characters other than ``a-z``, ``0-9``, ``.``, ``_`` and
    ``-`` collapses to a single ``-`` so path separators (or ``..``) in an
    alert title cannot redirect or break the write. Falls back to
    ``"incident"`` when nothing usable remains.
    """
    slug = re.sub(r"[^a-z0-9._-]+", "-", title.lower()).strip("-.")
    return slug[:max_len].rstrip("-.") or "incident"


def postmortem_draft(incident: dict[str, Any], output_path: str = "") -> dict[str, Any]:
    """Generate a Cloudflare-blog quality postmortem.

    incident dict shape (all optional — auto-filled from available data):
      title, severity, duration, authors, summary, impact,
      timeline: [{time, event}], root_cause, detection, resolution,
      went_well: [str], went_wrong: [str],
      action_items: [{action, owner, priority, due}]

    Missing values fall back to the optional ``triage`` (title, severity,
    blast_radius), ``diagnosis`` (root_cause, specific) and ``remediation``
    (outcome, actions_taken) sub-dicts, then to built-in defaults.
    ``incident`` itself is never modified.

    Args:
        incident: Incident data as described above.
        output_path: Destination file. When empty, the file is written to
            ``POSTMORTEM_DIR`` as ``<UTC date>-<slug of title>.md``.

    Returns:
        ``{"success": True, "path": ..., "postmortem_path": ..., "bytes": ...}``
        where both path keys hold the destination and ``bytes`` is the
        UTF-8 encoded size of the rendered document.

    Raises:
        OSError: If the destination cannot be created or written.
    """
    now = datetime.now(timezone.utc)
    today = now.date()

    def _as_text(value: Any) -> str:
        # Structured values (dict/list) are embedded as JSON.
        return value if isinstance(value, str) else json.dumps(value)

    triage = incident.get("triage") or {}
    diagnosis = incident.get("diagnosis") or {}
    remediation = incident.get("remediation") or {}

    title = str(incident.get("title") or triage.get("title") or "Incident")
    severity = incident.get("severity") or triage.get("severity") or "Unknown"
    root_cause = _as_text(
        incident.get("root_cause")
        or diagnosis.get("root_cause")
        or diagnosis.get("specific")
        or "Under investigation"
    )
    resolution = _as_text(
        incident.get("resolution")
        or remediation.get("outcome")
        or "Resolved by on-call team"
    )
    actions_taken = remediation.get("actions_taken") or []

    clock = now.strftime("%H:%M UTC")
    timeline = incident.get("timeline") or [
        {"time": clock, "event": f"Alert fired: {title}"},
        {"time": clock, "event": "Triage agent acknowledged"},
        {"time": clock, "event": f"Root cause identified: {root_cause[:80]}"},
        {"time": clock, "event": "Remediation applied"},
    ]
    went_well = incident.get("went_well") or [
        "Automated detection by Prometheus/Alertmanager",
        "AtlasOps multi-agent response < 5 min",
    ]
    went_wrong = incident.get("went_wrong") or [
        "Alert was not suppressed during maintenance window",
    ]

    # Default follow-ups are due relative to today so they never start out
    # overdue (two and four weeks out).
    supplied_items = incident.get("action_items") or [
        {"action": f"Add runbook for {title}", "owner": "@sre-team", "priority": "P2",
         "due": (today + timedelta(days=14)).isoformat()},
        {"action": "Review alert thresholds", "owner": "@observability", "priority": "P3",
         "due": (today + timedelta(days=28)).isoformat()},
    ]
    # Build a fresh list: the caller's own action_items must not be mutated.
    action_items = list(supplied_items)
    if actions_taken:
        action_items.insert(0, {
            "action": f"Verify fix stability: {str(actions_taken[0])[:80]}",
            "owner": "@sre-oncall", "priority": "P1",
            "due": today.isoformat(),
        })

    blast_radius = triage.get("blast_radius") or {}
    services = blast_radius.get("services") or ["unknown"]
    if isinstance(services, (list, tuple, set)):
        # Render "a, b" rather than the Python repr "['a', 'b']".
        services_text = ", ".join(str(s) for s in services)
    else:
        services_text = str(services)
    user_impact_pct = blast_radius.get("user_impact_pct", 0)

    data = {
        "title": title,
        "severity": severity,
        "duration": incident.get("duration") or "< 10 min",
        "authors": incident.get("authors") or "AtlasOps automated response",
        "summary": incident.get("summary") or (
            f"{severity} incident: {title}. Root cause: {root_cause[:120]}. "
            f"Resolution: {resolution[:120]}."
        ),
        "impact": incident.get("impact") or (
            f"Services affected: {services_text}. User impact: {user_impact_pct}%."
        ),
        "timeline": timeline,
        "root_cause": root_cause,
        "detection": incident.get("detection")
        or "Prometheus alert fired → Alertmanager forwarded to AtlasOps webhook.",
        "resolution": resolution,
        "went_well": went_well,
        "went_wrong": went_wrong,
        "action_items": action_items,
    }

    rendered = Template(POSTMORTEM_TEMPLATE).render(date=today.isoformat(), **data)

    POSTMORTEM_DIR.mkdir(parents=True, exist_ok=True)
    if not output_path:
        output_path = str(POSTMORTEM_DIR / f"{today}-{_postmortem_slug(title)}.md")
    target = Path(output_path)
    # An explicit destination may live outside POSTMORTEM_DIR.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(rendered, encoding="utf-8")

    return {
        "success": True,
        "path": output_path,
        "postmortem_path": output_path,
        # Encoded size: the document contains multi-byte characters ("—", "→").
        "bytes": len(rendered.encode("utf-8")),
    }
|
|