atlasops / agents /tools /comms.py
Harikishanth R
fix: skip-kubectl + scroll + health — HF Space ready
7e9a520
"""Communication tool wrappers — Slack updates + postmortem generation."""
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
from jinja2 import Template
# Webhook endpoints are read from the environment once, at import time.
# An empty string disables delivery to that service.
SLACK_WEBHOOK = os.environ.get("SLACK_WEBHOOK_URL", "")
DISCORD_WEBHOOK = os.environ.get("DISCORD_WEBHOOK_URL", "")

# Directory that receives generated postmortem markdown files.
POSTMORTEM_DIR = Path(os.environ.get("POSTMORTEM_DIR", "docs/postmortems"))

# Severity label -> RGB colour as a hex string without the leading "#".
_SEV_COLOR_HEX = {
    "P0": "ff0000",
    "P1": "ff8800",
    "P2": "ffcc00",
}

# JSON-lines file to which every incident update payload is appended.
_LOG_PATH = Path("data/slack_posts.jsonl")
def _build_slack_payload(channel: str, severity: str, title: str,
                         summary: str, action_items: list[str]) -> dict:
    """Assemble the Slack webhook message for one incident update.

    Args:
        channel: Value stored under the payload's "channel" key.
        severity: Severity label; "P0"/"P1" get the rotating-light emoji,
            anything else the warning emoji. Also selects the attachment
            colour (grey "888888" when the label is not in _SEV_COLOR_HEX).
        title: Headline, rendered as "[<severity>] <title>".
        summary: Attachment body text.
        action_items: Bullet points; an empty list yields no fields.

    Returns:
        A dict with a single attachment, stamped with the current UTC epoch
        second under "ts".
    """
    urgent = severity in ("P0", "P1")
    emoji = ":rotating_light:" if urgent else ":warning:"

    # One "Action Items" field holding a bulleted list, or no fields at all.
    fields = []
    if action_items:
        bullets = "\n".join(f"• {item}" for item in action_items)
        fields.append({"title": "Action Items", "value": bullets})

    attachment = {
        "color": "#" + _SEV_COLOR_HEX.get(severity, "888888"),
        "title": f"[{severity}] {title}",
        "text": summary,
        "fields": fields,
        "ts": int(datetime.now(timezone.utc).timestamp()),
    }
    return {
        "channel": channel,
        "username": "atlasops-bot",
        "icon_emoji": emoji,
        "attachments": [attachment],
    }
def _discord_clip(text: Any, limit: int) -> str:
    """Return *text* as a string of at most *limit* characters ("…" marks a cut)."""
    text = str(text)
    if len(text) <= limit:
        return text
    return text[: limit - 1] + "…"


def _post_to_discord(slack_payload: dict) -> None:
    """Convert a Slack payload to a Discord embed and POST it.

    Only the first Slack attachment is converted. Text is clipped to Discord's
    documented embed limits (title 256, description 4096, field name 256,
    field value 1024, at most 25 fields) so that an over-long summary or
    action list is truncated instead of being rejected by the webhook.

    Args:
        slack_payload: Dict with an "attachments" list whose first element
            has a "title" of the form "[<severity>] ..." and optional
            "text" / "fields" entries.

    Raises:
        requests.RequestException: If the POST fails or returns an HTTP
            error status (via ``raise_for_status``).
        KeyError, IndexError: If the payload has no attachment or title.
    """
    att = slack_payload["attachments"][0]

    # The title looks like "[P1] ..."; recover the label between the brackets
    # to pick the colour. Discord wants the colour as an integer.
    severity = att["title"].split("]")[0].lstrip("[")
    color_int = int(_SEV_COLOR_HEX.get(severity, "888888"), 16)

    # Fields with an empty value are dropped before conversion.
    fields = [
        {
            "name": _discord_clip(f["title"], 256),
            "value": _discord_clip(f["value"], 1024),
            "inline": False,
        }
        for f in att.get("fields", []) if f.get("value")
    ][:25]

    discord_payload = {
        "username": slack_payload.get("username", "atlasops-bot"),
        "embeds": [{
            "title": _discord_clip(att["title"], 256),
            # A missing or None summary becomes an empty description.
            "description": _discord_clip(att.get("text") or "", 4096),
            "color": color_int,
            "fields": fields,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "footer": {"text": "AtlasOps · AMD MI300X"},
        }],
    }
    requests.post(DISCORD_WEBHOOK, json=discord_payload, timeout=10).raise_for_status()
def slack_post_update(channel: str, severity: str, title: str, summary: str,
                      action_items: list[str] | None = None) -> dict[str, Any]:
    """Record an incident update locally and fan it out to configured webhooks.

    The payload is always appended to the local JSONL log (the source of the
    UI's /slack/feed). It is then sent to Slack when SLACK_WEBHOOK_URL is set
    and to Discord when DISCORD_WEBHOOK_URL is set. A failed webhook request
    is collected as an error string rather than raised.

    Returns:
        A dict with "success" (always True), "mode" (labels joined by "+",
        always starting with "logged_locally", followed by "slack" and/or
        "discord" for each successful delivery) and, only when at least one
        delivery failed, "errors" (list of "<service>: <exception>" strings).
    """
    payload = _build_slack_payload(
        channel, severity, title, summary, action_items if action_items else []
    )

    # Local persistence comes first and is unconditional.
    _LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    with _LOG_PATH.open("a", encoding="utf-8") as log_file:
        log_file.write(json.dumps(payload) + "\n")

    # "logged_locally" is kept as a stable leading label so downstream
    # tests/integrations can tell local-only logging from webhook delivery.
    delivered: list[str] = ["logged_locally"]
    failures: list[str] = []

    if SLACK_WEBHOOK:
        try:
            response = requests.post(SLACK_WEBHOOK, json=payload, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            failures.append(f"slack: {exc}")
        else:
            delivered.append("slack")

    if DISCORD_WEBHOOK:
        try:
            _post_to_discord(payload)
        except requests.RequestException as exc:
            failures.append(f"discord: {exc}")
        else:
            delivered.append("discord")

    result: dict[str, Any] = {"success": True, "mode": "+".join(delivered)}
    if failures:
        result["errors"] = failures
    return result
# Jinja2 source for the postmortem markdown document rendered by
# postmortem_draft(). Expected variables: title, date, severity, duration,
# authors, summary, impact, root_cause, detection, resolution, plus the
# sequences timeline (items with .time/.event), went_well, went_wrong and
# action_items (items with .action/.owner/.priority/.due).
# The "-%}" on each {% for %} tag strips the whitespace that follows the tag,
# so every list item / table row starts at the beginning of its own line.
POSTMORTEM_TEMPLATE = """# Postmortem: {{ title }}
**Date:** {{ date }}
**Severity:** {{ severity }}
**Duration:** {{ duration }}
**Authors:** {{ authors }}
## Summary
{{ summary }}
## Impact
{{ impact }}
## Timeline (UTC)
{% for entry in timeline -%}
- **{{ entry.time }}** — {{ entry.event }}
{% endfor %}
## Root Cause
{{ root_cause }}
## Detection
{{ detection }}
## Resolution
{{ resolution }}
## What Went Well
{% for item in went_well -%}
- {{ item }}
{% endfor %}
## What Went Wrong
{% for item in went_wrong -%}
- {{ item }}
{% endfor %}
## Action Items
| # | Action | Owner | Priority | Due |
|---|---|---|---|---|
{% for ai in action_items -%}
| {{ loop.index }} | {{ ai.action }} | {{ ai.owner }} | {{ ai.priority }} | {{ ai.due }} |
{% endfor %}
"""
def _postmortem_slug(title: Any) -> str:
    """Return a lower-case, filesystem-safe slug (max 60 chars) for *title*."""
    import re  # local import: the module header does not import re

    slug = str(title).lower().replace(" ", "-")
    # Everything except word characters, "." and "-" becomes "-". In
    # particular "/" and "\" can no longer add path components to the
    # generated file name.
    return re.sub(r"[^\w.-]", "-", slug)[:60]


def postmortem_draft(incident: dict[str, Any], output_path: str = "") -> dict[str, Any]:
    """Generate a Cloudflare-blog quality postmortem.

    incident dict shape (all optional — auto-filled from available data):
      title, severity, duration, authors, summary, impact,
      timeline: [{time, event}], root_cause, detection, resolution,
      went_well: [str], went_wrong: [str],
      action_items: [{action, owner, priority, due}]

    Missing top-level values fall back to the nested "triage", "diagnosis"
    and "remediation" dicts where applicable, then to generic placeholders.
    The *incident* dict is never modified.

    Args:
        incident: Incident data as described above.
        output_path: Destination file. When empty, the file is written to
            POSTMORTEM_DIR as "<UTC date>-<slugified title>.md".

    Returns:
        A dict with "success" (True), "path" and "postmortem_path" (both the
        path written) and "bytes" (UTF-8 encoded size of the rendered text).

    Raises:
        OSError: If the target directory or file cannot be written.
    """
    now = datetime.now(timezone.utc)
    clock = now.strftime("%H:%M UTC")

    # "or {}" also covers keys that are present but None.
    triage = incident.get("triage") or {}
    diagnosis = incident.get("diagnosis") or {}
    remediation = incident.get("remediation") or {}
    blast_radius = triage.get("blast_radius") or {}

    title = incident.get("title") or triage.get("title") or "Incident"
    severity = incident.get("severity") or triage.get("severity") or "Unknown"

    root_cause_raw = (
        incident.get("root_cause")
        or diagnosis.get("root_cause")
        or diagnosis.get("specific")
        or "Under investigation"
    )
    # Structured (non-string) values are serialised so they can be sliced
    # and rendered as text.
    root_cause = root_cause_raw if isinstance(root_cause_raw, str) else json.dumps(root_cause_raw)

    resolution_raw = (
        incident.get("resolution")
        or remediation.get("outcome")
        or "Resolved by on-call team"
    )
    resolution = resolution_raw if isinstance(resolution_raw, str) else json.dumps(resolution_raw)

    actions_taken = remediation.get("actions_taken") or []

    # Without a supplied timeline every synthetic entry carries the current time.
    timeline = incident.get("timeline") or [
        {"time": clock, "event": f"Alert fired: {title}"},
        {"time": clock, "event": "Triage agent acknowledged"},
        {"time": clock, "event": f"Root cause identified: {root_cause[:80]}"},
        {"time": clock, "event": "Remediation applied"},
    ]
    went_well = incident.get("went_well") or [
        "Automated detection by Prometheus/Alertmanager",
        "AtlasOps multi-agent response < 5 min",
    ]
    went_wrong = incident.get("went_wrong") or [
        "Alert was not suppressed during maintenance window",
    ]

    # Copy the list: the insert below must not modify the caller's
    # incident["action_items"], otherwise repeated calls pile up duplicates.
    # NOTE(review): the default due dates are fixed calendar dates and will
    # eventually lie in the past — confirm whether they should be relative.
    action_items = list(incident.get("action_items") or [
        {"action": f"Add runbook for {title}", "owner": "@sre-team", "priority": "P2", "due": "2026-06-01"},
        {"action": "Review alert thresholds", "owner": "@observability", "priority": "P3", "due": "2026-06-15"},
    ])
    if actions_taken:
        action_items.insert(0, {
            "action": f"Verify fix stability: {str(actions_taken[0])[:80]}",
            "owner": "@sre-oncall", "priority": "P1",
            "due": now.strftime("%Y-%m-%d"),
        })

    data = {
        "title": title,
        "severity": severity,
        # "or" (not a .get default) so None/"" also fall back, like the
        # other fields.
        "duration": incident.get("duration") or "< 10 min",
        "authors": incident.get("authors") or "AtlasOps automated response",
        "summary": incident.get("summary") or f"{severity} incident: {title}. Root cause: {root_cause[:120]}. Resolution: {resolution[:120]}.",
        "impact": incident.get("impact") or f"Services affected: {blast_radius.get('services', ['unknown'])}. User impact: {blast_radius.get('user_impact_pct', 0)}%.",
        "timeline": timeline,
        "root_cause": root_cause,
        "detection": incident.get("detection") or "Prometheus alert fired → Alertmanager forwarded to AtlasOps webhook.",
        "resolution": resolution,
        "went_well": went_well,
        "went_wrong": went_wrong,
        "action_items": action_items,
    }
    rendered = Template(POSTMORTEM_TEMPLATE).render(date=now.date().isoformat(), **data)

    if not output_path:
        output_path = str(POSTMORTEM_DIR / f"{now.date()}-{_postmortem_slug(title)}.md")

    # Create the directory that is actually written to; for the default
    # path this is POSTMORTEM_DIR.
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(rendered, encoding="utf-8")

    return {
        "success": True,
        "path": output_path,
        "postmortem_path": output_path,
        # Encoded size, not character count: the text contains non-ASCII
        # characters such as "—".
        "bytes": len(rendered.encode("utf-8")),
    }