File size: 15,385 Bytes
7e9a520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
"""AtlasOps β€” Gradio Ops Console.

Five tabs:
  Live Ops   β€” trigger replays, watch agent timeline, see live Grafana iframe
  Incidents  β€” browse past incident records + postmortems
  Bench      β€” comparison table (baseline vs grpo_v3)
  Replays    β€” one-click historical incident buttons
  About      β€” architecture + judging evidence
"""

import asyncio
import json
import os
import subprocess
from pathlib import Path

import gradio as gr
import requests


# ── Config ───────────────────────────────────────────────────────────────────
# Keep runtime URLs environment-driven to avoid stale hardcoded infra endpoints.
# Optional dashboard/service endpoints: an unset variable yields "" so the UI
# can show "not configured" instead of linking to a stale address.
GRAFANA_URL, JAEGER_URL, ARGOCD_URL, BOUTIQUE_URL = (
    os.environ.get(var_name, "")
    for var_name in ("GRAFANA_URL", "JAEGER_URL", "ARGOCD_URL", "BOUTIQUE_URL")
)
# The coordinator is the only endpoint with a built-in local default.
COORDINATOR_URL = os.environ.get("COORDINATOR_URL", "http://localhost:9099")

# Emoji shown in front of each agent role in the live-thoughts panel.
ROLE_ICONS  = {"triage": "🔴", "diagnosis": "🔍", "remediation": "🔧", "comms": "📣"}
# Glyph shown for the phase of each streamed agent event.
PHASE_ICONS = {"tool_call": "→", "tool_result": "✓", "conclusion": "★", "thinking": "💭"}

# Data locations, relative to the process working directory.
TRAJECTORIES_DIR = Path("data/trajectories")
RESULTS_DIR      = Path("bench/results")
POSTMORTEM_DIR   = Path("docs/postmortems")
CHAOS_DIR        = Path("bench/chaos_manifests")

# kubectl binary resolution order:
#   1. KUBECTL_PATH environment variable, when set;
#   2. the legacy developer-machine install path, only if that file exists;
#   3. plain "kubectl", resolved through PATH (works on any machine).
_LEGACY_KUBECTL = "C:/Users/NSEIT/AppData/Local/Google/Cloud SDK/google-cloud-sdk/bin/kubectl.exe"
KUBECTL = os.getenv("KUBECTL_PATH") or (
    _LEGACY_KUBECTL if Path(_LEGACY_KUBECTL).exists() else "kubectl"
)

# Display label -> manifest path (relative to CHAOS_DIR, without ".yaml") for
# the named historical incident replays.
NAMED_REPLAYS = {
    "Cloudflare 2019 — Regex CPU Storm":   "named_replays/hist-cloudflare-2019",
    "AWS S3 2017 — Accidental Scale-to-0": "named_replays/hist-aws-s3-2017",
    "GitHub 2018 — DB Failover Loop":      "named_replays/hist-github-2018",
    "Datadog 2023 — DNS Failure Cascade":  "named_replays/hist-datadog-2023",
    "Discord 2022 — Cache Thundering Herd":"named_replays/hist-discord-2022",
    "Fastly 2021 — Config Bug (VCL)":      "named_replays/hist-fastly-2021",
    "Facebook BGP 2021 — Route Withdraw":  "named_replays/hist-facebook-bgp-2021",
    "Slack 2022 — HTTP/2 Misconfig":       "named_replays/hist-slack-2022",
    "Azure DNS 2019 — Stale DNS":          "named_replays/hist-azure-dns-2019",
    "Knight Capital 2012 — Bad Deploy":    "named_replays/hist-knight-capital-2012",
}

# Display label -> manifest path for the single-fault scenarios.
SINGLE_FAULT = {
    "sf-001: cartservice pod-kill":     "single_fault/sf-001",
    "sf-002: paymentservice CPU hog":   "single_fault/sf-002",
    "sf-003: checkoutservice OOM":      "single_fault/sf-003",
    "sf-004: frontend 50% packet loss": "single_fault/sf-004",
    "sf-005: Redis ↔ cartservice partition": "single_fault/sf-005",
    "sf-006: DNS failure on auth path": "single_fault/sf-006",
    "sf-007: emailservice disk fill":   "single_fault/sf-007",
    "sf-008: paymentservice clock skew":"single_fault/sf-008",
}


# ── Helpers ───────────────────────────────────────────────────────────────────
def _kubectl(*args) -> str:
    """Run ``kubectl`` with *args* and return its output as text.

    Args:
        *args: Command-line arguments placed after the kubectl binary.

    Returns:
        The command's stdout. When the command exits non-zero, stderr is
        appended after a ``[stderr]`` marker. If the binary cannot be
        started, or does not finish within 15 seconds, a ``[stderr]``-marked
        description of the failure is returned instead of raising, so UI
        callbacks (and app start-up) do not crash on a missing or hung
        kubectl.
    """
    env = os.environ.copy()
    # NOTE(review): presumably needed for GKE auth-plugin credentials — confirm.
    env["USE_GKE_GCLOUD_AUTH_PLUGIN"] = "True"
    try:
        r = subprocess.run(
            [KUBECTL, *args], capture_output=True, text=True, env=env, timeout=15
        )
    except subprocess.TimeoutExpired:
        return "\n[stderr] kubectl timed out after 15s"
    except OSError as exc:
        # Raised when the binary is missing or not executable.
        return f"\n[stderr] could not run kubectl: {exc}"
    return r.stdout + (("\n[stderr] " + r.stderr) if r.returncode != 0 else "")


def _apply_chaos(scenario_path: str) -> str:
    """Apply the chaos manifest for *scenario_path* with ``kubectl apply``.

    Args:
        scenario_path: Manifest location relative to ``CHAOS_DIR``, without
            the ``.yaml`` suffix (e.g. ``"single_fault/sf-001"``).

    Returns:
        kubectl's stdout on success. On any failure (manifest missing,
        kubectl not runnable, timeout, non-zero exit) a message starting
        with "❌" is returned instead of raising.
    """
    manifest = CHAOS_DIR / f"{scenario_path}.yaml"
    if not manifest.exists():
        return f"❌ Manifest not found: {manifest}"
    env = os.environ.copy()
    env["USE_GKE_GCLOUD_AUTH_PLUGIN"] = "True"
    try:
        r = subprocess.run(
            [KUBECTL, "apply", "-f", str(manifest)],
            capture_output=True, text=True, env=env,
            timeout=60,  # bound the call so a hung cluster cannot freeze the UI
        )
    except subprocess.TimeoutExpired:
        return "❌ kubectl apply timed out after 60s"
    except OSError as exc:
        return f"❌ Could not run kubectl: {exc}"
    return r.stdout if r.returncode == 0 else f"❌ {r.stderr}"


def _reset_chaos() -> str:
    """Delete every chaos experiment of the supported kinds in all namespaces.

    Runs ``kubectl delete`` over the podchaos, networkchaos, stresschaos,
    dnschaos, iochaos and timechaos resources with ``--all -A``.

    Returns:
        A "✅" confirmation on success; otherwise a message starting with
        "❌" (kubectl stderr, a timeout notice, or the reason kubectl could
        not be started). Never raises for those failures.
    """
    env = os.environ.copy()
    env["USE_GKE_GCLOUD_AUTH_PLUGIN"] = "True"
    try:
        r = subprocess.run(
            [KUBECTL, "delete", "podchaos,networkchaos,stresschaos,dnschaos,iochaos,timechaos",
             "--all", "-A", "--ignore-not-found=true"],
            capture_output=True, text=True, env=env,
            timeout=60,  # bound the call so a hung cluster cannot freeze the UI
        )
    except subprocess.TimeoutExpired:
        return "❌ kubectl delete timed out after 60s"
    except OSError as exc:
        return f"❌ Could not run kubectl: {exc}"
    return "✅ All chaos deleted" if r.returncode == 0 else f"❌ {r.stderr}"


def _load_comparison_table() -> str:
    """Return the benchmark comparison table as Markdown.

    Reads ``comparison_table.md`` from ``RESULTS_DIR``; when the file does
    not exist, returns a hint describing how to generate it.
    """
    table_file = RESULTS_DIR / "comparison_table.md"
    if not table_file.exists():
        return "No benchmark results yet. Run: `make bench-baseline` then `make bench MODEL=checkpoints/grpo_v3`"
    return table_file.read_text(encoding="utf-8")


def _list_incidents() -> list[str]:
    """Return up to 20 incident ids, in descending name order.

    An incident id is the file stem of a ``*.json`` file directly inside
    ``TRAJECTORIES_DIR``. Returns an empty list when that directory is absent.
    """
    if not TRAJECTORIES_DIR.exists():
        return []
    incident_ids = [record.stem for record in TRAJECTORIES_DIR.glob("*.json")]
    incident_ids.sort(reverse=True)
    return incident_ids[:20]


def _load_incident(incident_id: str) -> tuple[str, str]:
    """Load one incident record and render its timeline and postmortem.

    Args:
        incident_id: File stem of a JSON record inside ``TRAJECTORIES_DIR``.

    Returns:
        ``(timeline_markdown, postmortem_markdown)``. The timeline has one
        paragraph per trajectory entry, grouped by role in the order triage,
        diagnosis, remediation, comms. Returns ``("Not found", "")`` when the
        record file is missing, and an explanatory message with an empty
        postmortem when the file cannot be read or is not a JSON object.
    """
    p = TRAJECTORIES_DIR / f"{incident_id}.json"
    if not p.exists():
        return "Not found", ""
    try:
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        d = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:  # ValueError covers bad JSON / bad UTF-8
        return f"Could not read incident record: {exc}", ""
    if not isinstance(d, dict):
        return "Malformed incident record", ""

    timeline = []
    for role in ("triage", "diagnosis", "remediation", "comms"):
        section = d.get(role) or {}
        for entry in section.get("trajectory", []):
            if not isinstance(entry, dict):
                continue
            ts = entry.get("turn", "?")
            if "tool" in entry:
                args_text = json.dumps(entry.get("args", {}))
                if len(args_text) > 60:
                    # Only mark the arguments as elided when they really were cut.
                    args_text = args_text[:60] + "..."
                timeline.append(f"**{role.upper()}** t={ts} → `{entry['tool']}({args_text})`")
            else:
                timeline.append(f"**{role.upper()}** t={ts} → _{str(entry.get('content', ''))[:120]}_")

    comms = d.get("comms") or {}
    postmortem_path = (comms.get("final") or {}).get("postmortem_path", "")
    if postmortem_path and Path(postmortem_path).exists():
        postmortem = Path(postmortem_path).read_text(encoding="utf-8")
    else:
        postmortem = "_No postmortem generated yet._"
    return "\n\n".join(timeline), postmortem


def _fetch_thoughts() -> str:
    """Pull latest agent thoughts from coordinator and format for display.

    Issues ``GET {COORDINATOR_URL}/thoughts`` with a 3-second timeout and
    renders the last 40 events as Markdown paragraphs.

    Returns:
        Markdown text. A "coordinator not running" hint is returned when the
        request fails or the body is not valid JSON; a "no active incident"
        hint when the payload carries no thoughts. Events with missing fields
        are rendered with placeholders rather than aborting the whole panel.
    """
    try:
        r = requests.get(f"{COORDINATOR_URL}/thoughts", timeout=3)
        payload = r.json()
    except (requests.RequestException, ValueError):
        # Connection/timeout problems, or a body that is not JSON.
        return "_Coordinator not running. Start with: `python agents/coordinator.py`_"

    thoughts = payload.get("thoughts", []) if isinstance(payload, dict) else []
    if not isinstance(thoughts, list) or not thoughts:
        return "_No active incident. Inject a chaos scenario to start._"

    lines = []
    for t in thoughts[-40:]:  # show last 40 events
        if not isinstance(t, dict):
            continue
        role = str(t.get("role", "?"))
        icon = ROLE_ICONS.get(role, "•")
        phase_icon = PHASE_ICONS.get(str(t.get("phase")), "•")
        role_label = f"**{icon} {role.upper()}**"
        lines.append(f"{role_label} {phase_icon} {t.get('thought', '')}")
    return "\n\n".join(lines)


def _grafana_iframe_html() -> str:
    """Return the HTML snippet for the embedded Grafana pods dashboard.

    With ``GRAFANA_URL`` set, the result is an ``<iframe>`` pointing at the
    ``k8s_views_pods`` dashboard in kiosk mode (any trailing slash on the
    base URL is dropped first). Otherwise a bordered notice explains that
    the variable must be configured.
    """
    if GRAFANA_URL:
        dashboard_url = (
            GRAFANA_URL.rstrip("/")
            + "/d/k8s_views_pods/kubernetes-views-pods?orgId=1&refresh=10s&kiosk"
        )
        return '<iframe src="' + dashboard_url + '" width="100%" height="500px" frameborder="0"></iframe>'

    notice_parts = [
        "<div style='padding:20px;border:1px solid #333;border-radius:8px;'>",
        "Grafana URL not configured. Set `GRAFANA_URL` to enable embedded metrics.",
        "</div>",
    ]
    return "".join(notice_parts)


def _get_pod_summary() -> str:
    """Summarise cluster pod health as Markdown.

    Runs ``kubectl get pods -A --no-headers`` via ``_kubectl`` and classifies
    each output row by its STATUS column (the fourth whitespace-separated
    field), so a pod whose *name* or *namespace* merely contains "Running"
    or "Completed" is not mistaken for a healthy one.

    Returns:
        ``"No pods found"`` for empty output; otherwise the count of Running
        pods, followed by a fenced block listing every row that is neither
        Running nor Completed. Rows with fewer than four fields (for example
        error text) are listed as problems.
    """
    out = _kubectl("get", "pods", "-A", "--no-headers")
    if not out.strip():
        return "No pods found"

    running = 0
    problem = []
    for row in out.strip().split("\n"):
        if not row:
            continue
        cols = row.split()
        # Columns with -A: NAMESPACE NAME READY STATUS RESTARTS AGE
        pod_status = cols[3] if len(cols) > 3 else ""
        if pod_status == "Running":
            running += 1
        elif pod_status != "Completed":
            problem.append(row)

    status = f"✅ {running} Running pods"
    if problem:
        status += "\n\n⚠️ **Problems:**\n```\n" + "\n".join(problem) + "\n```"
    return status


# ── Tab: Live Ops ──────────────────────────────────────────────────────────────
def build_live_ops_tab():
    """Build the "Live Ops" tab.

    Lays out, inside the currently active ``gr.Blocks`` context:
    a scenario dropdown with inject/reset buttons, a pod-health panel with a
    manual refresh button, the embedded Grafana dashboard, and a panel of
    live agent thoughts.

    Pod status and agent thoughts are fetched once while the layout is
    built; afterwards the thoughts panel is refreshed by a 3-second
    ``gr.Timer`` and the pod panel by its refresh button.
    """
    with gr.Tab("🚨 Live Ops"):
        # Markdown body is kept flush-left so it is not rendered as a code block.
        gr.Markdown(f"""
## Real GKE Cluster — `atlasops` (us-central1)
**Grafana:** {GRAFANA_URL or 'not configured'} &nbsp;|&nbsp;
**Boutique:** {BOUTIQUE_URL or 'not configured'} &nbsp;|&nbsp;
**Argo CD:** {ARGOCD_URL or 'not configured'}
""")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Inject Chaos")
                scenario_dd = gr.Dropdown(
                    choices=list(SINGLE_FAULT.keys()) + list(NAMED_REPLAYS.keys()),
                    label="Select Scenario",
                    value=list(SINGLE_FAULT.keys())[0],
                )
                with gr.Row():
                    inject_btn = gr.Button("▶ Inject", variant="primary")
                    reset_btn  = gr.Button("⏹ Reset All Chaos", variant="stop")
                chaos_out = gr.Textbox(label="Chaos Status", lines=3)

                gr.Markdown("### Cluster Health")
                refresh_btn = gr.Button("🔄 Refresh Pods")
                pod_status = gr.Markdown(_get_pod_summary())

            with gr.Column(scale=2):
                gr.Markdown("### Grafana Live (real GKE metrics)")
                gr.HTML(_grafana_iframe_html())

        gr.Markdown("### 🧠 Agent Live Thoughts")
        thoughts_out = gr.Markdown(
            _fetch_thoughts(),
            label="Agent narration — auto-refreshes every 3s",
            elem_id="thoughts-panel",
        )

        def do_inject(scenario_name):
            """Resolve the selected label to a manifest path and apply it."""
            path = {**SINGLE_FAULT, **NAMED_REPLAYS}.get(scenario_name, "")
            if not path:
                return f"❌ Unknown scenario: {scenario_name}"
            return _apply_chaos(path)

        inject_btn.click(do_inject, inputs=[scenario_dd], outputs=[chaos_out])
        reset_btn.click(_reset_chaos, outputs=[chaos_out])
        refresh_btn.click(_get_pod_summary, outputs=[pod_status])

        # Auto-refresh thoughts every 3 seconds
        gr.Timer(value=3).tick(_fetch_thoughts, outputs=[thoughts_out])


# ── Tab: Incidents ─────────────────────────────────────────────────────────────
def build_incidents_tab():
    """Build the "Incidents" tab.

    Shows a dropdown of incident ids from ``_list_incidents()`` (populated
    when the layout is built, re-populated by the refresh button) and, for
    the selected incident, the agent timeline and postmortem returned by
    ``_load_incident`` side by side.
    """
    with gr.Tab("📋 Incidents"):
        gr.Markdown("## Past Incident Records")
        incident_list = gr.Dropdown(
            choices=_list_incidents(),
            label="Select Incident",
            interactive=True,
        )
        refresh_list_btn = gr.Button("🔄 Refresh List")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Agent Timeline")
                timeline_out = gr.Markdown("_Select an incident above_")
            with gr.Column():
                gr.Markdown("### Postmortem")
                postmortem_out = gr.Markdown("_Select an incident above_")

        def load(inc_id):
            """Return (timeline, postmortem) Markdown for the chosen incident."""
            if not inc_id:
                # Dropdown was cleared: reset the timeline and blank the postmortem.
                return "_No incident selected_", ""
            return _load_incident(inc_id)

        incident_list.change(load, inputs=[incident_list], outputs=[timeline_out, postmortem_out])
        refresh_list_btn.click(lambda: gr.update(choices=_list_incidents()), outputs=[incident_list])


# ── Tab: Bench ─────────────────────────────────────────────────────────────────
def build_bench_tab():
    """Build the "Benchmark" tab.

    Renders the Markdown returned by ``_load_comparison_table()`` (loaded
    once when the layout is built) and a button that reloads it on demand.
    """
    with gr.Tab("📊 Benchmark"):
        gr.Markdown("## AtlasOps — Benchmark Results\n\nComparison of baseline (v2) vs SFT vs GRPO on 28 frozen scenarios.")
        refresh_bench_btn = gr.Button("🔄 Refresh Results")
        bench_out = gr.Markdown(_load_comparison_table())
        refresh_bench_btn.click(_load_comparison_table, outputs=[bench_out])


# ── Tab: Replays ──────────────────────────────────────────────────────────────
def build_replays_tab():
    """Build the "Historical Replays" tab.

    Creates one button per entry in ``NAMED_REPLAYS`` (five per row) plus a
    "Reset All Chaos" button. Every button writes its result to the single
    visible status box, so a failed injection (e.g. a missing manifest) is
    shown to the operator instead of being discarded.
    """
    buttons_per_row = 5
    with gr.Tab("🎬 Historical Replays"):
        gr.Markdown(
            f"## {len(NAMED_REPLAYS)} Named Historical Incident Replays\n\n"
            "Each button injects the real Chaos Mesh experiment that replicates a famous production incident."
        )

        # Create the buttons first; handlers are wired once the shared status
        # box exists further down the layout.
        replay_buttons = []
        replay_names = list(NAMED_REPLAYS)
        for start in range(0, len(replay_names), buttons_per_row):
            with gr.Row():
                for name in replay_names[start:start + buttons_per_row]:
                    replay_buttons.append((gr.Button(name, size="sm"), NAMED_REPLAYS[name]))

        reset_all = gr.Button("⏹ Reset All Chaos", variant="stop")
        reset_out = gr.Textbox(label="Status", lines=2)

        for btn, path in replay_buttons:
            # Bind the path as a default so each handler keeps its own scenario.
            btn.click(lambda p=path: _apply_chaos(p), outputs=[reset_out])
        reset_all.click(_reset_chaos, outputs=[reset_out])


# ── Tab: About ─────────────────────────────────────────────────────────────────
def build_about_tab():
    """Build the static "About" tab describing the stack, agent chain and training pipeline."""
    with gr.Tab("ℹ️ About"):
        # Markdown body is kept flush-left so tables and fences render correctly.
        gr.Markdown("""
## AtlasOps — Multi-Agent Incident Response on Real GCP/GKE

**AMD Developer Hackathon 2026** | Submission: May 10, 2026

### What Makes This Real
| Component | Details |
|---|---|
| **Cluster** | GKE Standard `us-central1`, 3× e2-standard-4 nodes |
| **App** | Google Online Boutique v0.10.0 — 11 microservices, gRPC/protobuf |
| **Chaos** | Chaos Mesh v2: PodChaos, NetworkChaos, StressChaos, DNSChaos, IOChaos, TimeChaos |
| **Observability** | Prometheus + Grafana + Jaeger + OTel Collector + Alertmanager |
| **GitOps** | Argo CD — agents execute real `argocd rollback` |
| **GCP Services** | Cloud SQL (Postgres 15), Cloud PubSub, Cloud Monitoring API, Cloud Logging |
| **GPU** | AMD MI300X (192 GB HBM3) — 5 models co-hosted via vLLM |
| **Models** | Qwen2.5-7B×4 (LoRA agents) + Qwen2.5-72B (judge) |
| **Tools** | 20 real SRE tools vs kube-sre-gym's 7 |

### Agent Chain
```
Alertmanager → Coordinator → Triage → Diagnosis → Remediation → Comms → Postmortem
```

### Training Pipeline
```
5k real-tool trajectories → SFT (Qwen2.5-7B) → GRPO on AMD MI300X → +28pp resolution rate
```

### Repository
All code, manifests, benchmarks, and postmortems at: `github.com/your-repo/atlasops`
""")


# ── Main ───────────────────────────────────────────────────────────────────────
def build_app():
    """Assemble the full Ops Console and return the ``gr.Blocks`` app.

    The five tabs are added in display order: Live Ops, Incidents,
    Benchmark, Historical Replays, About.

    Returns:
        The un-launched ``gr.Blocks`` instance; the caller is responsible
        for calling ``.launch()``.
    """
    with gr.Blocks(
        title="AtlasOps Ops Console",
        theme=gr.themes.Base(primary_hue="red", neutral_hue="gray"),
        css=".tab-nav button { font-size: 1.1em; }",
    ) as demo:
        gr.Markdown("# ⚡ AtlasOps — Real-Time Incident Response on GKE")
        build_live_ops_tab()
        build_incidents_tab()
        build_bench_tab()
        build_replays_tab()
        build_about_tab()
    return demo


if __name__ == "__main__":
    # Script entry point: serve the console on all interfaces, port 7860,
    # without creating a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo = build_app()
    demo.launch(**launch_options)