"""AtlasOps — Gradio Ops Console.

Five tabs:
  Live Ops   — trigger replays, watch agent timeline, see live Grafana iframe
  Incidents  — browse past incident records + postmortems
  Bench      — comparison table (baseline vs grpo_v3)
  Replays    — one-click historical incident buttons
  About      — architecture + judging evidence
"""
|
|
| import asyncio |
| import json |
| import os |
| import subprocess |
| from pathlib import Path |
|
|
| import gradio as gr |
| import requests |
|
|
|
|
| |
| |
# External service endpoints, read from the environment once at import time.
# The first four default to an empty string; the UI code in this file treats
# an empty value as "not configured".
GRAFANA_URL = os.getenv("GRAFANA_URL", "")
JAEGER_URL = os.getenv("JAEGER_URL", "")
ARGOCD_URL = os.getenv("ARGOCD_URL", "")
BOUTIQUE_URL = os.getenv("BOUTIQUE_URL", "")
# Base URL of the coordinator whose `/thoughts` endpoint is polled by
# _fetch_thoughts(); defaults to a local instance on port 9099.
COORDINATOR_URL = os.getenv("COORDINATOR_URL", "http://localhost:9099")
|
|
# Icon prefixes used when rendering agent thoughts: one per agent role and one
# per thought phase. Lookups fall back to a bullet when a key is missing.
# NOTE(review): most icon strings below look mis-decoded (UTF-8 emoji read
# with a single-byte Greek code page). Only the "conclusion" icon, whose
# literal had been split across two lines, is restored here; the rest are
# kept exactly as found and should be re-checked against the original source.
ROLE_ICONS = {"triage": "π΄", "diagnosis": "π", "remediation": "π§", "comms": "π£"}
PHASE_ICONS = {"tool_call": "β", "tool_result": "β", "conclusion": "✅", "thinking": "π"}
|
|
# Repository-relative locations of the artifacts this console reads.
# They are relative paths, so they resolve against the process working directory.
TRAJECTORIES_DIR = Path("data/trajectories")  # per-incident *.json records
RESULTS_DIR = Path("bench/results")  # holds comparison_table.md
POSTMORTEM_DIR = Path("docs/postmortems")  # not referenced by the code visible here
CHAOS_DIR = Path("bench/chaos_manifests")  # root for <scenario>.yaml manifests
|
|
# Path of the kubectl binary used for every cluster call in this module.
# Resolution order:
#   1. the KUBECTL_PATH environment variable, when set and non-empty;
#   2. the historical Windows Cloud SDK location, when that file exists on
#      this machine (keeps the previous default working where it worked);
#   3. plain "kubectl", resolved through PATH when the subprocess is started.
_LEGACY_KUBECTL = "C:/Users/NSEIT/AppData/Local/Google/Cloud SDK/google-cloud-sdk/bin/kubectl.exe"
KUBECTL = os.getenv("KUBECTL_PATH") or (
    _LEGACY_KUBECTL if Path(_LEGACY_KUBECTL).exists() else "kubectl"
)
|
|
# Historical-incident replays: UI label -> manifest path.
# Each value is passed to _apply_chaos(), which resolves it relative to
# CHAOS_DIR and appends ".yaml".
NAMED_REPLAYS = {
    "Cloudflare 2019 β Regex CPU Storm": "named_replays/hist-cloudflare-2019",
    "AWS S3 2017 β Accidental Scale-to-0": "named_replays/hist-aws-s3-2017",
    "GitHub 2018 β DB Failover Loop": "named_replays/hist-github-2018",
    "Datadog 2023 β DNS Failure Cascade": "named_replays/hist-datadog-2023",
    "Discord 2022 β Cache Thundering Herd":"named_replays/hist-discord-2022",
    "Fastly 2021 β Config Bug (VCL)": "named_replays/hist-fastly-2021",
    "Facebook BGP 2021 β Route Withdraw": "named_replays/hist-facebook-bgp-2021",
    "Slack 2022 β HTTP/2 Misconfig": "named_replays/hist-slack-2022",
    "Azure DNS 2019 β Stale DNS": "named_replays/hist-azure-dns-2019",
    "Knight Capital 2012 β Bad Deploy": "named_replays/hist-knight-capital-2012",
}
|
|
# Single-fault scenarios: UI label -> manifest path.
# Values use the same convention as NAMED_REPLAYS (relative to CHAOS_DIR,
# without the ".yaml" suffix). The first key is the default dropdown value
# on the Live Ops tab.
SINGLE_FAULT = {
    "sf-001: cartservice pod-kill": "single_fault/sf-001",
    "sf-002: paymentservice CPU hog": "single_fault/sf-002",
    "sf-003: checkoutservice OOM": "single_fault/sf-003",
    "sf-004: frontend 50% packet loss": "single_fault/sf-004",
    "sf-005: Redis β cartservice partition": "single_fault/sf-005",
    "sf-006: DNS failure on auth path": "single_fault/sf-006",
    "sf-007: emailservice disk fill": "single_fault/sf-007",
    "sf-008: paymentservice clock skew":"single_fault/sf-008",
}
|
|
|
|
| |
| def _kubectl(*args) -> str: |
| env = os.environ.copy() |
| env["USE_GKE_GCLOUD_AUTH_PLUGIN"] = "True" |
| r = subprocess.run([KUBECTL] + list(args), capture_output=True, text=True, env=env, timeout=15) |
| return r.stdout + (("\n[stderr] " + r.stderr) if r.returncode != 0 else "") |
|
|
|
|
| def _apply_chaos(scenario_path: str) -> str: |
| manifest = CHAOS_DIR / f"{scenario_path}.yaml" |
| if not manifest.exists(): |
| return f"β Manifest not found: {manifest}" |
| env = os.environ.copy() |
| env["USE_GKE_GCLOUD_AUTH_PLUGIN"] = "True" |
| r = subprocess.run([KUBECTL, "apply", "-f", str(manifest)], capture_output=True, text=True, env=env) |
| return r.stdout if r.returncode == 0 else f"β {r.stderr}" |
|
|
|
|
| def _reset_chaos() -> str: |
| env = os.environ.copy() |
| env["USE_GKE_GCLOUD_AUTH_PLUGIN"] = "True" |
| r = subprocess.run( |
| [KUBECTL, "delete", "podchaos,networkchaos,stresschaos,dnschaos,iochaos,timechaos", |
| "--all", "-A", "--ignore-not-found=true"], |
| capture_output=True, text=True, env=env, |
| ) |
| return "β
All chaos deleted" if r.returncode == 0 else f"β {r.stderr}" |
|
|
|
|
| def _load_comparison_table() -> str: |
| p = RESULTS_DIR / "comparison_table.md" |
| if p.exists(): |
| return p.read_text(encoding="utf-8") |
| return "No benchmark results yet. Run: `make bench-baseline` then `make bench MODEL=checkpoints/grpo_v3`" |
|
|
|
|
| def _list_incidents() -> list[str]: |
| if not TRAJECTORIES_DIR.exists(): |
| return [] |
| return sorted([f.stem for f in TRAJECTORIES_DIR.glob("*.json")], reverse=True)[:20] |
|
|
|
|
| def _load_incident(incident_id: str) -> tuple[str, str]: |
| p = TRAJECTORIES_DIR / f"{incident_id}.json" |
| if not p.exists(): |
| return "Not found", "" |
| d = json.loads(p.read_text()) |
| timeline = [] |
| for role in ("triage", "diagnosis", "remediation", "comms"): |
| for entry in d.get(role, {}).get("trajectory", []): |
| ts = entry.get("turn", "?") |
| if "tool" in entry: |
| timeline.append(f"**{role.upper()}** t={ts} β `{entry['tool']}({json.dumps(entry.get('args',{}))[:60]}...)`") |
| else: |
| timeline.append(f"**{role.upper()}** t={ts} β _{str(entry.get('content',''))[:120]}_") |
| postmortem_path = d.get("comms", {}).get("final", {}).get("postmortem_path", "") |
| postmortem = Path(postmortem_path).read_text(encoding="utf-8") if postmortem_path and Path(postmortem_path).exists() else "_No postmortem generated yet._" |
| return "\n\n".join(timeline), postmortem |
|
|
|
|
def _fetch_thoughts() -> str:
    """Pull latest agent thoughts from coordinator and format for display.

    Requests ``{COORDINATOR_URL}/thoughts`` with a 3 second timeout and
    renders the last 40 thoughts as Markdown, one paragraph per thought,
    prefixed with the role and phase icons.

    Returns:
        * A "coordinator not running" hint when the request fails or the
          response body is not valid JSON.
        * A "no active incident" hint when the payload carries no thoughts.
        * Otherwise the formatted thoughts. Entries with missing fields are
          rendered with placeholders instead of being reported as a
          coordinator outage.
    """
    try:
        resp = requests.get(f"{COORDINATOR_URL}/thoughts", timeout=3)
        payload = resp.json()
    except (requests.RequestException, ValueError):
        # Transport failure or non-JSON body: the only cases where the
        # "not running" hint is accurate.
        return "_Coordinator not running. Start with: `python agents/coordinator.py`_"

    thoughts = payload.get("thoughts", []) if isinstance(payload, dict) else []
    if not isinstance(thoughts, list) or not thoughts:
        return "_No active incident. Inject a chaos scenario to start._"

    lines = []
    for entry in thoughts[-40:]:
        if not isinstance(entry, dict):
            continue
        role = str(entry.get("role", "?"))
        icon = ROLE_ICONS.get(role, "β’")
        phase_icon = PHASE_ICONS.get(str(entry.get("phase", "")), "β’")
        role_label = f"**{icon} {role.upper()}**"
        lines.append(f"{role_label} {phase_icon} {entry.get('thought', '')}")
    return "\n\n".join(lines)
|
|
|
|
| def _grafana_iframe_html() -> str: |
| if not GRAFANA_URL: |
| return ( |
| "<div style='padding:20px;border:1px solid #333;border-radius:8px;'>" |
| "Grafana URL not configured. Set `GRAFANA_URL` to enable embedded metrics." |
| "</div>" |
| ) |
| base = GRAFANA_URL.rstrip("/") |
| src = f"{base}/d/k8s_views_pods/kubernetes-views-pods?orgId=1&refresh=10s&kiosk" |
| return f'<iframe src="{src}" width="100%" height="500px" frameborder="0"></iframe>' |
|
|
|
|
def _get_pod_summary() -> str:
    """Summarize pod health across all namespaces as Markdown.

    Runs ``kubectl get pods -A --no-headers`` through ``_kubectl`` and counts
    the output lines containing ``Running``. Lines containing neither
    ``Running`` nor ``Completed`` are listed verbatim in a fenced block under
    a "Problems" heading; this includes any ``[stderr]`` text that
    ``_kubectl`` appended.

    Returns:
        ``"No pods found"`` when kubectl printed nothing, otherwise the
        running count followed by the optional problems block.
    """
    out = _kubectl("get", "pods", "-A", "--no-headers")
    if not out.strip():
        return "No pods found"
    rows = [row for row in out.strip().split("\n") if row]
    running = sum(1 for row in rows if "Running" in row)
    problems = [row for row in rows if "Running" not in row and "Completed" not in row]
    # The two emoji below were mis-decoded in the source (the first one even
    # split its string literal across two lines); both are restored here.
    status = f"✅ {running} Running pods"
    if problems:
        status += "\n\n⚠️ **Problems:**\n```\n" + "\n".join(problems) + "\n```"
    return status
|
|
|
|
| |
def build_live_ops_tab():
    """Build the "Live Ops" tab: chaos injection, pod health, Grafana, agent thoughts.

    Called by ``build_app`` inside its ``gr.Blocks`` context. Components are
    created in display order, so statement order here is the page layout.

    NOTE(review): the original indentation of this function was lost; the
    ``with`` nesting below is the most plausible reading and should be
    confirmed against the rendered UI.
    """
    with gr.Tab("π¨ Live Ops"):
        # Header with the configured endpoints (or "not configured").
        gr.Markdown(f"""
## Real GKE Cluster β `atlasops` (us-central1)
**Grafana:** {GRAFANA_URL or 'not configured'} |
**Boutique:** {BOUTIQUE_URL or 'not configured'} |
**Argo CD:** {ARGOCD_URL or 'not configured'}
""")

        with gr.Row():
            # Left column: chaos controls and cluster health.
            with gr.Column(scale=1):
                gr.Markdown("### Inject Chaos")
                # Single-fault scenarios first, then the named replays;
                # the first single-fault entry is preselected.
                scenario_dd = gr.Dropdown(
                    choices=list(SINGLE_FAULT.keys()) + list(NAMED_REPLAYS.keys()),
                    label="Select Scenario",
                    value=list(SINGLE_FAULT.keys())[0],
                )
                with gr.Row():
                    inject_btn = gr.Button("βΆ Inject", variant="primary")
                    reset_btn = gr.Button("βΉ Reset All Chaos", variant="stop")
                chaos_out = gr.Textbox(label="Chaos Status", lines=3)

                gr.Markdown("### Cluster Health")
                refresh_btn = gr.Button("π Refresh Pods")
                # _get_pod_summary() runs here, while the UI is being built,
                # so a kubectl call happens at startup.
                pod_status = gr.Markdown(_get_pod_summary())

            # Right column: embedded Grafana dashboard (or a notice).
            with gr.Column(scale=2):
                gr.Markdown(f"### Grafana Live (real GKE metrics)")
                gr.HTML(_grafana_iframe_html())

        gr.Markdown("### π§ Agent Live Thoughts")
        # Initial content is fetched at build time; the timer at the end of
        # this function refreshes it afterwards.
        thoughts_out = gr.Markdown(
            _fetch_thoughts(),
            label="Agent narration β auto-refreshes every 3s",
            elem_id="thoughts-panel",
        )
        # NOTE(review): this script only defines refreshThoughts(); nothing
        # visible in this file calls it, and the fetched JSON is discarded.
        # The refresh is actually done by the gr.Timer below.
        gr.HTML("""
<script>
function refreshThoughts() {
const el = document.getElementById('thoughts-panel');
if (el) {
fetch('/thoughts').then(r=>r.json()).then(d=>{
// Gradio handles re-render via the timer below
});
}
}
</script>
""")

        def do_inject(scenario_name):
            # Resolve the dropdown label to a manifest path; labels from both
            # scenario tables are accepted.
            path = {**SINGLE_FAULT, **NAMED_REPLAYS}.get(scenario_name, "")
            if not path:
                return f"β Unknown scenario: {scenario_name}"
            return _apply_chaos(path)

        # Both chaos buttons report into the same status textbox.
        inject_btn.click(do_inject, inputs=[scenario_dd], outputs=[chaos_out])
        reset_btn.click(_reset_chaos, outputs=[chaos_out])
        refresh_btn.click(_get_pod_summary, outputs=[pod_status])

        # Poll the coordinator every 3 seconds and re-render the thoughts panel.
        gr.Timer(value=3).tick(_fetch_thoughts, outputs=[thoughts_out])
|
|
|
|
| |
def build_incidents_tab():
    """Build the "Incidents" tab: pick a stored incident, view timeline and postmortem.

    The dropdown is filled from ``_list_incidents()`` when the tab is built
    and again whenever the refresh button is clicked. Selecting an entry
    renders it through ``_load_incident``.
    """
    placeholder = "_Select an incident above_"

    with gr.Tab("π Incidents"):
        gr.Markdown("## Past Incident Records")
        incident_list = gr.Dropdown(
            choices=_list_incidents(),
            label="Select Incident",
            interactive=True,
        )
        refresh_list_btn = gr.Button("π Refresh List")

        # Two side-by-side panels: agent timeline on the left, postmortem on the right.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Agent Timeline")
                timeline_out = gr.Markdown(placeholder)
            with gr.Column():
                gr.Markdown("### Postmortem")
                postmortem_out = gr.Markdown(placeholder)

        # The callback keeps its original name and parameter name because
        # Gradio may derive endpoint metadata from them.
        def load(inc_id):
            if inc_id:
                timeline_md, postmortem_md = _load_incident(inc_id)
                return timeline_md, postmortem_md
            return "_No incident selected_", ""

        incident_list.change(
            load,
            inputs=[incident_list],
            outputs=[timeline_out, postmortem_out],
        )
        refresh_list_btn.click(
            lambda: gr.update(choices=_list_incidents()),
            outputs=[incident_list],
        )
|
|
|
|
| |
def build_bench_tab():
    """Build the "Benchmark" tab: the comparison table plus a refresh button.

    The table is loaded once while the tab is built and reloaded from disk
    on every click of the refresh button.
    """
    intro = "## AtlasOps β Benchmark Results\n\nComparison of baseline (v2) vs SFT vs GRPO on 28 frozen scenarios."
    with gr.Tab("π Benchmark"):
        gr.Markdown(intro)
        refresh_bench_btn = gr.Button("π Refresh Results")
        initial_table = _load_comparison_table()
        bench_out = gr.Markdown(initial_table)
        refresh_bench_btn.click(_load_comparison_table, outputs=[bench_out])
|
|
|
|
| |
def build_replays_tab():
    """Build the "Historical Replays" tab: one button per named replay.

    Buttons are laid out in two rows (the first five replays, then the rest).
    Each button applies its manifest through ``_apply_chaos`` and writes the
    result to its own hidden textbox. A final button deletes all chaos and
    reports into a visible status box.
    """
    replay_entries = list(NAMED_REPLAYS.items())
    button_rows = (replay_entries[:5], replay_entries[5:])

    with gr.Tab("π¬ Historical Replays"):
        gr.Markdown("## 10 Named Historical Incident Replays\n\nEach button injects the real Chaos Mesh experiment that replicates a famous production incident.")
        for row_entries in button_rows:
            with gr.Row():
                for label, manifest in row_entries:
                    replay_btn = gr.Button(label, size="sm")
                    hidden_out = gr.Textbox(visible=False)
                    # Bind the manifest as a default so each button keeps its own path.
                    replay_btn.click(lambda p=manifest: _apply_chaos(p), outputs=[hidden_out])
        reset_all = gr.Button("βΉ Reset All Chaos", variant="stop")
        reset_out = gr.Textbox(label="Status", lines=2)
        reset_all.click(_reset_chaos, outputs=[reset_out])
|
|
|
|
| |
def build_about_tab():
    """Build the static "About" tab: a single Markdown page.

    The page covers the stack components, the agent chain, the training
    pipeline and the repository location.
    """
    about_markdown = """
## AtlasOps β Multi-Agent Incident Response on Real GCP/GKE

**AMD Developer Hackathon 2026** | Submission: May 10, 2026

### What Makes This Real
| Component | Details |
|---|---|
| **Cluster** | GKE Standard `us-central1`, 3Γ e2-standard-4 nodes |
| **App** | Google Online Boutique v0.10.0 β 11 microservices, gRPC/protobuf |
| **Chaos** | Chaos Mesh v2: PodChaos, NetworkChaos, StressChaos, DNSChaos, IOChaos, TimeChaos |
| **Observability** | Prometheus + Grafana + Jaeger + OTel Collector + Alertmanager |
| **GitOps** | Argo CD β agents execute real `argocd rollback` |
| **GCP Services** | Cloud SQL (Postgres 15), Cloud PubSub, Cloud Monitoring API, Cloud Logging |
| **GPU** | AMD MI300X (192 GB HBM3) β 5 models co-hosted via vLLM |
| **Models** | Qwen2.5-7BΓ4 (LoRA agents) + Qwen2.5-72B (judge) |
| **Tools** | 20 real SRE tools vs kube-sre-gym's 7 |

### Agent Chain
```
Alertmanager β Coordinator β Triage β Diagnosis β Remediation β Comms β Postmortem
```

### Training Pipeline
```
5k real-tool trajectories β SFT (Qwen2.5-7B) β GRPO on AMD MI300X β +28pp resolution rate
```

### Repository
All code, manifests, benchmarks, and postmortems at: `github.com/your-repo/atlasops`
"""
    with gr.Tab("βΉοΈ About"):
        gr.Markdown(about_markdown)
|
|
|
|
| |
def build_app():
    """Assemble the whole console and return the ``gr.Blocks`` app.

    Creates the Blocks container (title, red/gray base theme, larger tab
    labels), adds the page heading, then builds the five tabs in display
    order: Live Ops, Incidents, Benchmark, Historical Replays, About.
    """
    tab_builders = (
        build_live_ops_tab,
        build_incidents_tab,
        build_bench_tab,
        build_replays_tab,
        build_about_tab,
    )
    console_theme = gr.themes.Base(primary_hue="red", neutral_hue="gray")
    with gr.Blocks(
        title="AtlasOps Ops Console",
        theme=console_theme,
        css=".tab-nav button { font-size: 1.1em; }",
    ) as demo:
        gr.Markdown("# β‘ AtlasOps β Real-Time Incident Response on GKE")
        # Tabs appear in the order their builders are called.
        for build_tab in tab_builders:
            build_tab()
    return demo
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the Blocks app and serve it.
    demo = build_app()
    # Listens on all interfaces (0.0.0.0) on port 7860 without a public share link.
    # NOTE(review): no authentication is passed to launch() here, and this
    # console can apply and delete chaos experiments; confirm that the
    # network exposure is intended.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
|
|