| """Quick evaluation — runs the 3 demo scenarios and scores them. |
| |
| Produces bench/results/quick_eval.json and prints a summary table. |
| Does NOT apply real Chaos Mesh — uses the pre-defined alert payloads |
| from inference.py (same as the HF demo). |
| |
| Usage: |
| python bench/quick_eval.py |
| """ |

import asyncio
import json
import os
import sys
import time
from pathlib import Path
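
# Load key=value pairs from the repo-level .env (if present) so settings such as
# AGENT_MODEL / BACKEND are picked up without exporting them manually.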
_env = Path(__file__).parent.parent / ".env"
if _env.exists():
    for line in _env.read_text().splitlines():
        line = line.strip()
        if line and not line.startswith("#") and "=" in line:
            k, _, v = line.partition("=")
            os.environ.setdefault(k.strip(), v.strip())
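
# Replace characters the console cannot encode instead of crashing
# (e.g. on Windows cp1252 terminals).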
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
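
# Pre-defined Alertmanager-style webhook payloads for the three demo scenarios,
# the same canned alerts inference.py uses for the HF demo (no Chaos Mesh required).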
SCENARIOS = {
    "hist-cloudflare-2019": {
        "commonLabels": {"alertname": "HighCPUSaturation", "severity": "critical", "namespace": "default"},
        "commonAnnotations": {"summary": "CPU saturation on frontend — Cloudflare 2019 replay"},
        "alerts": [{"status": "firing", "labels": {"alertname": "HighCPUSaturation", "pod": "frontend-xxx", "severity": "critical"}, "startsAt": "2026-05-09T10:00:00Z"}],
    },
    "hist-github-2018": {
        "commonLabels": {"alertname": "DatabaseFailoverLoop", "severity": "critical", "namespace": "default"},
        "commonAnnotations": {"summary": "Cloud SQL primary killed — replica promotion loop"},
        "alerts": [{"status": "firing", "labels": {"alertname": "DatabaseFailoverLoop", "severity": "critical"}, "startsAt": "2026-05-09T10:00:00Z"}],
    },
    "sf-001": {
        "commonLabels": {"alertname": "PodCrashLooping", "severity": "warning", "namespace": "default"},
        "commonAnnotations": {"summary": "cartservice pod killed by OOMKill"},
        "alerts": [{"status": "firing", "labels": {"alertname": "PodCrashLooping", "pod": "cartservice-xxx", "severity": "warning"}, "startsAt": "2026-05-09T10:00:00Z"}],
    },
}


def score_incident(incident: dict, elapsed_s: float, scenario_id: str) -> dict:
    """Compute reward metrics for a completed incident chain."""
    triage = incident.get("triage", {}).get("final", {})
    diagnosis = incident.get("diagnosis", {}).get("final", {})
    remediation = incident.get("remediation", {}).get("final", {})
    comms = incident.get("comms", {}).get("final", {})

    severity = triage.get("severity", "UNKNOWN")
    root_cause = diagnosis.get("root_cause") or diagnosis.get("specific")
    outcome = remediation.get("outcome", "unresolved")
    postmortem_path = comms.get("postmortem_path")
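
    # Per-stage sub-scores in [0, 1], combined below with fixed weights:
    # triage 15%, diagnosis 30%, remediation 35%, comms 10%, speed 10%.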
    triage_score = 1.0 if severity in {"P0", "P1", "P2", "P3"} else 0.0
    diagnosis_score = 0.8 if root_cause and not isinstance(root_cause, dict) else (0.6 if root_cause else 0.0)
    remediation_score = {"resolved": 1.0, "partial": 0.7, "escalated": 0.5, "unresolved": 0.2}.get(outcome, 0.0)
    comms_score = 1.0 if (postmortem_path and Path(postmortem_path).exists()) else 0.5
    speed_score = min(1.0, max(0.0, 1.0 - (elapsed_s - 30) / 300)) if elapsed_s < 300 else 0.0

    total = (
        triage_score * 0.15 +
        diagnosis_score * 0.30 +
        remediation_score * 0.35 +
        comms_score * 0.10 +
        speed_score * 0.10
    )
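
    # Turn counts per stage, taken from the length of each recorded trajectory.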
    triage_turns = len(incident.get("triage", {}).get("trajectory", []))
    diagnosis_turns = len(incident.get("diagnosis", {}).get("trajectory", []))
    remediation_turns = len(incident.get("remediation", {}).get("trajectory", []))

    return {
        "scenario_id": scenario_id,
        "elapsed_s": round(elapsed_s, 1),
        "severity": severity,
        "outcome": outcome,
        "root_cause_found": bool(root_cause),
        "postmortem_saved": bool(postmortem_path and Path(postmortem_path).exists()),
        "turns": triage_turns + diagnosis_turns + remediation_turns,
        "scores": {
            "triage": round(triage_score, 2),
            "diagnosis": round(diagnosis_score, 2),
            "remediation": round(remediation_score, 2),
            "comms": round(comms_score, 2),
            "speed": round(speed_score, 2),
        },
        "total": round(total, 3),
    }
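

# Run a single scenario end-to-end through agents.coordinator.handle_incident and score it.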
async def run_scenario(scenario_id: str) -> dict:
    from agents.coordinator import handle_incident
    from agents.stream import get_history

    alert = SCENARIOS[scenario_id]
    alert["scenario_id"] = scenario_id

    print(f"\n[-->] {scenario_id}")
    t0 = time.time()
    incident = await handle_incident(alert)
    elapsed = time.time() - t0
    return score_incident(incident, elapsed, scenario_id)
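

# Run every scenario sequentially, aggregate the headline metrics, and write the JSON report.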
async def main():
    results = []
    for sid in SCENARIOS:
        r = await run_scenario(sid)
        results.append(r)
        print(f" {r['outcome']:12s} {r['elapsed_s']:6.1f}s score={r['total']:.3f}")

    avg_score = sum(r["total"] for r in results) / len(results)
    avg_turns = sum(r["turns"] for r in results) / len(results)
    resolved = sum(1 for r in results if r["outcome"] in {"resolved", "partial"})
    postmortems = sum(1 for r in results if r["postmortem_saved"])
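
    # Headline metrics plus the per-scenario breakdown, saved to bench/results/quick_eval.json below.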
    summary = {
        "model": os.getenv("AGENT_MODEL", "Qwen/Qwen2.5-7B-Instruct"),
        "backend": os.getenv("BACKEND", "vllm"),
        "scenarios": len(results),
        "resolution_rate": resolved / len(results),
        "avg_reward": avg_score,
        "avg_turns": avg_turns,
        "postmortem_rate": postmortems / len(results),
        "results": results,
    }

    out = Path("bench/results/quick_eval.json")
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(summary, indent=2), encoding="utf-8")

    print(f"\n{'='*60}")
    print(f" QUICK EVAL SUMMARY ({len(results)} scenarios)")
    print(f"{'='*60}")
    print(f" Resolution rate: {resolved}/{len(results)} ({100*resolved//len(results)}%)")
    print(f" Avg reward: {avg_score:.3f}")
    print(f" Avg turns: {avg_turns:.1f}")
    print(f" Postmortem rate: {postmortems}/{len(results)}")
    print(f" Results saved: {out}")


if __name__ == "__main__":
    asyncio.run(main())