"""Quick evaluation — runs the 3 demo scenarios and scores them.

Produces bench/results/quick_eval.json and prints a summary table.
Does NOT apply real Chaos Mesh — uses the pre-defined alert payloads
from inference.py (same as the HF demo).

Usage:
    python bench/quick_eval.py
"""
import asyncio
import json
import os
import sys
import time
from pathlib import Path
def load_env_file(path: Path) -> None:
    """Seed ``os.environ`` from a dotenv-style file.

    Only lines of the form ``KEY=VALUE`` are used; blank lines, lines starting
    with ``#`` and lines without ``=`` are ignored. Variables that are already
    set in the environment are never overridden.

    Args:
        path: Location of the file. A missing path (or one that is not a
            regular file) is silently skipped.
    """
    if not path.is_file():
        return
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which
    # would otherwise become part of the first key.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        key, _, value = entry.partition("=")
        key = key.strip()
        value = value.strip()
        # KEY="value" / KEY='value': drop one pair of matching quotes so the
        # quotes do not end up inside the environment value.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]
        if key:  # "=value" has no usable name
            os.environ.setdefault(key, value)


# The .env file is looked up one directory above the one containing this script.
_env = Path(__file__).parent.parent / ".env"
load_env_file(_env)
# Switch stdout to UTF-8, substituting unencodable characters, whenever the
# stream offers ``reconfigure``; streams without that method are left as-is.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(errors="replace", encoding="utf-8")
def _firing_alert(alertname, severity, summary, pod=None):
    """Build one demo alert payload containing a single firing alert.

    ``pod`` is added to the alert's labels only when given.
    """
    alert_labels = {"alertname": alertname}
    if pod is not None:
        alert_labels["pod"] = pod
    alert_labels["severity"] = severity
    return {
        "commonLabels": {
            "alertname": alertname,
            "severity": severity,
            "namespace": "default",
        },
        "commonAnnotations": {"summary": summary},
        "alerts": [
            {
                "status": "firing",
                "labels": alert_labels,
                "startsAt": "2026-05-09T10:00:00Z",
            }
        ],
    }


# Canned alert payloads for the demo scenarios, keyed by scenario id.
SCENARIOS = {
    "hist-cloudflare-2019": _firing_alert(
        "HighCPUSaturation",
        "critical",
        "CPU saturation on frontend — Cloudflare 2019 replay",
        pod="frontend-xxx",
    ),
    "hist-github-2018": _firing_alert(
        "DatabaseFailoverLoop",
        "critical",
        "Cloud SQL primary killed — replica promotion loop",
    ),
    "sf-001": _firing_alert(
        "PodCrashLooping",
        "warning",
        "cartservice pod killed by OOMKill",
        pod="cartservice-xxx",
    ),
}
def score_incident(incident: dict, elapsed_s: float, scenario_id: str) -> dict:
    """Compute reward metrics for a completed incident chain.

    Each stage of ``incident`` ("triage", "diagnosis", "remediation",
    "comms") is read as a mapping with an optional "final" dict and an
    optional "trajectory" sequence. Missing, ``None`` or non-dict stages are
    treated as empty, so a partially completed chain is scored, not rejected.

    Args:
        incident: Result of the incident chain, keyed by stage name.
        elapsed_s: Duration of the chain in seconds.
        scenario_id: Identifier echoed back in the result.

    Returns:
        A dict with the scenario id, the rounded elapsed time, the reported
        severity and outcome, the ``root_cause_found`` and
        ``postmortem_saved`` flags, ``turns`` (summed trajectory lengths of
        triage, diagnosis and remediation), the per-component ``scores``
        (each within 0-1) and the weighted ``total``.
    """

    def stage(name: str) -> dict:
        value = incident.get(name)
        return value if isinstance(value, dict) else {}

    def final(name: str) -> dict:
        value = stage(name).get("final")
        return value if isinstance(value, dict) else {}

    def turns(name: str) -> int:
        return len(stage(name).get("trajectory") or [])

    triage = final("triage")
    diagnosis = final("diagnosis")
    remediation = final("remediation")
    comms = final("comms")

    severity = triage.get("severity", "UNKNOWN")
    root_cause = diagnosis.get("root_cause") or diagnosis.get("specific")
    outcome = remediation.get("outcome", "unresolved")
    postmortem_path = comms.get("postmortem_path")

    # Checked once and reused for both the comms score and the report flag.
    # Values that are not path-like count as "not saved" rather than raising.
    postmortem_saved = (
        bool(postmortem_path)
        and isinstance(postmortem_path, (str, os.PathLike))
        and Path(postmortem_path).exists()
    )

    # Score components (0-1 each)
    triage_score = 1.0 if severity in ("P0", "P1", "P2", "P3") else 0.0

    # A root cause reported as a dict scores lower than any other truthy value.
    if not root_cause:
        diagnosis_score = 0.0
    elif isinstance(root_cause, dict):
        diagnosis_score = 0.6
    else:
        diagnosis_score = 0.8

    outcome_scores = {"resolved": 1.0, "partial": 0.7, "escalated": 0.5, "unresolved": 0.2}
    remediation_score = outcome_scores.get(outcome, 0.0) if isinstance(outcome, str) else 0.0

    comms_score = 1.0 if postmortem_saved else 0.5

    # Full marks up to 30 s, then a linear decay; 300 s or more scores zero.
    # The upper clamp keeps runs faster than 30 s from scoring above 1.0.
    if elapsed_s >= 300:
        speed_score = 0.0
    else:
        speed_score = min(1.0, max(0.0, 1.0 - (elapsed_s - 30) / 300))

    total = (
        triage_score * 0.15
        + diagnosis_score * 0.30
        + remediation_score * 0.35
        + comms_score * 0.10
        + speed_score * 0.10
    )

    return {
        "scenario_id": scenario_id,
        "elapsed_s": round(elapsed_s, 1),
        "severity": severity,
        "outcome": outcome,
        "root_cause_found": bool(root_cause),
        "postmortem_saved": postmortem_saved,
        "turns": turns("triage") + turns("diagnosis") + turns("remediation"),
        "scores": {
            "triage": round(triage_score, 2),
            "diagnosis": round(diagnosis_score, 2),
            "remediation": round(remediation_score, 2),
            "comms": round(comms_score, 2),
            "speed": round(speed_score, 2),
        },
        "total": round(total, 3),
    }
async def run_scenario(scenario_id: str) -> dict:
    """Run one predefined scenario through the incident coordinator and score it.

    Args:
        scenario_id: Key into ``SCENARIOS``.

    Returns:
        The metrics dict produced by ``score_incident`` for this run.

    Raises:
        KeyError: If ``scenario_id`` is not a key of ``SCENARIOS``.
    """
    import copy

    # Imported at call time rather than at module import time; by then the
    # module-level .env values are already in os.environ.
    from agents.coordinator import handle_incident
    # NOTE(review): get_history is not used in this function. The import is
    # kept in case loading agents.stream has side effects - confirm and drop.
    from agents.stream import get_history  # noqa: F401

    # Work on a private copy so neither the added "scenario_id" key nor any
    # change made by the coordinator leaks into the shared SCENARIOS table.
    alert = copy.deepcopy(SCENARIOS[scenario_id])
    alert["scenario_id"] = scenario_id
    print(f"\n[-->] {scenario_id}")

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system clock is adjusted mid-run.
    started = time.perf_counter()
    incident = await handle_incident(alert)
    elapsed = time.perf_counter() - started
    return score_incident(incident, elapsed, scenario_id)
async def main():
    """Run every scenario in turn, save the JSON summary, and print a recap.

    Scenarios run sequentially in ``SCENARIOS`` order, with a one-line result
    printed after each. The aggregate summary is written to
    ``bench/results/quick_eval.json`` (relative to the current working
    directory; parent directories are created as needed).
    """
    scored = []
    for scenario_id in SCENARIOS:
        entry = await run_scenario(scenario_id)
        scored.append(entry)
        print(f" {entry['outcome']:12s} {entry['elapsed_s']:6.1f}s score={entry['total']:.3f}")

    count = len(scored)
    avg_score = sum(entry["total"] for entry in scored) / count
    avg_turns = sum(entry["turns"] for entry in scored) / count
    # "partial" outcomes count towards the resolution rate as well.
    resolved = len([entry for entry in scored if entry["outcome"] in {"resolved", "partial"}])
    postmortems = len([entry for entry in scored if entry["postmortem_saved"]])

    summary = {
        "model": os.getenv("AGENT_MODEL", "Qwen/Qwen2.5-7B-Instruct"),
        "backend": os.getenv("BACKEND", "vllm"),
        "scenarios": count,
        "resolution_rate": resolved / count,
        "avg_reward": avg_score,
        "avg_turns": avg_turns,
        "postmortem_rate": postmortems / count,
        "results": scored,
    }

    out = Path("bench/results/quick_eval.json")
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(summary, indent=2), encoding="utf-8")

    rule = "=" * 60
    report = [
        f"\n{rule}",
        f" QUICK EVAL SUMMARY ({count} scenarios)",
        f"{rule}",
        f" Resolution rate: {resolved}/{count} ({100 * resolved // count}%)",
        f" Avg reward: {avg_score:.3f}",
        f" Avg turns: {avg_turns:.1f}",
        f" Postmortem rate: {postmortems}/{count}",
        f" Results saved: {out}",
    ]
    for text in report:
        print(text)
if __name__ == "__main__":
    # Run the async entry point only when executed as a script, not on import.
    asyncio.run(main())