# Source: Hugging Face Space "lablab-ai-amd-developer-hackathon/atlasops" (status: Running)
# File size: 6,097 bytes; revision 7e9a520

"""Quick evaluation — runs the 3 demo scenarios and scores them.

Produces bench/results/quick_eval.json and prints a summary table.
Does NOT apply real Chaos Mesh — uses the pre-defined alert payloads
from inference.py (same as the HF demo).

Usage:
    python bench/quick_eval.py
"""

import asyncio
import json
import os
import sys
import time
from pathlib import Path

# Seed os.environ from a ".env" file located one directory above the folder
# containing this script, if such a file exists.
_env = Path(__file__).parent.parent / ".env"
if _env.exists():
    for _entry in map(str.strip, _env.read_text().splitlines()):
        # Ignore blank lines, "#" comments, and anything that is not KEY=VALUE.
        if not _entry or _entry.startswith("#") or "=" not in _entry:
            continue
        _key, _value = _entry.split("=", 1)
        # setdefault: variables already set in the real environment take precedence.
        os.environ.setdefault(_key.strip(), _value.strip())

# Emit UTF-8 on stdout (replacing unencodable characters) when the stream
# object supports reconfiguration.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(errors="replace", encoding="utf-8")

# Pre-defined alert payloads, keyed by scenario id. Each payload carries
# "commonLabels", "commonAnnotations" and a single firing entry in "alerts".
SCENARIOS = {
    # CPU saturation on the frontend pod.
    "hist-cloudflare-2019": {
        "commonLabels": {
            "alertname": "HighCPUSaturation",
            "severity": "critical",
            "namespace": "default",
        },
        "commonAnnotations": {
            "summary": "CPU saturation on frontend — Cloudflare 2019 replay",
        },
        "alerts": [
            {
                "status": "firing",
                "labels": {
                    "alertname": "HighCPUSaturation",
                    "pod": "frontend-xxx",
                    "severity": "critical",
                },
                "startsAt": "2026-05-09T10:00:00Z",
            },
        ],
    },
    # Database failover loop (no pod label on the alert).
    "hist-github-2018": {
        "commonLabels": {
            "alertname": "DatabaseFailoverLoop",
            "severity": "critical",
            "namespace": "default",
        },
        "commonAnnotations": {
            "summary": "Cloud SQL primary killed — replica promotion loop",
        },
        "alerts": [
            {
                "status": "firing",
                "labels": {
                    "alertname": "DatabaseFailoverLoop",
                    "severity": "critical",
                },
                "startsAt": "2026-05-09T10:00:00Z",
            },
        ],
    },
    # Crash-looping cartservice pod, warning severity.
    "sf-001": {
        "commonLabels": {
            "alertname": "PodCrashLooping",
            "severity": "warning",
            "namespace": "default",
        },
        "commonAnnotations": {
            "summary": "cartservice pod killed by OOMKill",
        },
        "alerts": [
            {
                "status": "firing",
                "labels": {
                    "alertname": "PodCrashLooping",
                    "pod": "cartservice-xxx",
                    "severity": "warning",
                },
                "startsAt": "2026-05-09T10:00:00Z",
            },
        ],
    },
}


def score_incident(incident: dict, elapsed_s: float, scenario_id: str) -> dict:
    """Compute reward metrics for a completed incident chain.

    Args:
        incident: Mapping that may contain "triage", "diagnosis",
            "remediation" and "comms" entries. Each entry may hold a "final"
            dict and (for the first three) a "trajectory" list. Entries that
            are missing or ``None`` are treated as empty.
        elapsed_s: Seconds the incident chain took to run.
        scenario_id: Identifier echoed back in the result.

    Returns:
        A dict with the scenario id, rounded elapsed time, severity, outcome,
        ``root_cause_found`` / ``postmortem_saved`` flags, the total number of
        trajectory turns, the per-component ``scores`` (each in 0–1, rounded
        to 2 decimals) and the weighted ``total`` (rounded to 3 decimals).
    """
    stage_names = ("triage", "diagnosis", "remediation", "comms")
    # `or {}` also covers keys that are present but explicitly None.
    stages = {name: incident.get(name) or {} for name in stage_names}
    finals = {name: stage.get("final") or {} for name, stage in stages.items()}

    severity = finals["triage"].get("severity", "UNKNOWN")
    root_cause = finals["diagnosis"].get("root_cause") or finals["diagnosis"].get("specific")
    outcome = finals["remediation"].get("outcome", "unresolved")
    postmortem_path = finals["comms"].get("postmortem_path")

    # Touch the filesystem once and reuse the answer for score and flag.
    postmortem_saved = bool(postmortem_path and Path(postmortem_path).exists())

    # Score components (0–1 each)
    triage_score = 1.0 if severity in {"P0", "P1", "P2", "P3"} else 0.0

    if not root_cause:
        diagnosis_score = 0.0
    elif isinstance(root_cause, dict):
        diagnosis_score = 0.6
    else:
        diagnosis_score = 0.8

    remediation_score = {
        "resolved": 1.0,
        "partial": 0.7,
        "escalated": 0.5,
        "unresolved": 0.2,
    }.get(outcome, 0.0)

    comms_score = 1.0 if postmortem_saved else 0.5

    if elapsed_s < 300:
        # Clamp to [0, 1]: without the upper bound, runs faster than 30 s
        # scored above 1.0 and inflated the weighted total.
        speed_score = min(1.0, max(0.0, 1.0 - (elapsed_s - 30) / 300))
    else:
        speed_score = 0.0

    total = (
        triage_score * 0.15 +
        diagnosis_score * 0.30 +
        remediation_score * 0.35 +
        comms_score * 0.10 +
        speed_score * 0.10
    )

    # Comms turns are intentionally not counted, matching the original metric.
    turns = sum(
        len(stages[name].get("trajectory") or [])
        for name in ("triage", "diagnosis", "remediation")
    )

    return {
        "scenario_id": scenario_id,
        "elapsed_s": round(elapsed_s, 1),
        "severity": severity,
        "outcome": outcome,
        "root_cause_found": bool(root_cause),
        "postmortem_saved": postmortem_saved,
        "turns": turns,
        "scores": {
            "triage": round(triage_score, 2),
            "diagnosis": round(diagnosis_score, 2),
            "remediation": round(remediation_score, 2),
            "comms": round(comms_score, 2),
            "speed": round(speed_score, 2),
        },
        "total": round(total, 3),
    }


async def run_scenario(scenario_id: str) -> dict:
    """Run one pre-defined scenario through the coordinator and score it.

    Args:
        scenario_id: Key into the module-level SCENARIOS mapping.

    Returns:
        The metrics dict produced by score_incident for this run.

    Raises:
        KeyError: If scenario_id is not a key of SCENARIOS.
    """
    import copy

    from agents.coordinator import handle_incident
    # NOTE(review): get_history is unused in this function; the import is kept
    # in case importing agents.stream has side effects — confirm and drop.
    from agents.stream import get_history  # noqa: F401

    # Work on a private deep copy so neither the added "scenario_id" key nor
    # anything handle_incident does to the payload leaks into SCENARIOS.
    alert = copy.deepcopy(SCENARIOS[scenario_id])
    alert["scenario_id"] = scenario_id

    print(f"\n[-->] {scenario_id}")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    incident = await handle_incident(alert)
    elapsed = time.perf_counter() - started
    return score_incident(incident, elapsed, scenario_id)


async def main():
    """Run every scenario in SCENARIOS sequentially and report the results.

    Prints one line per scenario as it finishes, writes an aggregate summary
    to ``results/quick_eval.json`` next to this script, and prints a summary
    table. The "model" and "backend" fields are read from the AGENT_MODEL and
    BACKEND environment variables, with defaults.
    """
    results = []
    for sid in SCENARIOS:
        r = await run_scenario(sid)
        results.append(r)
        print(f"      {r['outcome']:12s} {r['elapsed_s']:6.1f}s  score={r['total']:.3f}")

    count = len(results)
    # Avoid ZeroDivisionError if the scenario set is ever empty; all rates and
    # averages are then reported as 0.
    denom = count or 1

    avg_score = sum(r["total"] for r in results) / denom
    avg_turns = sum(r["turns"] for r in results) / denom
    resolved = sum(1 for r in results if r["outcome"] in {"resolved", "partial"})
    postmortems = sum(1 for r in results if r["postmortem_saved"])

    summary = {
        "model": os.getenv("AGENT_MODEL", "Qwen/Qwen2.5-7B-Instruct"),
        "backend": os.getenv("BACKEND", "vllm"),
        "scenarios": count,
        "resolution_rate": resolved / denom,
        "avg_reward": avg_score,
        "avg_turns": avg_turns,
        "postmortem_rate": postmortems / denom,
        "results": results,
    }

    # Anchor the output to this script's directory (as the .env lookup does)
    # so the file lands in the same place whatever the working directory is.
    out = Path(__file__).parent / "results" / "quick_eval.json"
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(summary, indent=2), encoding="utf-8")

    print(f"\n{'='*60}")
    print(f"  QUICK EVAL SUMMARY ({count} scenarios)")
    print(f"{'='*60}")
    print(f"  Resolution rate:    {resolved}/{count} ({100*resolved//denom}%)")
    print(f"  Avg reward:         {avg_score:.3f}")
    print(f"  Avg turns:          {avg_turns:.1f}")
    print(f"  Postmortem rate:    {postmortems}/{count}")
    print(f"  Results saved:      {out}")


# Script entry point: only when executed directly (not on import), drive the
# async main() coroutine to completion with asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())