| """Compare an agent-generated report against a ground-truth root cause file. |
| |
| Usage examples: |
| |
| # Compare a saved report JSON against ground truth |
| uv run python scripts/eval_report.py \\ |
| --report report.json \\ |
| --ground-truth docs/ground_truth_pr_spike.md |
| |
| # Run the agent in replay mode, then compare |
| uv run python scripts/eval_report.py \\ |
| --scenario pr_spike \\ |
| --question "Why did PR open events spike on Jan 15?" \\ |
| --ground-truth docs/ground_truth_pr_spike.md |
| |
| # Save the agent report to a file for later comparison |
| uv run python scripts/eval_report.py \\ |
| --scenario pr_spike \\ |
| --question "Why did PR open events spike?" \\ |
| --ground-truth docs/ground_truth.md \\ |
| --save-report reports/pr_spike_report.json |
| |
| The script uses MODEL_BACKEND (and associated env vars) from .env for the |
| judge LLM call. Set MODEL_BACKEND=minimax for dev usage. |
| |
| Exit codes: 0 = pass (score >= threshold), 1 = fail, 2 = error. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import logging |
| import os |
| import sys |
| from pathlib import Path |
|
|
| from dotenv import load_dotenv |
| from langchain_core.messages import HumanMessage |
|
|
| load_dotenv() |
|
|
| |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
|
|
| from agent.client import get_llm |
|
|
| logger = logging.getLogger(__name__) |
| logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") |
|
|
| PASS_THRESHOLD = 6 |
|
|
| JUDGE_PROMPT = """\ |
| You are evaluating an AI agent's investigation report against a known ground truth. |
| |
| ## Ground truth (the real root cause) |
| {ground_truth} |
| |
| ## Agent report |
| {report_text} |
| |
| ## Evaluation task |
| Score how well the agent report matches the ground truth root cause. |
| Return a JSON object with exactly these fields: |
| |
| {{ |
| "root_cause_match": "yes" | "partial" | "no", |
| "score": <integer 0-10>, |
| "reasoning": "<2-3 sentences explaining the score>", |
| "missing_elements": ["<thing the report missed>", ...], |
| "false_positives": ["<incorrect claim the report made>", ...] |
| }} |
| |
| Scoring guide: |
| 9-10 Correct root cause, correct dimensions/segments, correct magnitude |
| 7-8 Correct root cause, minor gaps in supporting evidence |
| 5-6 Partially correct — right area but wrong segment or magnitude |
| 3-4 Weak — mentioned the right dimension but wrong conclusion |
| 0-2 Wrong root cause entirely |
| |
| Return ONLY valid JSON, no markdown fences, no extra text. |
| """ |
|
|
|
|
| def load_ground_truth(path: str) -> str: |
| p = Path(path) |
| if not p.exists(): |
| raise FileNotFoundError(f"Ground truth file not found: {path}") |
| return p.read_text().strip() |
|
|
|
|
| def load_report_from_file(path: str) -> dict: |
| p = Path(path) |
| if not p.exists(): |
| raise FileNotFoundError(f"Report JSON not found: {path}") |
| data = json.loads(p.read_text()) |
| if "text" not in data: |
| raise ValueError(f"Report JSON missing 'text' field: {path}") |
| return data |
|
|
|
|
| def run_agent_and_get_report(scenario_id: str, question: str) -> dict: |
| """Run the agent in replay mode and return final_report.""" |
| os.environ["MODEL_BACKEND"] = "replay" |
| os.environ["REPLAY_SCENARIO_ID"] = scenario_id |
|
|
| from agent.graph import build_graph |
| from agent.state import InvestigationState |
|
|
| logger.info("Running agent (replay mode, scenario=%s) ...", scenario_id) |
| graph = build_graph() |
| state = InvestigationState(user_question=question) |
| result: InvestigationState = graph.invoke(state) |
| if result.final_report is None: |
| raise RuntimeError("Agent finished without producing a final_report.") |
| return result.final_report |
|
|
|
|
| def call_judge(ground_truth: str, report: dict) -> dict: |
| """Call the LLM judge and return the parsed eval dict.""" |
| report_text = report.get("text", "") |
| prompt = JUDGE_PROMPT.format( |
| ground_truth=ground_truth, |
| report_text=report_text, |
| ) |
|
|
| llm = get_llm() |
| response = llm.invoke( |
| [ |
| HumanMessage(content="You are a precise evaluator. Return only JSON."), |
| HumanMessage(content=prompt), |
| ] |
| ) |
|
|
| raw = response.content.strip() |
| |
| if raw.startswith("```"): |
| raw = raw.split("```")[1] |
| if raw.startswith("json"): |
| raw = raw[4:] |
|
|
| return json.loads(raw) |
|
|
|
|
| def print_result(eval_result: dict, threshold: int) -> int: |
| """Pretty-print the eval result. Returns exit code (0=pass, 1=fail).""" |
| score = eval_result.get("score", 0) |
| match = eval_result.get("root_cause_match", "unknown") |
| reasoning = eval_result.get("reasoning", "") |
| missing = eval_result.get("missing_elements", []) |
| false_pos = eval_result.get("false_positives", []) |
|
|
| passed = score >= threshold |
| status = "PASS" if passed else "FAIL" |
|
|
| print(f"\n{'=' * 60}") |
| print(f" Eval result: {status}") |
| print(f" Score: {score}/10 (threshold: {threshold})") |
| print(f" Root cause: {match}") |
| print(f"{'=' * 60}") |
| print(f"\nReasoning:\n {reasoning}") |
|
|
| if missing: |
| print("\nMissing elements:") |
| for item in missing: |
| print(f" - {item}") |
|
|
| if false_pos: |
| print("\nFalse positives:") |
| for item in false_pos: |
| print(f" - {item}") |
|
|
| print() |
| return 0 if passed else 1 |
|
|
|
|
| def main() -> int: |
| parser = argparse.ArgumentParser( |
| description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter |
| ) |
|
|
| source = parser.add_mutually_exclusive_group(required=True) |
| source.add_argument("--report", metavar="PATH", help="Path to a saved report JSON file.") |
| source.add_argument( |
| "--scenario", metavar="ID", help="Replay scenario ID (runs agent live in replay mode)." |
| ) |
|
|
| parser.add_argument( |
| "--question", metavar="TEXT", help="User question (required with --scenario)." |
| ) |
| parser.add_argument( |
| "--ground-truth", |
| required=True, |
| metavar="PATH", |
| help="Markdown file with the known root cause.", |
| ) |
| parser.add_argument( |
| "--save-report", metavar="PATH", help="Save the agent report JSON here (optional)." |
| ) |
| parser.add_argument( |
| "--threshold", |
| type=int, |
| default=PASS_THRESHOLD, |
| metavar="N", |
| help=f"Pass score (0-10, default {PASS_THRESHOLD}).", |
| ) |
|
|
| args = parser.parse_args() |
|
|
| |
| try: |
| ground_truth = load_ground_truth(args.ground_truth) |
| except FileNotFoundError as e: |
| logger.error("%s", e) |
| return 2 |
|
|
| |
| try: |
| if args.report: |
| report = load_report_from_file(args.report) |
| else: |
| if not args.question: |
| parser.error("--question is required when using --scenario") |
| report = run_agent_and_get_report(args.scenario, args.question) |
| except Exception as e: |
| logger.error("Failed to obtain report: %s", e) |
| return 2 |
|
|
| |
| if args.save_report: |
| out = Path(args.save_report) |
| out.parent.mkdir(parents=True, exist_ok=True) |
| out.write_text(json.dumps(report, indent=2)) |
| logger.info("Report saved to %s", out) |
|
|
| |
| try: |
| eval_result = call_judge(ground_truth, report) |
| except json.JSONDecodeError as e: |
| logger.error("Judge returned invalid JSON: %s", e) |
| return 2 |
| except Exception as e: |
| logger.error("Judge call failed: %s", e) |
| return 2 |
|
|
| return print_result(eval_result, args.threshold) |
|
|
|
|
| if __name__ == "__main__": |
| sys.exit(main()) |
|
|