"""Compare an agent-generated report against a ground-truth root cause file.
Usage examples:
# Compare a saved report JSON against ground truth
uv run python scripts/eval_report.py \\
--report report.json \\
--ground-truth docs/ground_truth_pr_spike.md
# Run the agent in replay mode, then compare
uv run python scripts/eval_report.py \\
--scenario pr_spike \\
--question "Why did PR open events spike on Jan 15?" \\
--ground-truth docs/ground_truth_pr_spike.md
# Save the agent report to a file for later comparison
uv run python scripts/eval_report.py \\
--scenario pr_spike \\
--question "Why did PR open events spike?" \\
--ground-truth docs/ground_truth.md \\
--save-report reports/pr_spike_report.json
The script uses MODEL_BACKEND (and associated env vars) from .env for the
judge LLM call. Set MODEL_BACKEND=minimax for dev usage.
Exit codes: 0 = pass (score >= threshold), 1 = fail, 2 = error.
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
# Runs at import time, before the agent.* import below. The module docstring
# says the judge's MODEL_BACKEND settings come from .env.
load_dotenv()
# Project root on sys.path so agent.* imports work when run from repo root.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from agent.client import get_llm # noqa: E402
# Module logger; basicConfig sets INFO level and a "LEVEL message" line format
# as an import-time side effect.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
# Default value for the --threshold option; print_result() passes a run when
# score >= threshold.
PASS_THRESHOLD = 6 # score out of 10; adjustable via --threshold
# Prompt template sent to the judge LLM. call_judge() fills {ground_truth} and
# {report_text} via str.format(); the doubled braces ({{ and }}) are format
# escapes that render as the literal braces of the JSON schema example.
JUDGE_PROMPT = """\
You are evaluating an AI agent's investigation report against a known ground truth.
## Ground truth (the real root cause)
{ground_truth}
## Agent report
{report_text}
## Evaluation task
Score how well the agent report matches the ground truth root cause.
Return a JSON object with exactly these fields:
{{
"root_cause_match": "yes" | "partial" | "no",
"score": <integer 0-10>,
"reasoning": "<2-3 sentences explaining the score>",
"missing_elements": ["<thing the report missed>", ...],
"false_positives": ["<incorrect claim the report made>", ...]
}}
Scoring guide:
9-10 Correct root cause, correct dimensions/segments, correct magnitude
7-8 Correct root cause, minor gaps in supporting evidence
5-6 Partially correct — right area but wrong segment or magnitude
3-4 Weak — mentioned the right dimension but wrong conclusion
0-2 Wrong root cause entirely
Return ONLY valid JSON, no markdown fences, no extra text.
"""
def load_ground_truth(path: str) -> str:
    """Read the ground-truth root cause file and return its text.

    Args:
        path: Filesystem path of the ground-truth file.

    Returns:
        The file content with leading and trailing whitespace removed.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing regular
            file (this includes the case where it is a directory).
    """
    p = Path(path)
    # is_file() rather than exists(): a directory would pass exists() and then
    # fail inside read_text() with an error the caller does not expect.
    if not p.is_file():
        raise FileNotFoundError(f"Ground truth file not found: {path}")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    return p.read_text(encoding="utf-8").strip()
def load_report_from_file(path: str) -> dict:
    """Load a saved agent report from a JSON file.

    Args:
        path: Filesystem path of the report JSON.

    Returns:
        The parsed report, a JSON object that contains a ``"text"`` key.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass), is not a JSON object, or has no
            ``"text"`` field.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Report JSON not found: {path}")
    data = json.loads(p.read_text(encoding="utf-8"))
    # The membership test alone is not enough: ``"text" in ["text"]`` and
    # ``"text" in "some text"`` are both true, so a JSON list or string could
    # slip through and break callers that treat the report as a dict.
    if not isinstance(data, dict) or "text" not in data:
        raise ValueError(f"Report JSON missing 'text' field: {path}")
    return data
def run_agent_and_get_report(scenario_id: str, question: str) -> dict:
    """Execute the agent graph for a replay scenario and return its report.

    Args:
        scenario_id: Value exported as ``REPLAY_SCENARIO_ID``.
        question: User question placed on the initial investigation state.

    Returns:
        The ``final_report`` attribute of the state returned by the graph.

    Raises:
        RuntimeError: If the graph finishes with ``final_report`` set to None.
    """
    # Replay settings are exported before the agent modules are imported below.
    # NOTE(review): these variables are not restored afterwards, so later code
    # in the same process (e.g. the judge call) may also see
    # MODEL_BACKEND=replay -- confirm that this is intended.
    os.environ.update(MODEL_BACKEND="replay", REPLAY_SCENARIO_ID=scenario_id)

    from agent.graph import build_graph
    from agent.state import InvestigationState

    logger.info("Running agent (replay mode, scenario=%s) ...", scenario_id)
    compiled_graph = build_graph()
    initial_state = InvestigationState(user_question=question)
    final_state = compiled_graph.invoke(initial_state)

    report = final_state.final_report
    if report is None:
        raise RuntimeError("Agent finished without producing a final_report.")
    return report
def _message_text(content: object) -> str:
    """Flatten an LLM message ``content`` value into plain text."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Some chat models return a list of content blocks (plain strings or
        # dicts carrying a "text" entry) instead of a single string.
        parts = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                parts.append(block["text"])
        return "".join(parts)
    return str(content)


def _parse_judge_json(raw: str) -> object:
    """Parse the judge reply, tolerating markdown fences and stray prose."""
    text = raw.strip()
    # Candidates are tried from strictest to most lenient: the reply as-is,
    # the body of the first fenced block, then the outermost {...} span.
    candidates = [text]
    if "```" in text:
        fenced = text.split("```")[1]
        if fenced.startswith("json"):
            fenced = fenced[4:]
        candidates.append(fenced)
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        candidates.append(text[start : end + 1])

    first_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as err:
            if first_error is None:
                first_error = err
    # Report the failure of the unmodified reply; it is the most informative.
    raise first_error


def call_judge(ground_truth: str, report: dict) -> dict:
    """Call the LLM judge and return the parsed eval dict.

    Args:
        ground_truth: Text of the known root cause.
        report: Agent report; its ``"text"`` entry (empty string when absent)
            is what gets judged.

    Returns:
        The JSON object produced by the judge.

    Raises:
        json.JSONDecodeError: If no JSON can be extracted from the reply.
        ValueError: If the reply is valid JSON but not a JSON object.
    """
    prompt = JUDGE_PROMPT.format(
        ground_truth=ground_truth,
        report_text=report.get("text", ""),
    )
    llm = get_llm()
    response = llm.invoke(
        [
            HumanMessage(content="You are a precise evaluator. Return only JSON."),
            HumanMessage(content=prompt),
        ]
    )
    parsed = _parse_judge_json(_message_text(response.content))
    # Callers use dict methods on the result, so reject lists, strings, etc.
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Judge returned JSON of type {type(parsed).__name__}, expected an object"
        )
    return parsed
def _coerce_score(value: object) -> float:
    """Convert the judge's ``score`` field to a number; unusable values become 0."""
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        # Judges sometimes quote the number ("8"); try int first so that the
        # printed score keeps its integer form.
        for convert in (int, float):
            try:
                return convert(value.strip())
            except ValueError:
                continue
    return 0


def _as_items(value: object) -> list:
    """Normalize a list-like judge field to a list (empty for falsy values)."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare string (or any other scalar) is a single item, not a sequence of
    # characters to print one per line.
    return [value]


def print_result(eval_result: dict, threshold: int) -> int:
    """Pretty-print the eval result. Returns exit code (0=pass, 1=fail).

    Args:
        eval_result: Judge output. Missing fields fall back to defaults, and a
            missing, null or non-numeric ``score`` is treated as 0.
        threshold: Minimum score that counts as a pass.

    Returns:
        0 when the score is at least ``threshold``, otherwise 1.
    """
    score = _coerce_score(eval_result.get("score", 0))
    match = eval_result.get("root_cause_match", "unknown")
    reasoning = eval_result.get("reasoning", "")
    missing = _as_items(eval_result.get("missing_elements"))
    false_pos = _as_items(eval_result.get("false_positives"))
    passed = score >= threshold
    status = "PASS" if passed else "FAIL"
    print(f"\n{'=' * 60}")
    print(f" Eval result: {status}")
    print(f" Score: {score}/10 (threshold: {threshold})")
    print(f" Root cause: {match}")
    print(f"{'=' * 60}")
    print(f"\nReasoning:\n {reasoning}")
    if missing:
        print("\nMissing elements:")
        for item in missing:
            print(f" - {item}")
    if false_pos:
        print("\nFalse positives:")
        for item in false_pos:
            print(f" - {item}")
    print()
    return 0 if passed else 1
def main() -> int:
    """Parse CLI arguments, obtain a report, have it judged, and print the verdict.

    Returns:
        Process exit code: 0 when the score meets the threshold, 1 when it
        does not, and 2 when a step fails (unreadable ground truth, report
        that cannot be obtained or saved, judge error, malformed judge result).
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--report", metavar="PATH", help="Path to a saved report JSON file.")
    source.add_argument(
        "--scenario", metavar="ID", help="Replay scenario ID (runs agent live in replay mode)."
    )
    parser.add_argument(
        "--question", metavar="TEXT", help="User question (required with --scenario)."
    )
    parser.add_argument(
        "--ground-truth",
        required=True,
        metavar="PATH",
        help="Markdown file with the known root cause.",
    )
    parser.add_argument(
        "--save-report", metavar="PATH", help="Save the agent report JSON here (optional)."
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=PASS_THRESHOLD,
        metavar="N",
        help=f"Pass score (0-10, default {PASS_THRESHOLD}).",
    )
    args = parser.parse_args()

    # Check the cross-argument constraint up front, outside any try block:
    # parser.error() prints usage and exits the process with status 2.
    if not args.report and not args.question:
        parser.error("--question is required when using --scenario")

    # Load ground truth. OSError covers missing files, directories and
    # permission problems; UnicodeDecodeError covers undecodable content.
    try:
        ground_truth = load_ground_truth(args.ground_truth)
    except (OSError, UnicodeDecodeError) as e:
        logger.error("%s", e)
        return 2

    # Get the report, either from disk or by running the agent.
    try:
        if args.report:
            report = load_report_from_file(args.report)
        else:
            report = run_agent_and_get_report(args.scenario, args.question)
    except Exception as e:
        logger.error("Failed to obtain report: %s", e)
        return 2

    # Optionally save the report. A failure here is an error (exit 2), not an
    # uncaught traceback that would exit with the "fail" status 1.
    if args.save_report:
        out = Path(args.save_report)
        try:
            out.parent.mkdir(parents=True, exist_ok=True)
            out.write_text(json.dumps(report, indent=2), encoding="utf-8")
        except (OSError, TypeError, ValueError) as e:
            logger.error("Failed to save report to %s: %s", out, e)
            return 2
        logger.info("Report saved to %s", out)

    # Judge.
    try:
        eval_result = call_judge(ground_truth, report)
    except json.JSONDecodeError as e:
        logger.error("Judge returned invalid JSON: %s", e)
        return 2
    except Exception as e:
        logger.error("Judge call failed: %s", e)
        return 2

    # The judge must return a JSON object; anything else cannot be scored.
    if not isinstance(eval_result, dict):
        logger.error("Judge returned a non-object JSON value: %r", eval_result)
        return 2

    try:
        return print_result(eval_result, args.threshold)
    except TypeError as e:
        # E.g. a score that cannot be compared against the threshold.
        logger.error("Judge result has an unexpected shape: %s", e)
        return 2
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    raise SystemExit(main())
|