why-agent / scripts /eval_report.py
MapoTofu9's picture
deploy: HF Spaces
5d30bdc
Raw
History Blame Contribute Delete
7.72 kB
"""Compare an agent-generated report against a ground-truth root cause file.
Usage examples:
# Compare a saved report JSON against ground truth
uv run python scripts/eval_report.py \\
--report report.json \\
--ground-truth docs/ground_truth_pr_spike.md
# Run the agent in replay mode, then compare
uv run python scripts/eval_report.py \\
--scenario pr_spike \\
--question "Why did PR open events spike on Jan 15?" \\
--ground-truth docs/ground_truth_pr_spike.md
# Save the agent report to a file for later comparison
uv run python scripts/eval_report.py \\
--scenario pr_spike \\
--question "Why did PR open events spike?" \\
--ground-truth docs/ground_truth.md \\
--save-report reports/pr_spike_report.json
The script uses MODEL_BACKEND (and associated env vars) from .env for the
judge LLM call. Set MODEL_BACKEND=minimax for dev usage.
Exit codes: 0 = pass (score >= threshold), 1 = fail, 2 = error.
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
load_dotenv()
# Project root on sys.path so agent.* imports work when run from repo root.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from agent.client import get_llm # noqa: E402
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
PASS_THRESHOLD = 6 # score out of 10; adjustable via --threshold
JUDGE_PROMPT = """\
You are evaluating an AI agent's investigation report against a known ground truth.
## Ground truth (the real root cause)
{ground_truth}
## Agent report
{report_text}
## Evaluation task
Score how well the agent report matches the ground truth root cause.
Return a JSON object with exactly these fields:
{{
"root_cause_match": "yes" | "partial" | "no",
"score": <integer 0-10>,
"reasoning": "<2-3 sentences explaining the score>",
"missing_elements": ["<thing the report missed>", ...],
"false_positives": ["<incorrect claim the report made>", ...]
}}
Scoring guide:
9-10 Correct root cause, correct dimensions/segments, correct magnitude
7-8 Correct root cause, minor gaps in supporting evidence
5-6 Partially correct — right area but wrong segment or magnitude
3-4 Weak — mentioned the right dimension but wrong conclusion
0-2 Wrong root cause entirely
Return ONLY valid JSON, no markdown fences, no extra text.
"""
def load_ground_truth(path: str) -> str:
p = Path(path)
if not p.exists():
raise FileNotFoundError(f"Ground truth file not found: {path}")
return p.read_text().strip()
def load_report_from_file(path: str) -> dict:
p = Path(path)
if not p.exists():
raise FileNotFoundError(f"Report JSON not found: {path}")
data = json.loads(p.read_text())
if "text" not in data:
raise ValueError(f"Report JSON missing 'text' field: {path}")
return data
def run_agent_and_get_report(scenario_id: str, question: str) -> dict:
"""Run the agent in replay mode and return final_report."""
os.environ["MODEL_BACKEND"] = "replay"
os.environ["REPLAY_SCENARIO_ID"] = scenario_id
from agent.graph import build_graph
from agent.state import InvestigationState
logger.info("Running agent (replay mode, scenario=%s) ...", scenario_id)
graph = build_graph()
state = InvestigationState(user_question=question)
result: InvestigationState = graph.invoke(state)
if result.final_report is None:
raise RuntimeError("Agent finished without producing a final_report.")
return result.final_report
def call_judge(ground_truth: str, report: dict) -> dict:
"""Call the LLM judge and return the parsed eval dict."""
report_text = report.get("text", "")
prompt = JUDGE_PROMPT.format(
ground_truth=ground_truth,
report_text=report_text,
)
llm = get_llm()
response = llm.invoke(
[
HumanMessage(content="You are a precise evaluator. Return only JSON."),
HumanMessage(content=prompt),
]
)
raw = response.content.strip()
# Strip markdown fences if the model added them despite instructions.
if raw.startswith("```"):
raw = raw.split("```")[1]
if raw.startswith("json"):
raw = raw[4:]
return json.loads(raw)
def print_result(eval_result: dict, threshold: int) -> int:
"""Pretty-print the eval result. Returns exit code (0=pass, 1=fail)."""
score = eval_result.get("score", 0)
match = eval_result.get("root_cause_match", "unknown")
reasoning = eval_result.get("reasoning", "")
missing = eval_result.get("missing_elements", [])
false_pos = eval_result.get("false_positives", [])
passed = score >= threshold
status = "PASS" if passed else "FAIL"
print(f"\n{'=' * 60}")
print(f" Eval result: {status}")
print(f" Score: {score}/10 (threshold: {threshold})")
print(f" Root cause: {match}")
print(f"{'=' * 60}")
print(f"\nReasoning:\n {reasoning}")
if missing:
print("\nMissing elements:")
for item in missing:
print(f" - {item}")
if false_pos:
print("\nFalse positives:")
for item in false_pos:
print(f" - {item}")
print()
return 0 if passed else 1
def main() -> int:
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
)
source = parser.add_mutually_exclusive_group(required=True)
source.add_argument("--report", metavar="PATH", help="Path to a saved report JSON file.")
source.add_argument(
"--scenario", metavar="ID", help="Replay scenario ID (runs agent live in replay mode)."
)
parser.add_argument(
"--question", metavar="TEXT", help="User question (required with --scenario)."
)
parser.add_argument(
"--ground-truth",
required=True,
metavar="PATH",
help="Markdown file with the known root cause.",
)
parser.add_argument(
"--save-report", metavar="PATH", help="Save the agent report JSON here (optional)."
)
parser.add_argument(
"--threshold",
type=int,
default=PASS_THRESHOLD,
metavar="N",
help=f"Pass score (0-10, default {PASS_THRESHOLD}).",
)
args = parser.parse_args()
# Load ground truth.
try:
ground_truth = load_ground_truth(args.ground_truth)
except FileNotFoundError as e:
logger.error("%s", e)
return 2
# Get the report.
try:
if args.report:
report = load_report_from_file(args.report)
else:
if not args.question:
parser.error("--question is required when using --scenario")
report = run_agent_and_get_report(args.scenario, args.question)
except Exception as e:
logger.error("Failed to obtain report: %s", e)
return 2
# Optionally save the report.
if args.save_report:
out = Path(args.save_report)
out.parent.mkdir(parents=True, exist_ok=True)
out.write_text(json.dumps(report, indent=2))
logger.info("Report saved to %s", out)
# Judge.
try:
eval_result = call_judge(ground_truth, report)
except json.JSONDecodeError as e:
logger.error("Judge returned invalid JSON: %s", e)
return 2
except Exception as e:
logger.error("Judge call failed: %s", e)
return 2
return print_result(eval_result, args.threshold)
if __name__ == "__main__":
sys.exit(main())