File size: 7,722 Bytes
5d30bdc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
"""Compare an agent-generated report against a ground-truth root cause file.

Usage examples:

  # Compare a saved report JSON against ground truth
  uv run python scripts/eval_report.py \\
      --report report.json \\
      --ground-truth docs/ground_truth_pr_spike.md

  # Run the agent in replay mode, then compare
  uv run python scripts/eval_report.py \\
      --scenario pr_spike \\
      --question "Why did PR open events spike on Jan 15?" \\
      --ground-truth docs/ground_truth_pr_spike.md

  # Save the agent report to a file for later comparison
  uv run python scripts/eval_report.py \\
      --scenario pr_spike \\
      --question "Why did PR open events spike?" \\
      --ground-truth docs/ground_truth.md \\
      --save-report reports/pr_spike_report.json

The script uses MODEL_BACKEND (and associated env vars) from .env for the
judge LLM call. Set MODEL_BACKEND=minimax for dev usage.

Exit codes: 0 = pass (score >= threshold), 1 = fail, 2 = error.
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import sys
from pathlib import Path

from dotenv import load_dotenv
from langchain_core.messages import HumanMessage

# Load variables from a .env file before anything below consults the
# environment (the module docstring notes MODEL_BACKEND is taken from .env).
load_dotenv()

# Put the directory two levels above this file on sys.path so that the
# `agent.*` imports below resolve. NOTE(review): this is the repo root only if
# the script lives one directory below it (e.g. scripts/, as in the usage
# examples) — confirm if the file is moved.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

# Must come after the sys.path tweak above, hence the E402 suppression.
from agent.client import get_llm  # noqa: E402

# Module-level logger; basicConfig sets INFO level and a "LEVEL message" format
# on the root logger as an import-time side effect.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

# Default minimum judge score that counts as a pass.
PASS_THRESHOLD = 6  # score out of 10; adjustable via --threshold

# Prompt template sent to the judge LLM. It is rendered with str.format() in
# call_judge(), which fills the {ground_truth} and {report_text} placeholders.
# The doubled braces around the JSON schema are format escapes that render as
# literal "{" and "}" in the final prompt.
JUDGE_PROMPT = """\
You are evaluating an AI agent's investigation report against a known ground truth.

## Ground truth (the real root cause)
{ground_truth}

## Agent report
{report_text}

## Evaluation task
Score how well the agent report matches the ground truth root cause.
Return a JSON object with exactly these fields:

{{
  "root_cause_match": "yes" | "partial" | "no",
  "score": <integer 0-10>,
  "reasoning": "<2-3 sentences explaining the score>",
  "missing_elements": ["<thing the report missed>", ...],
  "false_positives": ["<incorrect claim the report made>", ...]
}}

Scoring guide:
  9-10  Correct root cause, correct dimensions/segments, correct magnitude
  7-8   Correct root cause, minor gaps in supporting evidence
  5-6   Partially correct — right area but wrong segment or magnitude
  3-4   Weak — mentioned the right dimension but wrong conclusion
  0-2   Wrong root cause entirely

Return ONLY valid JSON, no markdown fences, no extra text.
"""


def load_ground_truth(path: str) -> str:
    """Read the ground-truth root-cause document and return its text.

    Args:
        path: Filesystem path to the ground-truth file.

    Returns:
        The file contents decoded as UTF-8, with leading and trailing
        whitespace removed.

    Raises:
        FileNotFoundError: If ``path`` does not exist or is not a regular file.
    """
    p = Path(path)
    # is_file() rather than exists(): a directory would otherwise pass the
    # check and fail later with IsADirectoryError, which callers that handle
    # only FileNotFoundError would not catch.
    if not p.is_file():
        raise FileNotFoundError(f"Ground truth file not found: {path}")
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    return p.read_text(encoding="utf-8").strip()


def load_report_from_file(path: str) -> dict:
    """Load a saved agent report from a JSON file.

    Args:
        path: Filesystem path to the report JSON.

    Returns:
        The parsed report, a JSON object containing at least a ``"text"`` key.

    Raises:
        FileNotFoundError: If ``path`` does not exist or is not a regular file.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass), if the top-level value is not an object,
            or if the ``"text"`` field is missing.
    """
    p = Path(path)
    if not p.is_file():
        raise FileNotFoundError(f"Report JSON not found: {path}")
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    data = json.loads(p.read_text(encoding="utf-8"))
    # Without this check a payload such as ["text"] would satisfy the
    # membership test below and a list would be returned as the "report".
    if not isinstance(data, dict):
        raise ValueError(
            f"Report JSON must be an object, got {type(data).__name__}: {path}"
        )
    if "text" not in data:
        raise ValueError(f"Report JSON missing 'text' field: {path}")
    return data


def run_agent_and_get_report(scenario_id: str, question: str) -> dict:
    """Run the agent in replay mode and return its final report.

    MODEL_BACKEND and REPLAY_SCENARIO_ID are overridden in ``os.environ`` only
    for the duration of the agent run and restored afterwards, so that later
    LLM calls in this process (the judge) see the backend configured in .env,
    as the module docstring describes.

    Args:
        scenario_id: Replay scenario ID, exported as REPLAY_SCENARIO_ID.
        question: User question the investigation starts from.

    Returns:
        The ``final_report`` produced by the agent graph.

    Raises:
        RuntimeError: If the agent finishes without a ``final_report``.
    """
    overrides = {"MODEL_BACKEND": "replay", "REPLAY_SCENARIO_ID": scenario_id}
    previous = {key: os.environ.get(key) for key in overrides}
    os.environ.update(overrides)
    try:
        # Imported only after the environment is switched, matching the
        # original ordering in case these modules read it at import time.
        from agent.graph import build_graph
        from agent.state import InvestigationState

        logger.info("Running agent (replay mode, scenario=%s) ...", scenario_id)
        graph = build_graph()
        state = InvestigationState(user_question=question)
        result = graph.invoke(state)
    finally:
        # Put the environment back exactly as it was, removing keys that were
        # not set before the run.
        for key, old_value in previous.items():
            if old_value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = old_value

    # NOTE(review): some graph runtimes return the final state from invoke()
    # as a plain dict rather than a state object; accept both shapes. Confirm
    # against what build_graph() actually returns.
    if isinstance(result, dict):
        final_report = result.get("final_report")
    else:
        final_report = result.final_report

    if final_report is None:
        raise RuntimeError("Agent finished without producing a final_report.")
    return final_report


def _message_text(content) -> str:
    """Flatten an LLM message ``content`` value into a single string."""
    if isinstance(content, str):
        return content
    # NOTE(review): message content may also arrive as a list of strings and/or
    # dict blocks; only string parts and the "text" entry of dict parts are
    # kept here — confirm against the backends get_llm() can return.
    pieces = []
    for part in content or []:
        if isinstance(part, str):
            pieces.append(part)
        elif isinstance(part, dict):
            pieces.append(str(part.get("text", "")))
    return "".join(pieces)


def _parse_judge_json(raw: str) -> dict:
    """Parse the judge reply into a dict, tolerating fences or prose around it."""
    text = raw.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The model may wrap the object in markdown fences or add commentary
        # despite the instructions; retry on the outermost {...} span.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        parsed = json.loads(text[start : end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Judge returned JSON of type {type(parsed).__name__}, expected an object."
        )
    return parsed


def call_judge(ground_truth: str, report: dict) -> dict:
    """Call the LLM judge and return the parsed eval dict.

    Args:
        ground_truth: Text of the known root cause.
        report: Agent report; its ``"text"`` entry (empty string if absent) is
            what gets judged.

    Returns:
        The JSON object returned by the judge, as a dict.

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
        ValueError: If the reply parses to something other than a JSON object.
    """
    report_text = report.get("text", "")
    prompt = JUDGE_PROMPT.format(
        ground_truth=ground_truth,
        report_text=report_text,
    )

    llm = get_llm()
    response = llm.invoke(
        [
            HumanMessage(content="You are a precise evaluator. Return only JSON."),
            HumanMessage(content=prompt),
        ]
    )

    return _parse_judge_json(_message_text(response.content))


def _coerce_score(value):
    """Return ``value`` as a number, or ``None`` if it cannot be read as one."""
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        try:
            number = float(value.strip())
        except ValueError:
            return None
        return int(number) if number.is_integer() else number
    return None


def _as_items(value) -> list:
    """Normalise a list-like eval field; a bare string becomes a one-item list."""
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


def print_result(eval_result: dict, threshold: int) -> int:
    """Pretty-print the eval result. Returns exit code (0=pass, 1=fail).

    The judge output comes from an LLM, so its fields are not trusted to have
    the requested types: a numeric string score (e.g. ``"7"``) is converted,
    and a score that cannot be read as a number is reported and treated as 0
    instead of raising on the threshold comparison.

    Args:
        eval_result: Parsed judge output; all keys are optional.
        threshold: Minimum score that counts as a pass.

    Returns:
        0 if the score is at least ``threshold``, otherwise 1.
    """
    raw_score = eval_result.get("score", 0)
    score = _coerce_score(raw_score)
    score_unreadable = score is None
    if score_unreadable:
        score = 0

    match = eval_result.get("root_cause_match", "unknown")
    reasoning = eval_result.get("reasoning", "")
    missing = _as_items(eval_result.get("missing_elements", []))
    false_pos = _as_items(eval_result.get("false_positives", []))

    passed = score >= threshold
    status = "PASS" if passed else "FAIL"

    print(f"\n{'=' * 60}")
    print(f"  Eval result: {status}")
    print(f"  Score:       {score}/10  (threshold: {threshold})")
    if score_unreadable:
        print(f"  Note:        unreadable score {raw_score!r} treated as 0")
    print(f"  Root cause:  {match}")
    print(f"{'=' * 60}")
    print(f"\nReasoning:\n  {reasoning}")

    if missing:
        print("\nMissing elements:")
        for item in missing:
            print(f"  - {item}")

    if false_pos:
        print("\nFalse positives:")
        for item in false_pos:
            print(f"  - {item}")

    print()
    return 0 if passed else 1


def main() -> int:
    """Parse CLI arguments, obtain a report, judge it, and print the verdict.

    Returns:
        0 when the judged score meets the threshold, 1 when it does not, and 2
        on any error (unreadable inputs, agent failure, report save failure,
        judge failure), matching the exit codes in the module docstring.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )

    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--report", metavar="PATH", help="Path to a saved report JSON file.")
    source.add_argument(
        "--scenario", metavar="ID", help="Replay scenario ID (runs agent live in replay mode)."
    )

    parser.add_argument(
        "--question", metavar="TEXT", help="User question (required with --scenario)."
    )
    parser.add_argument(
        "--ground-truth",
        required=True,
        metavar="PATH",
        help="Markdown file with the known root cause.",
    )
    parser.add_argument(
        "--save-report", metavar="PATH", help="Save the agent report JSON here (optional)."
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=PASS_THRESHOLD,
        metavar="N",
        help=f"Pass score (0-10, default {PASS_THRESHOLD}).",
    )

    args = parser.parse_args()

    # Validate the argument combination up front, outside any try block:
    # parser.error() exits the process with status 2 via SystemExit.
    if not args.report and not args.question:
        parser.error("--question is required when using --scenario")

    # Load ground truth. OSError covers a missing file as well as permission
    # and directory errors; UnicodeDecodeError covers undecodable content.
    try:
        ground_truth = load_ground_truth(args.ground_truth)
    except (OSError, UnicodeDecodeError) as e:
        logger.error("%s", e)
        return 2

    # Get the report.
    try:
        if args.report:
            report = load_report_from_file(args.report)
        else:
            report = run_agent_and_get_report(args.scenario, args.question)
    except Exception as e:
        logger.error("Failed to obtain report: %s", e)
        return 2

    # Optionally save the report. A failure here must map to exit code 2; an
    # uncaught exception would end the process with status 1, which is
    # reserved for "score below threshold".
    if args.save_report:
        out = Path(args.save_report)
        try:
            out.parent.mkdir(parents=True, exist_ok=True)
            out.write_text(json.dumps(report, indent=2), encoding="utf-8")
        except (OSError, TypeError, ValueError) as e:
            logger.error("Failed to save report to %s: %s", out, e)
            return 2
        logger.info("Report saved to %s", out)

    # Judge.
    try:
        eval_result = call_judge(ground_truth, report)
    except json.JSONDecodeError as e:
        logger.error("Judge returned invalid JSON: %s", e)
        return 2
    except Exception as e:
        logger.error("Judge call failed: %s", e)
        return 2

    return print_result(eval_result, args.threshold)


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())