Spaces:

MapoTofu9
/

why-agent

Sleeping

File size: 7,722 Bytes

5d30bdc

"""Compare an agent-generated report against a ground-truth root cause file.

Usage examples:

  # Compare a saved report JSON against ground truth
  uv run python scripts/eval_report.py \\
      --report report.json \\
      --ground-truth docs/ground_truth_pr_spike.md

  # Run the agent in replay mode, then compare
  uv run python scripts/eval_report.py \\
      --scenario pr_spike \\
      --question "Why did PR open events spike on Jan 15?" \\
      --ground-truth docs/ground_truth_pr_spike.md

  # Save the agent report to a file for later comparison
  uv run python scripts/eval_report.py \\
      --scenario pr_spike \\
      --question "Why did PR open events spike?" \\
      --ground-truth docs/ground_truth.md \\
      --save-report reports/pr_spike_report.json

The script uses MODEL_BACKEND (and associated env vars) from .env for the
judge LLM call. Set MODEL_BACKEND=minimax for dev usage.

Exit codes: 0 = pass (score >= threshold), 1 = fail, 2 = error.
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import sys
from pathlib import Path

from dotenv import load_dotenv
from langchain_core.messages import HumanMessage

load_dotenv()

# Project root on sys.path so agent.* imports work when run from repo root.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from agent.client import get_llm  # noqa: E402

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

PASS_THRESHOLD = 6  # score out of 10; adjustable via --threshold

JUDGE_PROMPT = """\
You are evaluating an AI agent's investigation report against a known ground truth.

## Ground truth (the real root cause)
{ground_truth}

## Agent report
{report_text}

## Evaluation task
Score how well the agent report matches the ground truth root cause.
Return a JSON object with exactly these fields:

{{
  "root_cause_match": "yes" | "partial" | "no",
  "score": <integer 0-10>,
  "reasoning": "<2-3 sentences explaining the score>",
  "missing_elements": ["<thing the report missed>", ...],
  "false_positives": ["<incorrect claim the report made>", ...]
}}

Scoring guide:
  9-10  Correct root cause, correct dimensions/segments, correct magnitude
  7-8   Correct root cause, minor gaps in supporting evidence
  5-6   Partially correct — right area but wrong segment or magnitude
  3-4   Weak — mentioned the right dimension but wrong conclusion
  0-2   Wrong root cause entirely

Return ONLY valid JSON, no markdown fences, no extra text.
"""


def load_ground_truth(path: str) -> str:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Ground truth file not found: {path}")
    return p.read_text().strip()


def load_report_from_file(path: str) -> dict:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Report JSON not found: {path}")
    data = json.loads(p.read_text())
    if "text" not in data:
        raise ValueError(f"Report JSON missing 'text' field: {path}")
    return data


def run_agent_and_get_report(scenario_id: str, question: str) -> dict:
    """Run the agent in replay mode and return final_report."""
    os.environ["MODEL_BACKEND"] = "replay"
    os.environ["REPLAY_SCENARIO_ID"] = scenario_id

    from agent.graph import build_graph
    from agent.state import InvestigationState

    logger.info("Running agent (replay mode, scenario=%s) ...", scenario_id)
    graph = build_graph()
    state = InvestigationState(user_question=question)
    result: InvestigationState = graph.invoke(state)
    if result.final_report is None:
        raise RuntimeError("Agent finished without producing a final_report.")
    return result.final_report


def call_judge(ground_truth: str, report: dict) -> dict:
    """Call the LLM judge and return the parsed eval dict."""
    report_text = report.get("text", "")
    prompt = JUDGE_PROMPT.format(
        ground_truth=ground_truth,
        report_text=report_text,
    )

    llm = get_llm()
    response = llm.invoke(
        [
            HumanMessage(content="You are a precise evaluator. Return only JSON."),
            HumanMessage(content=prompt),
        ]
    )

    raw = response.content.strip()
    # Strip markdown fences if the model added them despite instructions.
    if raw.startswith("```"):
        raw = raw.split("```")[1]
        if raw.startswith("json"):
            raw = raw[4:]

    return json.loads(raw)


def print_result(eval_result: dict, threshold: int) -> int:
    """Pretty-print the eval result. Returns exit code (0=pass, 1=fail)."""
    score = eval_result.get("score", 0)
    match = eval_result.get("root_cause_match", "unknown")
    reasoning = eval_result.get("reasoning", "")
    missing = eval_result.get("missing_elements", [])
    false_pos = eval_result.get("false_positives", [])

    passed = score >= threshold
    status = "PASS" if passed else "FAIL"

    print(f"\n{'=' * 60}")
    print(f"  Eval result: {status}")
    print(f"  Score:       {score}/10  (threshold: {threshold})")
    print(f"  Root cause:  {match}")
    print(f"{'=' * 60}")
    print(f"\nReasoning:\n  {reasoning}")

    if missing:
        print("\nMissing elements:")
        for item in missing:
            print(f"  - {item}")

    if false_pos:
        print("\nFalse positives:")
        for item in false_pos:
            print(f"  - {item}")

    print()
    return 0 if passed else 1


def main() -> int:
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )

    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--report", metavar="PATH", help="Path to a saved report JSON file.")
    source.add_argument(
        "--scenario", metavar="ID", help="Replay scenario ID (runs agent live in replay mode)."
    )

    parser.add_argument(
        "--question", metavar="TEXT", help="User question (required with --scenario)."
    )
    parser.add_argument(
        "--ground-truth",
        required=True,
        metavar="PATH",
        help="Markdown file with the known root cause.",
    )
    parser.add_argument(
        "--save-report", metavar="PATH", help="Save the agent report JSON here (optional)."
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=PASS_THRESHOLD,
        metavar="N",
        help=f"Pass score (0-10, default {PASS_THRESHOLD}).",
    )

    args = parser.parse_args()

    # Load ground truth.
    try:
        ground_truth = load_ground_truth(args.ground_truth)
    except FileNotFoundError as e:
        logger.error("%s", e)
        return 2

    # Get the report.
    try:
        if args.report:
            report = load_report_from_file(args.report)
        else:
            if not args.question:
                parser.error("--question is required when using --scenario")
            report = run_agent_and_get_report(args.scenario, args.question)
    except Exception as e:
        logger.error("Failed to obtain report: %s", e)
        return 2

    # Optionally save the report.
    if args.save_report:
        out = Path(args.save_report)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(json.dumps(report, indent=2))
        logger.info("Report saved to %s", out)

    # Judge.
    try:
        eval_result = call_judge(ground_truth, report)
    except json.JSONDecodeError as e:
        logger.error("Judge returned invalid JSON: %s", e)
        return 2
    except Exception as e:
        logger.error("Judge call failed: %s", e)
        return 2

    return print_result(eval_result, args.threshold)


if __name__ == "__main__":
    sys.exit(main())