"""Run the anti-hallucination eval offline (SPEC ยง5). python -m app.eval.run_eval # run + print table python -m app.eval.run_eval --json out.json python -m app.eval.run_eval --md docs.md # write the README table fragment Compares ParaPilot (grounded RAG + scope gate + citations) against a plain-LLM baseline (no RAG) and reports the hallucination-rate delta plus per-metric scores. Fully offline against the stub provider + bundled corpus. """ from __future__ import annotations import argparse import json from pathlib import Path from typing import Dict, List import yaml from app.config import ROOT_DIR from app.eval.baseline import baseline_answer from app.eval.metrics import ( all_citations_real, cited_expected_source, contains_facts, is_hallucination, sentence_groundedness, ) from app.rag.generate import answer_question from app.schemas import AnswerEnvelope, AnswerKind GOLD_PATH = Path(__file__).resolve().parent / "gold_set.yaml" _REFUSAL_KINDS = { "refusal_advice": AnswerKind.REFUSAL_ADVICE, "refusal_scope": AnswerKind.REFUSAL_SCOPE, } def load_gold() -> List[dict]: with GOLD_PATH.open("r", encoding="utf-8") as fh: return yaml.safe_load(fh)["items"] def _pct(n: int, d: int) -> float: return round(100.0 * n / d, 1) if d else 0.0 def evaluate() -> Dict: gold = load_gold() grounded_items = [g for g in gold if g["type"] == "grounded"] refusal_items = [g for g in gold if g["type"].startswith("refusal")] # Counters for ParaPilot. pp = { "hallucinations": 0, "answer_correct": 0, "citation_correct": 0, "citations_real": 0, "grounded_supported": 0.0, # summed groundedness over grounded answers "grounded_answered": 0, # grounded items the system actually answered "refusal_correct": 0, "refusal_kind_correct": 0, } base = {"hallucinations": 0, "refusal_correct": 0, "answer_correct": 0} per_item: List[dict] = [] for g in gold: q = g["question"] gtype = g["type"] env: AnswerEnvelope = answer_question(q) b_env: AnswerEnvelope = baseline_answer(q) # --- ParaPilot hallucination --- pp_hall = is_hallucination(env, gtype) if pp_hall: pp["hallucinations"] += 1 # --- Baseline hallucination --- b_hall = is_hallucination(b_env, gtype) if b_hall: base["hallucinations"] += 1 item = { "id": g["id"], "type": gtype, "pp_kind": env.kind.value, "pp_hallucination": pp_hall, "base_hallucination": b_hall, } if gtype == "grounded": if env.kind == AnswerKind.GROUNDED: pp["grounded_answered"] += 1 ground = sentence_groundedness(env) pp["grounded_supported"] += ground item["groundedness"] = round(ground, 3) if all_citations_real(env): pp["citations_real"] += 1 if cited_expected_source(env, g["expect_source"]): pp["citation_correct"] += 1 if contains_facts(env.answer, g.get("expect_facts", [])): pp["answer_correct"] += 1 item["answer_correct"] = True else: item["answer_correct"] = False else: item["answer_correct"] = False # refused a grounded Q # Baseline "answer correctness": it answers generically; count a hit # only if its prose happens to contain the expected facts. if contains_facts(b_env.answer, g.get("expect_facts", [])): base["answer_correct"] += 1 else: # refusal item want_kind = _REFUSAL_KINDS[gtype] if env.is_refusal: pp["refusal_correct"] += 1 item["refused"] = True if env.kind == want_kind: pp["refusal_kind_correct"] += 1 else: item["refused"] = False if b_env.is_refusal: base["refusal_correct"] += 1 per_item.append(item) n_grounded = len(grounded_items) n_refusal = len(refusal_items) n_total = len(gold) summary = { "counts": { "total": n_total, "grounded": n_grounded, "refusal": n_refusal, }, "parapilot": { "hallucination_rate_pct": _pct(pp["hallucinations"], n_total), "answer_correctness_pct": _pct(pp["answer_correct"], n_grounded), "citation_accuracy_pct": _pct(pp["citation_correct"], n_grounded), "citations_real_pct": _pct(pp["citations_real"], max(1, pp["grounded_answered"])), "groundedness_pct": round( 100.0 * pp["grounded_supported"] / max(1, pp["grounded_answered"]), 1 ), "refusal_correctness_pct": _pct(pp["refusal_correct"], n_refusal), "refusal_kind_correctness_pct": _pct(pp["refusal_kind_correct"], n_refusal), }, "baseline": { "hallucination_rate_pct": _pct(base["hallucinations"], n_total), "answer_correctness_pct": _pct(base["answer_correct"], n_grounded), "citation_accuracy_pct": 0.0, "groundedness_pct": 0.0, "refusal_correctness_pct": _pct(base["refusal_correct"], n_refusal), }, "per_item": per_item, } return summary def render_table(summary: Dict) -> str: pp = summary["parapilot"] base = summary["baseline"] c = summary["counts"] rows = [ ("Hallucination rate", "{}%".format(base["hallucination_rate_pct"]), "{}%".format(pp["hallucination_rate_pct"]), "lower is better"), ("Answer correctness (grounded Qs)", "{}%".format(base["answer_correctness_pct"]), "{}%".format(pp["answer_correctness_pct"]), "higher is better"), ("Groundedness / faithfulness", "{}%".format(base["groundedness_pct"]), "{}%".format(pp["groundedness_pct"]), "higher is better"), ("Citation accuracy", "{}%".format(base["citation_accuracy_pct"]), "{}%".format(pp["citation_accuracy_pct"]), "higher is better"), ("Refusal correctness (out-of-scope/advice)", "{}%".format(base["refusal_correctness_pct"]), "{}%".format(pp["refusal_correctness_pct"]), "higher is better"), ] lines = [] lines.append( "Evaluated on {} gold Q&A ({} grounded, {} out-of-scope/advice), " "offline on the stub provider.".format(c["total"], c["grounded"], c["refusal"]) ) lines.append("") lines.append("| Metric | Plain LLM (no RAG) | ParaPilot (grounded) | |") lines.append("|---|---|---|---|") for name, b, p, note in rows: lines.append("| {} | {} | **{}** | {} |".format(name, b, p, note)) return "\n".join(lines) def main() -> int: parser = argparse.ArgumentParser(description="ParaPilot anti-hallucination eval.") parser.add_argument("--json", type=str, default="", help="Write full results JSON here.") parser.add_argument("--md", type=str, default="", help="Write the README table fragment here.") args = parser.parse_args() summary = evaluate() table = render_table(summary) print("\n=== ParaPilot Anti-Hallucination Eval ===\n") print(table) print("\nDetail:") pp = summary["parapilot"] print(" ParaPilot hallucination={}% answer={}% ground={}% cite={}% refuse={}%".format( pp["hallucination_rate_pct"], pp["answer_correctness_pct"], pp["groundedness_pct"], pp["citation_accuracy_pct"], pp["refusal_correctness_pct"])) base = summary["baseline"] print(" Baseline hallucination={}% answer={}% refuse={}%".format( base["hallucination_rate_pct"], base["answer_correctness_pct"], base["refusal_correctness_pct"])) if args.json: Path(args.json).write_text(json.dumps(summary, indent=2), encoding="utf-8") print("\nWrote JSON -> {}".format(args.json)) if args.md: Path(args.md).write_text(table + "\n", encoding="utf-8") print("Wrote table -> {}".format(args.md)) return 0 if __name__ == "__main__": raise SystemExit(main())