# Page header captured with this file: Spaces: LaelaZ / parapilot (Sleeping) | File size: 8,329 Bytes | d787a09

"""Run the anti-hallucination eval offline (SPEC §5).

    python -m app.eval.run_eval                # run + print table
    python -m app.eval.run_eval --json out.json
    python -m app.eval.run_eval --md docs.md   # write the README table fragment

Compares ParaPilot (grounded RAG + scope gate + citations) against a plain-LLM
baseline (no RAG) and reports the hallucination-rate delta plus per-metric
scores. Fully offline against the stub provider + bundled corpus.
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Dict, List

import yaml

from app.config import ROOT_DIR
from app.eval.baseline import baseline_answer
from app.eval.metrics import (
    all_citations_real,
    cited_expected_source,
    contains_facts,
    is_hallucination,
    sentence_groundedness,
)
from app.rag.generate import answer_question
from app.schemas import AnswerEnvelope, AnswerKind

# The gold Q&A set is a YAML file that sits in the same directory as this module.
GOLD_PATH = Path(__file__).resolve().with_name("gold_set.yaml")

# Maps a refusal item's gold ``type`` label to the AnswerKind the system is
# expected to return for it.
_REFUSAL_KINDS = dict(
    refusal_advice=AnswerKind.REFUSAL_ADVICE,
    refusal_scope=AnswerKind.REFUSAL_SCOPE,
)


def load_gold() -> List[dict]:
    """Parse the gold-set YAML at ``GOLD_PATH`` and return its ``items`` entry."""
    with GOLD_PATH.open(mode="r", encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document["items"]


def _pct(n: int, d: int) -> float:
    return round(100.0 * n / d, 1) if d else 0.0


def evaluate() -> Dict:
    """Run every gold item through ParaPilot and the baseline and tally scores.

    Each gold item is answered once by ``answer_question`` and once by
    ``baseline_answer``. Items whose ``type`` is ``"grounded"`` feed the
    answer-correctness, citation and groundedness tallies. Every other item
    is handled as a refusal case, and its ``type`` must be a key of
    ``_REFUSAL_KINDS`` (otherwise a ``KeyError`` is raised).

    Returns:
        A dict with the keys ``"counts"`` (item totals), ``"parapilot"`` and
        ``"baseline"`` (percentage metrics), and ``"per_item"`` (one record
        per gold item, in gold-set order).
    """
    gold_items = load_gold()
    grounded_total = sum(1 for entry in gold_items if entry["type"] == "grounded")
    refusal_total = sum(1 for entry in gold_items if entry["type"].startswith("refusal"))

    # ParaPilot tallies.
    pilot_hallucinations = 0
    pilot_answers_ok = 0
    pilot_cites_expected = 0
    pilot_cites_real = 0
    pilot_groundedness_sum = 0.0  # summed groundedness over grounded answers
    pilot_grounded_answered = 0   # grounded items the system actually answered
    pilot_refusals = 0
    pilot_refusal_kinds_ok = 0

    # Plain-LLM baseline tallies.
    plain_hallucinations = 0
    plain_refusals = 0
    plain_answers_ok = 0

    records: List[dict] = []

    for entry in gold_items:
        question = entry["question"]
        label = entry["type"]

        pilot: AnswerEnvelope = answer_question(question)
        plain: AnswerEnvelope = baseline_answer(question)

        # Hallucination verdicts for both systems.
        pilot_hallucinated = is_hallucination(pilot, label)
        if pilot_hallucinated:
            pilot_hallucinations += 1
        plain_hallucinated = is_hallucination(plain, label)
        if plain_hallucinated:
            plain_hallucinations += 1

        record = {
            "id": entry["id"],
            "type": label,
            "pp_kind": pilot.kind.value,
            "pp_hallucination": pilot_hallucinated,
            "base_hallucination": plain_hallucinated,
        }

        if label == "grounded":
            if pilot.kind == AnswerKind.GROUNDED:
                pilot_grounded_answered += 1
                score = sentence_groundedness(pilot)
                pilot_groundedness_sum += score
                record["groundedness"] = round(score, 3)
                if all_citations_real(pilot):
                    pilot_cites_real += 1
                if cited_expected_source(pilot, entry["expect_source"]):
                    pilot_cites_expected += 1
                facts_found = contains_facts(pilot.answer, entry.get("expect_facts", []))
                if facts_found:
                    pilot_answers_ok += 1
                record["answer_correct"] = True if facts_found else False
            else:
                # The system refused a question it should have answered.
                record["answer_correct"] = False
            # The baseline answers generically; it scores a hit only when its
            # prose happens to contain the expected facts.
            if contains_facts(plain.answer, entry.get("expect_facts", [])):
                plain_answers_ok += 1
        else:
            # Refusal item: credit any refusal, and separately the exact kind.
            expected_kind = _REFUSAL_KINDS[label]
            if pilot.is_refusal:
                pilot_refusals += 1
                record["refused"] = True
                if pilot.kind == expected_kind:
                    pilot_refusal_kinds_ok += 1
            else:
                record["refused"] = False
            if plain.is_refusal:
                plain_refusals += 1

        records.append(record)

    total = len(gold_items)
    # Citation-reality and groundedness are averaged over answered grounded
    # items only; the floor of 1 avoids dividing by zero when none were answered.
    answered_floor = max(1, pilot_grounded_answered)

    counts = {
        "total": total,
        "grounded": grounded_total,
        "refusal": refusal_total,
    }
    parapilot_metrics = {
        "hallucination_rate_pct": _pct(pilot_hallucinations, total),
        "answer_correctness_pct": _pct(pilot_answers_ok, grounded_total),
        "citation_accuracy_pct": _pct(pilot_cites_expected, grounded_total),
        "citations_real_pct": _pct(pilot_cites_real, answered_floor),
        "groundedness_pct": round(100.0 * pilot_groundedness_sum / answered_floor, 1),
        "refusal_correctness_pct": _pct(pilot_refusals, refusal_total),
        "refusal_kind_correctness_pct": _pct(pilot_refusal_kinds_ok, refusal_total),
    }
    baseline_metrics = {
        "hallucination_rate_pct": _pct(plain_hallucinations, total),
        "answer_correctness_pct": _pct(plain_answers_ok, grounded_total),
        # Citation accuracy and groundedness are reported as fixed zeros for
        # the baseline rather than measured.
        "citation_accuracy_pct": 0.0,
        "groundedness_pct": 0.0,
        "refusal_correctness_pct": _pct(plain_refusals, refusal_total),
    }

    return {
        "counts": counts,
        "parapilot": parapilot_metrics,
        "baseline": baseline_metrics,
        "per_item": records,
    }


def render_table(summary: Dict) -> str:
    """Render the eval summary as a Markdown comparison table.

    Args:
        summary: A dict with ``"parapilot"``, ``"baseline"`` and ``"counts"``
            sub-dicts, as returned by ``evaluate``.

    Returns:
        A newline-joined string: one introductory sentence, a blank line, the
        table header and separator, then one row per metric with the baseline
        value first and the ParaPilot value in bold.
    """
    ours = summary["parapilot"]
    plain = summary["baseline"]
    counts = summary["counts"]

    # (row label, metric key shared by both sub-dicts, direction note)
    metric_specs = (
        ("Hallucination rate", "hallucination_rate_pct", "lower is better"),
        ("Answer correctness (grounded Qs)", "answer_correctness_pct", "higher is better"),
        ("Groundedness / faithfulness", "groundedness_pct", "higher is better"),
        ("Citation accuracy", "citation_accuracy_pct", "higher is better"),
        ("Refusal correctness (out-of-scope/advice)", "refusal_correctness_pct",
         "higher is better"),
    )
    body_rows = [
        "| {} | {} | **{}** | {} |".format(
            label, "{}%".format(plain[key]), "{}%".format(ours[key]), note
        )
        for label, key, note in metric_specs
    ]

    intro = (
        "Evaluated on {} gold Q&A ({} grounded, {} out-of-scope/advice), "
        "offline on the stub provider.".format(
            counts["total"], counts["grounded"], counts["refusal"]
        )
    )
    header = [
        intro,
        "",
        "| Metric | Plain LLM (no RAG) | ParaPilot (grounded) | |",
        "|---|---|---|---|",
    ]
    return "\n".join(header + body_rows)


def main() -> int:
    """Command-line entry point: run the eval, print it, optionally save it.

    Reads ``--json`` and ``--md`` from the command line. The table and a
    one-line detail summary per system are always printed; the full results
    are written as JSON and/or the table as Markdown when the corresponding
    path is supplied.

    Returns:
        ``0`` always.
    """
    cli = argparse.ArgumentParser(description="ParaPilot anti-hallucination eval.")
    cli.add_argument("--json", type=str, default="", help="Write full results JSON here.")
    cli.add_argument("--md", type=str, default="", help="Write the README table fragment here.")
    options = cli.parse_args()

    results = evaluate()
    markdown = render_table(results)

    print("\n=== ParaPilot Anti-Hallucination Eval ===\n")
    print(markdown)
    print("\nDetail:")

    ours = results["parapilot"]
    ours_keys = (
        "hallucination_rate_pct",
        "answer_correctness_pct",
        "groundedness_pct",
        "citation_accuracy_pct",
        "refusal_correctness_pct",
    )
    print("  ParaPilot   hallucination={}%  answer={}%  ground={}%  cite={}%  refuse={}%".format(
        *[ours[key] for key in ours_keys]))

    plain = results["baseline"]
    plain_keys = (
        "hallucination_rate_pct",
        "answer_correctness_pct",
        "refusal_correctness_pct",
    )
    print("  Baseline    hallucination={}%  answer={}%  refuse={}%".format(
        *[plain[key] for key in plain_keys]))

    # Empty-string defaults mean "not requested", so truthiness is the switch.
    if options.json:
        json_target = Path(options.json)
        json_target.write_text(json.dumps(results, indent=2), encoding="utf-8")
        print("\nWrote JSON -> {}".format(options.json))
    if options.md:
        md_target = Path(options.md)
        md_target.write_text(markdown + "\n", encoding="utf-8")
        print("Wrote table -> {}".format(options.md))

    return 0


# Script entry point: run main() and exit the process with its return value.
if __name__ == "__main__":
    raise SystemExit(main())