parapilot / app /eval /run_eval.py
LaelaZ's picture
Deploy ParaPilot to HF Spaces (Docker)
d787a09 verified
"""Run the anti-hallucination eval offline (SPEC §5).
python -m app.eval.run_eval # run + print table
python -m app.eval.run_eval --json out.json
python -m app.eval.run_eval --md docs.md # write the README table fragment
Compares ParaPilot (grounded RAG + scope gate + citations) against a plain-LLM
baseline (no RAG) and reports the hallucination-rate delta plus per-metric
scores. Fully offline against the stub provider + bundled corpus.
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Dict, List
import yaml
from app.config import ROOT_DIR
from app.eval.baseline import baseline_answer
from app.eval.metrics import (
all_citations_real,
cited_expected_source,
contains_facts,
is_hallucination,
sentence_groundedness,
)
from app.rag.generate import answer_question
from app.schemas import AnswerEnvelope, AnswerKind
GOLD_PATH = Path(__file__).resolve().parent / "gold_set.yaml"
_REFUSAL_KINDS = {
"refusal_advice": AnswerKind.REFUSAL_ADVICE,
"refusal_scope": AnswerKind.REFUSAL_SCOPE,
}
def load_gold() -> List[dict]:
with GOLD_PATH.open("r", encoding="utf-8") as fh:
return yaml.safe_load(fh)["items"]
def _pct(n: int, d: int) -> float:
return round(100.0 * n / d, 1) if d else 0.0
def evaluate() -> Dict:
gold = load_gold()
grounded_items = [g for g in gold if g["type"] == "grounded"]
refusal_items = [g for g in gold if g["type"].startswith("refusal")]
# Counters for ParaPilot.
pp = {
"hallucinations": 0,
"answer_correct": 0,
"citation_correct": 0,
"citations_real": 0,
"grounded_supported": 0.0, # summed groundedness over grounded answers
"grounded_answered": 0, # grounded items the system actually answered
"refusal_correct": 0,
"refusal_kind_correct": 0,
}
base = {"hallucinations": 0, "refusal_correct": 0, "answer_correct": 0}
per_item: List[dict] = []
for g in gold:
q = g["question"]
gtype = g["type"]
env: AnswerEnvelope = answer_question(q)
b_env: AnswerEnvelope = baseline_answer(q)
# --- ParaPilot hallucination ---
pp_hall = is_hallucination(env, gtype)
if pp_hall:
pp["hallucinations"] += 1
# --- Baseline hallucination ---
b_hall = is_hallucination(b_env, gtype)
if b_hall:
base["hallucinations"] += 1
item = {
"id": g["id"],
"type": gtype,
"pp_kind": env.kind.value,
"pp_hallucination": pp_hall,
"base_hallucination": b_hall,
}
if gtype == "grounded":
if env.kind == AnswerKind.GROUNDED:
pp["grounded_answered"] += 1
ground = sentence_groundedness(env)
pp["grounded_supported"] += ground
item["groundedness"] = round(ground, 3)
if all_citations_real(env):
pp["citations_real"] += 1
if cited_expected_source(env, g["expect_source"]):
pp["citation_correct"] += 1
if contains_facts(env.answer, g.get("expect_facts", [])):
pp["answer_correct"] += 1
item["answer_correct"] = True
else:
item["answer_correct"] = False
else:
item["answer_correct"] = False # refused a grounded Q
# Baseline "answer correctness": it answers generically; count a hit
# only if its prose happens to contain the expected facts.
if contains_facts(b_env.answer, g.get("expect_facts", [])):
base["answer_correct"] += 1
else: # refusal item
want_kind = _REFUSAL_KINDS[gtype]
if env.is_refusal:
pp["refusal_correct"] += 1
item["refused"] = True
if env.kind == want_kind:
pp["refusal_kind_correct"] += 1
else:
item["refused"] = False
if b_env.is_refusal:
base["refusal_correct"] += 1
per_item.append(item)
n_grounded = len(grounded_items)
n_refusal = len(refusal_items)
n_total = len(gold)
summary = {
"counts": {
"total": n_total,
"grounded": n_grounded,
"refusal": n_refusal,
},
"parapilot": {
"hallucination_rate_pct": _pct(pp["hallucinations"], n_total),
"answer_correctness_pct": _pct(pp["answer_correct"], n_grounded),
"citation_accuracy_pct": _pct(pp["citation_correct"], n_grounded),
"citations_real_pct": _pct(pp["citations_real"], max(1, pp["grounded_answered"])),
"groundedness_pct": round(
100.0 * pp["grounded_supported"] / max(1, pp["grounded_answered"]), 1
),
"refusal_correctness_pct": _pct(pp["refusal_correct"], n_refusal),
"refusal_kind_correctness_pct": _pct(pp["refusal_kind_correct"], n_refusal),
},
"baseline": {
"hallucination_rate_pct": _pct(base["hallucinations"], n_total),
"answer_correctness_pct": _pct(base["answer_correct"], n_grounded),
"citation_accuracy_pct": 0.0,
"groundedness_pct": 0.0,
"refusal_correctness_pct": _pct(base["refusal_correct"], n_refusal),
},
"per_item": per_item,
}
return summary
def render_table(summary: Dict) -> str:
pp = summary["parapilot"]
base = summary["baseline"]
c = summary["counts"]
rows = [
("Hallucination rate", "{}%".format(base["hallucination_rate_pct"]),
"{}%".format(pp["hallucination_rate_pct"]), "lower is better"),
("Answer correctness (grounded Qs)", "{}%".format(base["answer_correctness_pct"]),
"{}%".format(pp["answer_correctness_pct"]), "higher is better"),
("Groundedness / faithfulness", "{}%".format(base["groundedness_pct"]),
"{}%".format(pp["groundedness_pct"]), "higher is better"),
("Citation accuracy", "{}%".format(base["citation_accuracy_pct"]),
"{}%".format(pp["citation_accuracy_pct"]), "higher is better"),
("Refusal correctness (out-of-scope/advice)", "{}%".format(base["refusal_correctness_pct"]),
"{}%".format(pp["refusal_correctness_pct"]), "higher is better"),
]
lines = []
lines.append(
"Evaluated on {} gold Q&A ({} grounded, {} out-of-scope/advice), "
"offline on the stub provider.".format(c["total"], c["grounded"], c["refusal"])
)
lines.append("")
lines.append("| Metric | Plain LLM (no RAG) | ParaPilot (grounded) | |")
lines.append("|---|---|---|---|")
for name, b, p, note in rows:
lines.append("| {} | {} | **{}** | {} |".format(name, b, p, note))
return "\n".join(lines)
def main() -> int:
parser = argparse.ArgumentParser(description="ParaPilot anti-hallucination eval.")
parser.add_argument("--json", type=str, default="", help="Write full results JSON here.")
parser.add_argument("--md", type=str, default="", help="Write the README table fragment here.")
args = parser.parse_args()
summary = evaluate()
table = render_table(summary)
print("\n=== ParaPilot Anti-Hallucination Eval ===\n")
print(table)
print("\nDetail:")
pp = summary["parapilot"]
print(" ParaPilot hallucination={}% answer={}% ground={}% cite={}% refuse={}%".format(
pp["hallucination_rate_pct"], pp["answer_correctness_pct"],
pp["groundedness_pct"], pp["citation_accuracy_pct"], pp["refusal_correctness_pct"]))
base = summary["baseline"]
print(" Baseline hallucination={}% answer={}% refuse={}%".format(
base["hallucination_rate_pct"], base["answer_correctness_pct"],
base["refusal_correctness_pct"]))
if args.json:
Path(args.json).write_text(json.dumps(summary, indent=2), encoding="utf-8")
print("\nWrote JSON -> {}".format(args.json))
if args.md:
Path(args.md).write_text(table + "\n", encoding="utf-8")
print("Wrote table -> {}".format(args.md))
return 0
if __name__ == "__main__":
raise SystemExit(main())