parapilot / app /eval /metrics.py
LaelaZ's picture
Deploy ParaPilot to HF Spaces (Docker)
d787a09 verified
"""Eval metrics (SPEC §5).
All metrics are computable offline against the stub + corpus:
* groundedness / faithfulness — is every claim in the answer backed by the
source chunk it cites? (RAGAS-style, computed by checking that each
"claim [n]" unit has strong token overlap with the text of a chunk it cites)
* citation accuracy — did the answer cite the EXPECTED source, and are all
emitted citations real (resolve to retrieved chunks)?
* answer correctness — does the grounded answer contain the expected fact(s)?
* refusal correctness — did the system refuse exactly the questions it should,
with the right refusal kind?
* hallucination — for the baseline comparison: an answer that asserts content
NOT supported by the corpus (ungrounded) counts as a hallucination. A
correct refusal is NOT a hallucination.
"""
from __future__ import annotations
import re
from typing import List, Sequence
from app.schemas import AnswerEnvelope, AnswerKind
# Runs of lowercase ASCII letters/digits. _tok() lowercases its input before
# applying this, so uppercase text is still tokenised.
_WORD_RE = re.compile(r"[a-z0-9]+")
# Function words that _tok() discards so they do not count toward the
# claim/snippet token overlap.
_STOP = {
"the", "a", "an", "of", "to", "in", "on", "for", "and", "or", "is", "are",
"be", "do", "does", "did", "it", "this", "that", "with", "you", "your",
"can", "if", "at", "as", "by", "from", "will", "must", "have", "has",
}
# Matches whitespace that follows '.', '!' or '?' (presumably a sentence
# splitter). NOTE(review): not referenced by the code visible in this section.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+")
def _tok(text: str) -> set:
    """Return the set of content tokens found in *text*.

    The text is lowercased and scanned with ``_WORD_RE``; stop words and
    single-character tokens are then dropped.
    """
    candidates = set(_WORD_RE.findall(text.lower()))
    # Remove stop words first, then anything too short to be meaningful.
    return {word for word in candidates - _STOP if len(word) > 1}
def contains_facts(answer: str, facts: Sequence[str]) -> bool:
    """Answer correctness: every expected fact occurs in the answer.

    Matching is a case-insensitive substring test.

    Args:
        answer: The answer text to inspect.
        facts: Expected fact substrings. A single bare string is treated as
            one fact rather than being iterated character by character.

    Returns:
        True when every fact is a substring of ``answer`` (ignoring case).
        An empty ``facts`` sequence is vacuously True.
    """
    # A str is itself a Sequence[str]; without this guard a lone string would
    # be checked one character at a time and match almost any answer.
    if isinstance(facts, str):
        facts = (facts,)
    haystack = answer.lower()
    return all(fact.lower() in haystack for fact in facts)
def cited_expected_source(env: AnswerEnvelope, expected_source: str) -> bool:
    """Report whether any citation on *env* has ``source_id == expected_source``."""
    for cite in env.citations:
        if cite.source_id == expected_source:
            return True
    return False
def _claims_with_markers(answer: str) -> List[tuple]:
"""Split an answer into (claim_text, [marker, ...]) units.
A citation marker [n] annotates the text that PRECEDES it (the claim, then
its source). We therefore cut the answer at each marker and pair the run of
text before it with that marker. Trailing text with no marker is its own
(unsupported) claim.
"""
units: List[tuple] = []
last = 0
for m in re.finditer(r"\[(\d+)\]", answer):
claim = answer[last : m.start()]
# Consecutive markers (e.g. "[1][2]") share the preceding claim.
if not claim.strip() and units:
units[-1][1].append(m.group(1))
else:
units.append((claim, [m.group(1)]))
last = m.end()
tail = answer[last:]
if tail.strip():
units.append((tail, []))
return units
def sentence_groundedness(env: AnswerEnvelope, overlap_threshold: float = 0.5) -> float:
    """Fraction of scored claims that are supported by a chunk they cite.

    The answer is split into "claim [n]" units. A unit is supported when, for
    at least one of its markers, the share of the claim's content tokens that
    also appear in that citation's snippet is >= ``overlap_threshold``.
    Units without any marker are scored as unsupported; units whose text
    yields no content tokens are not scored at all.

    Returns:
        1.0 for any non-GROUNDED envelope (a refusal asserts nothing),
        0.0 when the answer has no units or no scorable units,
        otherwise supported / scored.

    NOTE(review): markers parsed from the answer are digit strings; this
    lookup assumes ``citation.marker`` uses the same representation —
    confirm against the schema.
    """
    if env.kind != AnswerKind.GROUNDED:
        return 1.0  # refusals assert nothing, so vacuously grounded
    claim_units = _claims_with_markers(env.answer)
    if not claim_units:
        return 0.0
    tokens_by_marker = {cite.marker: _tok(cite.snippet) for cite in env.citations}

    def _is_supported(words: set, markers: list) -> bool:
        # One sufficiently overlapping cited chunk is enough.
        for marker in markers:
            cited_words = tokens_by_marker.get(marker, set())
            if not cited_words:
                continue  # unknown marker or empty snippet
            if len(words & cited_words) / max(1, len(words)) >= overlap_threshold:
                return True
        return False

    verdicts = []
    for text, markers in claim_units:
        words = _tok(text)
        if words:  # units that assert nothing are left out of the score
            verdicts.append(_is_supported(words, markers))
    if not verdicts:
        return 0.0
    return sum(verdicts) / len(verdicts)
def all_citations_real(env: AnswerEnvelope) -> bool:
    """Check that no citation is dangling.

    A citation passes when its ``url``, ``source_id`` and ``chunk_id`` are all
    truthy. An envelope with no citations passes trivially.
    """
    for cite in env.citations:
        if not (cite.url and cite.source_id and cite.chunk_id):
            return False
    return True
def is_hallucination(env: AnswerEnvelope, expected_type: str) -> bool:
    """Decide whether the envelope asserts content it could not support.

    - Any non-GROUNDED envelope (a refusal) asserts nothing substantive and is
      never a hallucination.
    - A GROUNDED answer to a question whose ``expected_type`` is not
      ``"grounded"`` is a hallucination: it should have refused.
    - A GROUNDED answer with no citations is a hallucination.
    - Otherwise it is a hallucination when ``sentence_groundedness`` is
      below 0.5.
    """
    if env.kind != AnswerKind.GROUNDED:
        return False
    # Either it should have refused, or it asserted content with no citations.
    if expected_type != "grounded" or not env.citations:
        return True
    return sentence_groundedness(env) < 0.5