"""Eval metrics (SPEC §5). All metrics are computable offline against the stub + corpus: * groundedness / faithfulness — is every sentence in the answer backed by a cited source chunk? (RAGAS-style, computed by checking that each answer sentence has strong token overlap with at least one cited chunk's text) * citation accuracy — did the answer cite the EXPECTED source, and are all emitted citations real (resolve to retrieved chunks)? * answer correctness — does the grounded answer contain the expected fact(s)? * refusal correctness — did the system refuse exactly the questions it should, with the right refusal kind? * hallucination — for the baseline comparison: an answer that asserts content NOT supported by the corpus (ungrounded) counts as a hallucination. A correct refusal is NOT a hallucination. """ from __future__ import annotations import re from typing import List, Sequence from app.schemas import AnswerEnvelope, AnswerKind _WORD_RE = re.compile(r"[a-z0-9]+") _STOP = { "the", "a", "an", "of", "to", "in", "on", "for", "and", "or", "is", "are", "be", "do", "does", "did", "it", "this", "that", "with", "you", "your", "can", "if", "at", "as", "by", "from", "will", "must", "have", "has", } _SENT_SPLIT = re.compile(r"(?<=[.!?])\s+") def _tok(text: str) -> set: return {w for w in _WORD_RE.findall(text.lower()) if w not in _STOP and len(w) > 1} def contains_facts(answer: str, facts: Sequence[str]) -> bool: """Answer correctness: all expected fact substrings present (case-insensitive).""" low = answer.lower() return all(f.lower() in low for f in facts) def cited_expected_source(env: AnswerEnvelope, expected_source: str) -> bool: return any(c.source_id == expected_source for c in env.citations) def _claims_with_markers(answer: str) -> List[tuple]: """Split an answer into (claim_text, [marker, ...]) units. A citation marker [n] annotates the text that PRECEDES it (the claim, then its source). We therefore cut the answer at each marker and pair the run of text before it with that marker. Trailing text with no marker is its own (unsupported) claim. """ units: List[tuple] = [] last = 0 for m in re.finditer(r"\[(\d+)\]", answer): claim = answer[last : m.start()] # Consecutive markers (e.g. "[1][2]") share the preceding claim. if not claim.strip() and units: units[-1][1].append(m.group(1)) else: units.append((claim, [m.group(1)])) last = m.end() tail = answer[last:] if tail.strip(): units.append((tail, [])) return units def sentence_groundedness(env: AnswerEnvelope, overlap_threshold: float = 0.5) -> float: """Fraction of cited claims supported by the text of the chunk they cite. Each "claim [n]" unit must share at least ``overlap_threshold`` of its content tokens with the cited chunk's snippet. Claims with no citation are unsupported (count against groundedness). Refusals assert nothing. """ if env.kind != AnswerKind.GROUNDED: return 1.0 # refusals assert nothing, so vacuously grounded units = _claims_with_markers(env.answer) if not units: return 0.0 snip_tokens = {c.marker: _tok(c.snippet) for c in env.citations} scored = 0 supported = 0 for claim, markers in units: claim_tok = _tok(claim) if not claim_tok: continue # nothing asserted in this unit; don't score it scored += 1 ok = False for mk in markers: chunk_tok = snip_tokens.get(mk, set()) if not chunk_tok: continue overlap = len(claim_tok & chunk_tok) / max(1, len(claim_tok)) if overlap >= overlap_threshold: ok = True break if ok: supported += 1 return supported / scored if scored else 0.0 def all_citations_real(env: AnswerEnvelope) -> bool: """Every emitted citation has a resolvable URL/source (no dangling cites).""" return all(c.url and c.source_id and c.chunk_id for c in env.citations) def is_hallucination(env: AnswerEnvelope, expected_type: str) -> bool: """Did the system assert ungrounded content? - A correct refusal is never a hallucination. - A grounded answer with no citations, or with low groundedness, is a hallucination (it asserted something it couldn't support). - Answering a question that should have been refused (advice/scope) with asserted content is a hallucination. """ if env.kind == AnswerKind.GROUNDED: if expected_type != "grounded": # Should have refused but instead asserted content. return True if not env.citations: return True return sentence_groundedness(env) < 0.5 # Any refusal asserts nothing substantive -> not a hallucination. return False