"""Eval metrics (SPEC §5).

All metrics are computable offline against the stub + corpus:

* groundedness / faithfulness — is every claim in the answer backed by the
  source chunk it cites? (RAGAS-style, computed by checking that each
  "claim [n]" unit of the answer has strong token overlap with the text of at
  least one chunk it cites)
* citation accuracy — did the answer cite the EXPECTED source, and are all
  emitted citations real (resolve to retrieved chunks)?
* answer correctness — does the grounded answer contain the expected fact(s)?
* refusal correctness — did the system refuse exactly the questions it should,
  with the right refusal kind?
* hallucination — for the baseline comparison: an answer that asserts content
  NOT supported by the corpus (ungrounded) counts as a hallucination. A
  correct refusal is NOT a hallucination.
"""
| from __future__ import annotations |
|
|
| import re |
| from typing import List, Sequence |
|
|
| from app.schemas import AnswerEnvelope, AnswerKind |
|
|
# Word tokenizer: maximal runs of lowercase ASCII letters/digits. Input must be
# lowercased first (see _tok); anything else acts as a separator.
_WORD_RE = re.compile(r"[a-z0-9]+")

# Function words that carry no factual content; they are dropped before the
# claim/snippet vocabularies are compared so they cannot inflate the overlap.
_STOP = {
    # articles and conjunctions
    "the", "a", "an", "and", "or", "if",
    # prepositions
    "of", "to", "in", "on", "for", "with", "at", "as", "by", "from",
    # pronouns and determiners
    "it", "this", "that", "you", "your",
    # auxiliaries and modals
    "is", "are", "be", "do", "does", "did", "can", "will", "must", "have", "has",
}

# Sentence boundary: whitespace that directly follows '.', '!' or '?'.
# Not referenced in the visible part of this module.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+")
|
|
|
|
def _tok(text: str) -> set:
    """Return the set of content tokens in ``text``.

    The text is lowercased and split into runs matched by ``_WORD_RE``;
    single-character tokens and stopwords from ``_STOP`` are discarded.
    """
    words = _WORD_RE.findall(text.lower())
    long_enough = {word for word in words if len(word) > 1}
    return long_enough - _STOP
|
|
|
|
def contains_facts(answer: str, facts: Sequence[str]) -> bool:
    """Answer correctness: all expected fact substrings present (case-insensitive).

    Each entry of ``facts`` must occur in ``answer`` as a plain substring.
    Both sides are compared after ``str.casefold()`` rather than ``lower()``,
    so caseless matching also holds for non-ASCII text (e.g. "STRASSE"
    matches "straße"). An empty ``facts`` sequence is vacuously satisfied and
    returns True.
    """
    # Fold the answer once; it is the same haystack for every fact.
    haystack = answer.casefold()
    return all(fact.casefold() in haystack for fact in facts)
|
|
|
|
def cited_expected_source(env: AnswerEnvelope, expected_source: str) -> bool:
    """Citation accuracy: does any emitted citation point at ``expected_source``?

    Each citation's ``source_id`` is compared for equality with
    ``expected_source``. An envelope without citations yields False.
    """
    for citation in env.citations:
        if citation.source_id == expected_source:
            return True
    return False
|
|
|
|
def _claims_with_markers(answer: str) -> List[tuple]:
    """Split an answer into (claim_text, [marker, ...]) units.

    A citation marker [n] annotates the text that PRECEDES it (the claim, then
    its source). We therefore cut the answer at each marker and pair the run of
    text before it with that marker. Markers are returned as the digit strings
    captured from ``[n]`` (e.g. "1"), not as ints.

    Consecutive markers cite the same claim: when the text between two markers
    contains no word characters (nothing, whitespace, or only separators as in
    "[1][2]", "[1], [2]" or "[1]; [2]"), the later marker is appended to the
    previous unit instead of opening a new, contentless claim. A marker that
    appears before any unit exists is kept as its own unit with whatever text
    precedes it.

    Trailing text with no marker is its own (unsupported) claim.
    """
    units: List[tuple] = []
    last = 0
    for m in re.finditer(r"\[(\d+)\]", answer):
        claim = answer[last : m.start()]
        # A gap made only of whitespace/punctuation is a separator between
        # markers, not a claim: without this, "claim [1], [2]" would attach
        # [2] to the junk claim "," and never check it against the real one.
        is_separator_only = re.search(r"\w", claim) is None
        if units and is_separator_only:
            units[-1][1].append(m.group(1))
        else:
            units.append((claim, [m.group(1)]))
        last = m.end()
    tail = answer[last:]
    if tail.strip():
        units.append((tail, []))
    return units
|
|
|
|
def sentence_groundedness(env: AnswerEnvelope, overlap_threshold: float = 0.5) -> float:
    """Fraction of scoreable claims that are backed by a chunk they cite.

    The answer is cut into "claim [n]" units by ``_claims_with_markers``. A
    unit counts as supported when, for at least one of its markers, the share
    of the claim's content tokens that also occur in that citation's snippet is
    at least ``overlap_threshold``. Units without any marker are scored and can
    never be supported, so they pull the score down. Units whose text yields no
    content tokens are left out of the score entirely.

    Returns 1.0 for any envelope whose kind is not GROUNDED (a refusal asserts
    nothing), and 0.0 when the answer produces no units or no scoreable unit.
    """
    if env.kind != AnswerKind.GROUNDED:
        return 1.0

    claim_units = _claims_with_markers(env.answer)
    if not claim_units:
        return 0.0

    # NOTE(review): lookups below use the digit strings captured from "[n]";
    # this assumes each citation's ``marker`` is that same string — confirm
    # against the AnswerEnvelope schema.
    tokens_by_marker = {cite.marker: _tok(cite.snippet) for cite in env.citations}

    verdicts = []
    for text, cited_markers in claim_units:
        claim_tokens = _tok(text)
        if not claim_tokens:
            # Nothing to verify (punctuation, stopwords only): not scored.
            continue
        verdicts.append(
            any(
                len(claim_tokens & tokens_by_marker[marker]) / len(claim_tokens)
                >= overlap_threshold
                for marker in cited_markers
                # Unknown markers and empty snippets cannot support a claim.
                if tokens_by_marker.get(marker)
            )
        )

    if not verdicts:
        return 0.0
    return sum(verdicts) / len(verdicts)
|
|
|
|
def all_citations_real(env: AnswerEnvelope) -> bool:
    """Every emitted citation is fully populated (no dangling cites).

    A citation passes when its ``url``, ``source_id`` and ``chunk_id`` are all
    truthy; nothing is looked up or resolved here. An envelope with no
    citations returns True.
    """
    for cite in env.citations:
        if not (cite.url and cite.source_id and cite.chunk_id):
            return False
    return True
|
|
|
|
def is_hallucination(env: AnswerEnvelope, expected_type: str) -> bool:
    """Did the system assert ungrounded content?

    Only envelopes of kind GROUNDED can hallucinate; every other kind (i.e. a
    refusal) returns False. A GROUNDED envelope is a hallucination when any of
    the following holds, checked in this order:

    - ``expected_type`` is not "grounded": the question should have been
      refused (advice/scope) but content was asserted anyway.
    - it carries no citations at all.
    - its ``sentence_groundedness`` score is below 0.5.
    """
    if env.kind != AnswerKind.GROUNDED:
        # Nothing was asserted, so nothing can be ungrounded.
        return False
    if expected_type != "grounded" or not env.citations:
        return True
    return sentence_groundedness(env) < 0.5
|
|