Spaces:

LaelaZ
/

parapilot

Sleeping

App Files Files Community

parapilot / app /eval /metrics.py

LaelaZ

Deploy ParaPilot to HF Spaces (Docker)

d787a09 verified 5 days ago

raw

history blame contribute delete

4.97 kB

	"""Eval metrics (SPEC §5).

	All metrics are computable offline against the stub + corpus:

	* groundedness / faithfulness — is every sentence in the answer backed by a
	cited source chunk? (RAGAS-style, computed by checking that each answer
	sentence has strong token overlap with at least one cited chunk's text)
	* citation accuracy — did the answer cite the EXPECTED source, and are all
	emitted citations real (resolve to retrieved chunks)?
	* answer correctness — does the grounded answer contain the expected fact(s)?
	* refusal correctness — did the system refuse exactly the questions it should,
	with the right refusal kind?
	* hallucination — for the baseline comparison: an answer that asserts content
	NOT supported by the corpus (ungrounded) counts as a hallucination. A
	correct refusal is NOT a hallucination.
	"""
	from __future__ import annotations

	import re
	from typing import List, Sequence

	from app.schemas import AnswerEnvelope, AnswerKind

	_WORD_RE = re.compile(r"[a-z0-9]+")
	_STOP = {
	"the", "a", "an", "of", "to", "in", "on", "for", "and", "or", "is", "are",
	"be", "do", "does", "did", "it", "this", "that", "with", "you", "your",
	"can", "if", "at", "as", "by", "from", "will", "must", "have", "has",
	}
	_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+")


	def _tok(text: str) -> set:
	    """Return the set of content tokens found in ``text``.

	    The text is lowercased and scanned with ``_WORD_RE``; single-character
	    tokens and anything listed in ``_STOP`` are discarded.
	    """
	    words = _WORD_RE.findall(text.lower())
	    return {word for word in words if len(word) > 1 and word not in _STOP}


	def contains_facts(answer: str, facts: Sequence[str]) -> bool:
	    """Answer correctness: is every expected fact a substring of the answer?

	    Matching is case-insensitive (both sides are lowercased). An empty
	    ``facts`` sequence yields ``True``.
	    """
	    haystack = answer.lower()
	    for fact in facts:
	        if fact.lower() not in haystack:
	            return False
	    return True


	def cited_expected_source(env: AnswerEnvelope, expected_source: str) -> bool:
	    """Return True when any citation's ``source_id`` equals ``expected_source``."""
	    for citation in env.citations:
	        if citation.source_id == expected_source:
	            return True
	    return False


	def _claims_with_markers(answer: str) -> List[tuple]:
	"""Split an answer into (claim_text, [marker, ...]) units.

	A citation marker [n] annotates the text that PRECEDES it (the claim, then
	its source). We therefore cut the answer at each marker and pair the run of
	text before it with that marker. Trailing text with no marker is its own
	(unsupported) claim.
	"""
	units: List[tuple] = []
	last = 0
	for m in re.finditer(r"\[(\d+)\]", answer):
	claim = answer[last : m.start()]
	# Consecutive markers (e.g. "[1][2]") share the preceding claim.
	if not claim.strip() and units:
	units[-1][1].append(m.group(1))
	else:
	units.append((claim, [m.group(1)]))
	last = m.end()
	tail = answer[last:]
	if tail.strip():
	units.append((tail, []))
	return units


	def sentence_groundedness(env: AnswerEnvelope, overlap_threshold: float = 0.5) -> float:
	    """Fraction of the answer's claims that are backed by a chunk they cite.

	    The answer is split into "claim [n]" units. A unit is supported when, for
	    at least one of its markers, the share of the claim's content tokens that
	    also appear in that citation's snippet is ``>= overlap_threshold``.
	    Units with no marker are scored and count as unsupported; units whose
	    text yields no content tokens are not scored at all.

	    Returns 1.0 for any envelope that is not a grounded answer (a refusal
	    asserts nothing), and 0.0 when there is nothing to score.

	    NOTE(review): markers parsed from the answer are digit strings such as
	    ``"1"``; this assumes ``citation.marker`` uses the same form -- confirm
	    against the schema.
	    """
	    if env.kind != AnswerKind.GROUNDED:
	        return 1.0  # refusals assert nothing, so vacuously grounded

	    claim_units = _claims_with_markers(env.answer)
	    if not claim_units:
	        return 0.0

	    tokens_by_marker = {cit.marker: _tok(cit.snippet) for cit in env.citations}

	    # One boolean per scored claim: True when some cited snippet backs it.
	    verdicts = []
	    for text, cited in claim_units:
	        words = _tok(text)
	        if not words:
	            # Nothing asserted in this unit; leave it out of the score.
	            continue
	        backed = False
	        for marker in cited:
	            source_words = tokens_by_marker.get(marker, set())
	            # Unknown marker or empty snippet: cannot support anything.
	            if source_words and len(words & source_words) / max(1, len(words)) >= overlap_threshold:
	                backed = True
	                break
	        verdicts.append(backed)

	    if not verdicts:
	        return 0.0
	    return sum(verdicts) / len(verdicts)


	def all_citations_real(env: AnswerEnvelope) -> bool:
	    """Return True when no emitted citation is dangling.

	    A citation passes when its ``url``, ``source_id`` and ``chunk_id`` are
	    all truthy. An envelope with no citations passes trivially.
	    """
	    for citation in env.citations:
	        if not (citation.url and citation.source_id and citation.chunk_id):
	            return False
	    return True


	def is_hallucination(env: AnswerEnvelope, expected_type: str) -> bool:
	    """Decide whether the system asserted content it could not support.

	    - Anything other than a grounded answer (i.e. a refusal) asserts nothing
	      substantive and is never a hallucination.
	    - A grounded answer to a question whose expected type is not
	      ``"grounded"`` is a hallucination: it should have been refused.
	    - A grounded answer with no citations is a hallucination.
	    - Otherwise the answer is a hallucination when its groundedness score
	      is below 0.5.
	    """
	    if env.kind != AnswerKind.GROUNDED:
	        return False
	    # Either it should have refused, or it asserted content with no
	    # citation to back it.
	    if expected_type != "grounded" or not env.citations:
	        return True
	    return sentence_groundedness(env) < 0.5