Add hallucination-guard (E): pre-gate + V2 cutoff prompt + refusal post-gate

76f8341 verified 5 days ago

2.45 kB

	"""Self-refusal detection (regex-based).

	Used as the post-gate in approach E: after the LLM produces a response, scan
	the first `scan_chars` characters (400 by default) for explicit-refusal
	patterns. If one is found, the model is admitting it doesn't have the
	requested fact -> we either retrieve from an external corpus or surface the
	refusal honestly.

	The patterns target what R1-distill / Qwen-family chat models actually emit
	when they encounter post-cutoff or fictional-entity questions under a strict
	system prompt:
	"I am sorry, I cannot answer that"
	"I don't have information on..."
	"That is after my training cutoff"
	"As of my last update..."
	"I'm not sure..."

	Tested refusal-trigger rate (4 out-of-corpus unknowns):
	V0 loose prompt: 25%
	V1 strict prompt: 50%
	V2 cutoff-aware prompt: 75% <- recommended (see RECOMMENDED_SYSTEM)
	V3 combined (strict + cutoff): 50% (over-specified, worse)

	Residual failure: topic-hijack cases (e.g. "Who won the 2024 Nobel Physics?"
	gets rewritten to a fabricated 1986/1998 Nobel answer). Prompt engineering
	alone cannot fully suppress this. Mitigations: short-latency heuristic,
	topic-blocklist, answer-verification pass.
	"""

	import re


	# Case-insensitive pattern for explicit self-refusal phrases. Top-level
	# alternatives, in order:
	#   1. "I <am sorry|don't|do not|am unable|cannot|can't> <know|have|recall|...>"
	#   2. "I'm not <sure|aware|certain>"
	#   3. "no <information|details|data> <on|about|regarding>"
	#   4. "as of my <training|last update|knowledge>", "my training data",
	#      "training cutoff"
	#   5. "cannot answer that"
	#   6. "don't have <information|details|data>"
	#   7. "after my <training|knowledge> cutoff"
	#   8. "I lack <information|data>"
	#
	# The alternation bars must be bare `|`. An escaped `\|` is a *literal* pipe
	# to the regex engine, which collapses the whole pattern into one literal
	# sequence that never occurs in a model response.
	#
	# Contractions accept both the ASCII apostrophe and the typographic one
	# (U+2019), since model output may contain either; the apostrophe stays
	# optional so "dont" / "cant" / "Im" keep matching as before.
	_REFUSAL_RE = re.compile(
	    r"\b(I (am sorry|don['\u2019]?t|do not|am unable|cannot|can['\u2019]?t)\s+"
	    r"(know|have|recall|remember|answer|provide|find|see))"
	    r"|\b(I['\u2019]?m not (sure|aware|certain))"
	    r"|\b(no (information|details|data) (on|about|regarding))"
	    r"|\b(as of my (training|last update|knowledge)|my training data|training cutoff)"
	    r"|\bcannot answer that\b"
	    r"|\bdon['\u2019]?t have (information|details|data)\b"
	    r"|\bafter my (training|knowledge) cutoff\b"
	    r"|\bI lack (information|data)\b",
	    re.IGNORECASE,
	)


	def detect_refusal(text: str, scan_chars: int = 400) -> bool:
	    """Return True if the start of `text` contains an explicit self-refusal.

	    Only the prefix ``text[:scan_chars]`` is searched, so a refusal phrase
	    that first appears after that many characters is not reported.

	    Args:
	        text: The model response to inspect. Any falsy value (empty string
	            or None) yields False.
	        scan_chars: Number of leading characters to scan; 400 by default.

	    Returns:
	        True when `_REFUSAL_RE` matches anywhere inside the scanned prefix
	        (case-insensitively), otherwise False.
	    """
	    if not text:
	        return False
	    return _REFUSAL_RE.search(text[:scan_chars]) is not None


	# System prompt recommended for production use. It states a knowledge cutoff
	# (March 2024), requires a fixed refusal sentence for anything later, forbids
	# guessing and fabrication, and tells the model to use a Context when one is
	# supplied. Kept deliberately short so the prompt is not over-specified.
	RECOMMENDED_SYSTEM = "".join(
	    (
	        "You are a helpful assistant. Your knowledge cutoff is March 2024. ",
	        "For any question about events, people, or facts after March 2024, ",
	        "you MUST explicitly say 'That is after my training cutoff.' ",
	        "Do not guess. Do not fabricate. ",
	        "If a Context is provided, use it to answer.",
	    )
	)