Photon-6B / hedge_reader.py

Photon-6B: the honest local honesty package (Lucidia/Photon)

95f3259 verified 19 days ago

12.5 kB

	"""Aperture · Hedge reader — the second off-map signal, read from the answer's WORDS.

	Iris's primary signal is the answer-logprob trajectory (`aperture.iris`): a grounded answer holds steady
	confidence, a fabricated one's confidence collapses. That signal is strong on most models — but it has a known
	dead spot (boundary b2). Some models put their honesty in the text, not the logprobs: asked about an
	entity they don't know, they emit a fluent, high-confidence refusal — "There is no record of …", "I'm not
	able to verify …", "I couldn't find any information on …". The tokens of that refusal are perfectly ordinary
	English, so the logprob trajectory stays flat and the logprob probe scores it GROUNDED. (Measured: phi-4
	refuses in text 40/40 but its in-family logprob AUROC is only 0.56 — the probe misses the very case the model
	is being most honest about.)

	This module is the complementary reader: a calibrated lexical detector of epistemic disclaimers in the
	model's OUTPUT TEXT. It is deliberately conservative — it must catch the word-hedger WITHOUT firing on a
	grounded direct answer that happens to contain a hedge token. Two tiers:

	* STRONG — the answer ASSERTS a gap or non-existence ("there is no record of", "does not exist",
	"I can't verify", "I have no information about", "I couldn't find", "this appears to be fictional").
	This is an off-map admission. Fused with the logprob probe, it fires the cert OFF-MAP.
	* SOFT — the answer qualifies a claim it nonetheless makes ("I think", "possibly", "it may be",
	"if I recall", "roughly"). Real uncertainty, but the model is still committing to an answer — so this
	only lifts the band to UNCERTAIN, never to off-map on its own.

	The phrase lists here CONSOLIDATE the (independently-grown, battle-tested) refusal/negation lists already in
	``web/engine.py::_is_refusal``, ``web/council.py``, ``web/debate.py::_NEG`` and the eval ``ABST`` tuples, plus
	the soft-hedge vocabulary from ``aperture/honesty_frames.py``. Pure-stdlib, no model, no network — it reads a
	string. Fail-open to "no hedge" on any malformed input (a reader that can't parse must not invent a signal).

	Calibration: ``read_hedge`` is anchored on a head window (the first ~280 chars, where a model that is
	declining says so) and a small set of guards that keep grounded prose ("There is no doubt that Gates founded
	Microsoft", "I think therefore I am" as a quoted answer) from tripping STRONG. See
	``tests/test_hedge_reader.py`` for the contract and the grounded-prose anti-firing cases.
	"""
	from __future__ import annotations

	import re

	# ── STRONG: the answer admits a knowledge gap / asserts non-existence (an off-map disclaimer). ──────────────
	# Curated UNION of the repo's hardened lists (engine._is_refusal, council pat, debate._NEG, eval ABST), kept
	# as lowercase substrings. Grouped only for readability; matching is a flat substring scan over the head.
	_STRONG_PHRASES: tuple[str, ...] = (
	# explicit non-existence / fictional
	"does not exist", "doesn't exist", "do not exist", "did not exist", "no such",
	"not a real", "is not a real", "not a recognized", "no recognized", "not a known", "no known",
	"appears to be fictional", "seems to be fictional", "is fictional", "purely fictional", "a fictional",
	"made up", "made-up", "imaginary", "invented name", "no real-world", "not a genuine", "no actual",
	"there is no", "there's no", "no record of", "no record", "no historical", "no widely known", "no widely",
	"false premise", "flawed premise", "not appear to exist", "no current",
	# the model cannot find / has no information
	"no information", "no reliable information", "don't have information", "do not have information",
	"i don't have information", "i do not have information", "no verifiable", "no reliable",
	"could not find", "couldn't find", "i couldn't find", "i could not find", "unable to find",
	"cannot find", "can't find", "i can't find", "i cannot find", "not finding", "no evidence",
	"not aware of", "i'm not aware", "i am not aware", "not aware of any", "not familiar with",
	"don't have any record", "do not have any record", "no information about", "any information on",
	"any information about", "not something i", "i'm not able to confirm", "not able to confirm",
	# the model cannot verify / confirm
	"unable to verify", "cannot be verified", "can't verify", "cannot verify", "i can't verify",
	"i cannot verify", "not able to verify", "unable to confirm", "cannot confirm", "can't confirm",
	"not able to confirm", "unable to provide", "cannot be known", "no one knows", "unknowable",
	"impossible to",
	# ── 2026-06-10 registry-seed additions: 29 phrasings recovered from a 20-model / 838-answer
	# adjudication (each verified ZERO false positives on 1550 real-entity answers; the lexical
	# reader had missed 333 refusals, e.g. Claude's "doesn't appear in my knowledge base") ──
	"doesn't appear in my knowledge",
	"does not appear to be a",
	"have any reliable information",
	"not a widely recognized",
	"don't have reliable information",
	"have any specific information",
	"does not correspond to",
	"doesn't appear to be a",
	"don't have specific information",
	"typo in",
	"this name doesn't",
	"might be some confusion",
	"misunderstanding or",
	"not have access to a search",
	"don't have confident information",
	"there isn't a",
	"looks like you might be referring to",
	"not widely recognized",
	"not a well-known",
	"doesn't match any",
	"not a historical figure",
	"isn't a widely",
	"not widely known",
	"not appear to be widely",
	"doesn't correspond to",
	"haven't been able to find",
	"isn't a recognized",
	"do not have specific information",
	"does not appear in any",
	# 2026-06-07 gpt-5.1 family: "not a historical or currently documented real person" (guarded:
	# bare "documented real"/"currently documented" stay grounded — see tests)
	"documented real person", "currently documented real",
	)

	# ── SOFT: the answer qualifies a claim it still makes (genuine uncertainty, not an off-map admission). ──────
	_SOFT_PHRASES: tuple[str, ...] = (
	"i think", "i believe", "i'm not sure", "i am not sure", "not entirely sure", "not 100% sure",
	"not completely sure", "if i recall", "if i remember", "to the best of my", "as far as i know",
	"i'm not certain", "i am not certain", "not entirely certain", "it may be", "it might be", "may have been",
	"might have been", "could be", "possibly", "perhaps", "i would guess", "my best guess", "roughly",
	"approximately", "i'm not entirely", "i am not entirely", "not sure", "uncertain", "it's possible that",
	"it is possible that", "i'm fairly", "i am fairly", "presumably", "i suspect",
	)

	# ── GUARDS: substrings that, when present in the head, neutralise an otherwise-STRONG hit. ──────────────────
	# These are the few grounded-prose idioms that embed a STRONG token without being a disclaimer:
	# "there is no doubt", "there is no question" → emphatic AFFIRMATION, not a gap.
	# "no record number", "no record label" → 'no record' as a noun phrase, not "no record of X".
	# Keep tiny and specific; over-guarding would re-open the dead spot.
	_STRONG_GUARDS: tuple[str, ...] = (
	"there is no doubt", "there's no doubt", "no doubt that", "there is no question", "there's no question",
	"no question that", "without a doubt", "leaves no doubt", "there is no denying",
	)

	# Length, in characters of the normalised text, of the leading window that `_head` slices off and
	# that `read_hedge` scans for guard, strong and soft phrases.
	_HEAD = 280 # disclaimer window: a model that's declining says so up front

	# Matches runs of lowercase ASCII letters, digits and apostrophes.
	# NOTE(review): nothing in the visible part of this file references `_WORD` — confirm whether it is
	# still used elsewhere before relying on it or removing it.
	_WORD = re.compile(r"[a-z0-9']+")


	def _norm(text) -> str:
	if not isinstance(text, str):
	return ""
	# fold typographic quotes to ASCII (gpt-5.1 refuses with "can\u2019t find" \u2014 U+2019 broke the match)
	# and strip markdown emphasis asterisks ("there is no country"), then collapse whitespace
	text = (text.replace("\u2019", "'").replace("\u2018", "'")
	.replace("\u201c", '"').replace("\u201d", '"').replace("*", ""))
	return re.sub(r"\s+", " ", text).strip().lower()


	def _head(text_norm: str) -> str:
	    """Return the leading disclaimer window of already-normalised text (at most ``_HEAD`` characters)."""
	    window = slice(None, _HEAD)
	    return text_norm[window]


	def _matches(haystack: str, phrases: tuple[str, ...]) -> list[str]:
	return [p for p in phrases if p in haystack]


	def read_hedge(text) -> dict:
	    """Read epistemic-disclaimer language out of a model's answer text (output-only, no model).

	    Returns a certificate dict::

	        {"hedge": bool,                           # any hedge (strong OR soft) detected
	         "strength": "strong" | "soft" | "none",
	         "off_map": bool,                         # True only for a STRONG (gap-asserting) hedge
	         "band": "off-map" | "uncertain" | "grounded",
	         "hedge_score": float,                    # 0..1, grows with the number of matched phrases
	         "matched": [str, ...],                   # the phrases that fired (head window)
	         "guarded": [str, ...]}                   # affirmation guard idioms found in the head window

	    ``guarded`` lists every guard idiom present in the head window, whether or not it ended up
	    suppressing a strong hit; it is empty when the input normalises to an empty string.

	    Conservative by construction: STRONG fires only inside the head window and only for phrases that
	    are not fully covered by an affirmation guard. Input that is not a string, or that is empty after
	    normalisation, yields the "grounded / no hedge" certificate (fail-open — never invents a signal).
	    """

	    def _cert(strength: str, band: str, score: float, matched: list, guards: list) -> dict:
	        # single place that fixes the certificate's keys and their order
	        return {"hedge": strength != "none", "strength": strength, "off_map": strength == "strong",
	                "band": band, "hedge_score": score, "matched": matched, "guarded": guards}

	    tn = _norm(text)
	    if not tn:
	        return _cert("none", "grounded", 0.0, [], [])

	    head = _head(tn)
	    guards = _matches(head, _STRONG_GUARDS)
	    raw_strong = _matches(head, _STRONG_PHRASES)
	    # a strong phrase that is part of an affirmation idiom ("there is no doubt …") does not count
	    strong = [p for p in raw_strong if not _guarded(p, head, guards)]
	    soft = _matches(head, _SOFT_PHRASES)

	    if strong:
	        # monotone, saturating: 1 hit already strong evidence, more hits → closer to 1
	        score = min(0.99, 0.80 + 0.06 * (len(strong) - 1) + 0.03 * len(soft))
	        return _cert("strong", "off-map", round(score, 3), strong, guards)
	    if soft:
	        # soft hedges alone are capped below the strong floor, so they never reach off-map
	        score = min(0.78, 0.45 + 0.08 * (len(soft) - 1))
	        return _cert("soft", "uncertain", round(score, 3), soft, guards)
	    return _cert("none", "grounded", 0.0, [], guards)


	def _guarded(phrase: str, head: str, guards: list[str]) -> bool:
	"""Is this strong `phrase`'s occurrence subsumed by an affirmation guard? Only relevant for the handful
	of phrases the guards are built around ('there is no', 'no record')."""
	if not guards:
	return False
	for g in guards:
	if phrase in g: # e.g. phrase "there is no" ⊂ guard "there is no doubt"
	# only suppress if EVERY occurrence of the phrase sits inside a guard occurrence
	if _all_occurrences_inside(phrase, g, head):
	return True
	return False


	def _all_occurrences_inside(phrase: str, guard: str, head: str) -> bool:
	"""True iff every start index of `phrase` in `head` is covered by an occurrence of `guard`."""
	p_idx = _find_all(head, phrase)
	g_spans = [(i, i + len(guard)) for i in _find_all(head, guard)]
	for pi in p_idx:
	if not any(gs <= pi and pi + len(phrase) <= ge for gs, ge in g_spans):
	return False
	return True


	def _find_all(s: str, sub: str) -> list[int]:
	out, i = [], s.find(sub)
	while i != -1:
	out.append(i)
	i = s.find(sub, i + 1)
	return out


	if __name__ == "__main__":  # quick manual check
	    import json
	    import sys

	    # the last sample is the first CLI argument when one is given, otherwise a canned refusal
	    cli_text = sys.argv[1] if len(sys.argv) > 1 else "I couldn't find any information on that film."
	    demo_answers = (
	        "There is no record of a company called Brindlewick Cabinetry. It may be fictional.",
	        "Microsoft was founded by Bill Gates and Paul Allen in 1975.",
	        "I think the capital might be around the coast, but I'm not entirely sure.",
	        "There is no doubt that William Shakespeare wrote Hamlet.",  # guarded — grounded
	        cli_text,
	    )
	    for answer in demo_answers:
	        # one JSON line per sample: a 60-char preview followed by the certificate fields
	        report = {"text": answer[:60]}
	        report.update(read_hedge(answer))
	        print(json.dumps(report))