| """Aperture · **Hedge reader** — the *second* off-map signal, read from the answer's WORDS. |
| |
| Iris's primary signal is the answer-logprob *trajectory* (`aperture.iris`): a grounded answer holds steady |
| confidence, a fabricated one's confidence collapses. That signal is strong on most models — but it has a known |
| **dead spot** (boundary *b2*). Some models put their honesty in the *text*, not the logprobs: asked about an |
| entity they don't know, they emit a *fluent, high-confidence refusal* — "There is no record of …", "I'm not |
| able to verify …", "I couldn't find any information on …". The tokens of that refusal are perfectly ordinary |
| English, so the logprob trajectory stays flat and the logprob probe scores it GROUNDED. (Measured: phi-4 |
| refuses in text 40/40 but its in-family logprob AUROC is only 0.56 — the probe misses the very case the model |
| is being most honest about.) |
| |
| This module is the complementary reader: a calibrated lexical detector of *epistemic disclaimers* in the |
| model's OUTPUT TEXT. It is deliberately conservative — it must catch the word-hedger WITHOUT firing on a |
| grounded direct answer that happens to contain a hedge token. Two tiers: |
| |
| * **STRONG** — the answer ASSERTS a gap or non-existence ("there is no record of", "does not exist", |
| "I can't verify", "I have no information about", "I couldn't find", "this appears to be fictional"). |
| This is an off-map admission. Fused with the logprob probe, it fires the cert OFF-MAP. |
| * **SOFT** — the answer *qualifies* a claim it nonetheless makes ("I think", "possibly", "it may be", |
| "if I recall", "roughly"). Real uncertainty, but the model is still committing to an answer — so this |
| only lifts the band to UNCERTAIN, never to off-map on its own. |
| |
| The phrase lists here CONSOLIDATE the (independently-grown, battle-tested) refusal/negation lists already in |
| ``web/engine.py::_is_refusal``, ``web/council.py``, ``web/debate.py::_NEG`` and the eval ``ABST`` tuples, plus |
| the soft-hedge vocabulary from ``aperture/honesty_frames.py``. Pure-stdlib, no model, no network — it reads a |
| string. Fail-open to "no hedge" on any malformed input (a reader that can't parse must not invent a signal). |
| |
| Calibration: ``read_hedge`` is anchored on a *head window* (the first ~280 chars, where a model that is |
| declining says so) and a small set of guards that keep grounded prose ("There is no doubt that Gates founded |
| Microsoft", "I think therefore I am" as a *quoted* answer) from tripping STRONG. See |
| ``tests/test_hedge_reader.py`` for the contract and the grounded-prose anti-firing cases. |
| """ |
| from __future__ import annotations |
|
|
| import re |
|
|
| |
| |
| |
# STRONG tier: phrases that assert a gap or non-existence. They are searched as plain lowercase
# substrings of the normalized head window, so short entries also fire inside longer phrasings.
# Entries must be unique: every entry that matches is reported in ``matched`` and raises the score.
_STRONG_PHRASES: tuple[str, ...] = (
    # Non-existence, fictional-entity and false-premise assertions.
    "does not exist", "doesn't exist", "do not exist", "did not exist", "no such",
    "not a real", "is not a real", "not a recognized", "no recognized", "not a known", "no known",
    "appears to be fictional", "seems to be fictional", "is fictional", "purely fictional", "a fictional",
    "made up", "made-up", "imaginary", "invented name", "no real-world", "not a genuine", "no actual",
    "there is no", "there's no", "no record of", "no record", "no historical", "no widely known", "no widely",
    "false premise", "flawed premise", "not appear to exist", "no current",
    # No information / could not find / not aware.
    "no information", "no reliable information", "don't have information", "do not have information",
    "i don't have information", "i do not have information", "no verifiable", "no reliable",
    "could not find", "couldn't find", "i couldn't find", "i could not find", "unable to find",
    "cannot find", "can't find", "i can't find", "i cannot find", "not finding", "no evidence",
    "not aware of", "i'm not aware", "i am not aware", "not aware of any", "not familiar with",
    "don't have any record", "do not have any record", "no information about", "any information on",
    "any information about", "not something i", "i'm not able to confirm", "not able to confirm",
    # Cannot verify / confirm / know.
    "unable to verify", "cannot be verified", "can't verify", "cannot verify", "i can't verify",
    "i cannot verify", "not able to verify", "unable to confirm", "cannot confirm", "can't confirm",
    "unable to provide", "cannot be known", "no one knows", "unknowable",
    "impossible to",
    # Longer refusal phrasings, kept one per line.
    "doesn't appear in my knowledge",
    "does not appear to be a",
    "have any reliable information",
    "not a widely recognized",
    "don't have reliable information",
    "have any specific information",
    "does not correspond to",
    "doesn't appear to be a",
    "don't have specific information",
    "typo in",
    "this name doesn't",
    "might be some confusion",
    "misunderstanding or",
    "not have access to a search",
    "don't have confident information",
    "there isn't a",
    "looks like you might be referring to",
    "not widely recognized",
    "not a well-known",
    "doesn't match any",
    "not a historical figure",
    "isn't a widely",
    "not widely known",
    "not appear to be widely",
    "doesn't correspond to",
    "haven't been able to find",
    "isn't a recognized",
    "do not have specific information",
    "does not appear in any",
    # "Not a documented real person" phrasings.
    "documented real person", "currently documented real",
)
|
|
| |
# SOFT tier: phrases that qualify a claim the answer still makes. A soft hit alone only moves the
# band to "uncertain"; it never marks the answer off-map.
_SOFT_PHRASES: tuple[str, ...] = (
    "i think", "i believe",
    "i'm not sure", "i am not sure", "not entirely sure", "not 100% sure", "not completely sure",
    "if i recall", "if i remember", "to the best of my", "as far as i know",
    "i'm not certain", "i am not certain", "not entirely certain",
    "it may be", "it might be", "may have been", "might have been", "could be",
    "possibly", "perhaps",
    "i would guess", "my best guess",
    "roughly", "approximately",
    "i'm not entirely", "i am not entirely", "not sure", "uncertain",
    "it's possible that", "it is possible that",
    "i'm fairly", "i am fairly",
    "presumably", "i suspect",
)
|
|
| |
| |
| |
| |
| |
# Affirmation idioms that look like a gap assertion ("there is no ...") but express certainty.
# A strong phrase whose occurrences all sit inside one of these is not counted as a strong hedge.
_STRONG_GUARDS: tuple[str, ...] = (
    "there is no doubt",
    "there's no doubt",
    "no doubt that",
    "there is no question",
    "there's no question",
    "no question that",
    "without a doubt",
    "leaves no doubt",
    "there is no denying",
)
|
|
# Size, in characters, of the head window: only this leading slice of the normalized answer is
# scanned for hedge phrases and guards.
_HEAD: int = 280

# Matches runs of lowercase letters, digits and apostrophes.
# NOTE(review): not referenced by the code visible in this file - confirm whether callers elsewhere use it.
_WORD: re.Pattern[str] = re.compile(r"[a-z0-9']+")
|
|
|
|
def _norm(text) -> str:
    """Normalize an answer for phrase matching.

    Curly single/double quotes become their ASCII forms, ``*`` characters are removed, every run of
    whitespace collapses to one space, and the result is stripped and lowercased.

    Args:
        text: The model's answer. Anything that is not a ``str`` is treated as empty.

    Returns:
        The normalized string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # One translation pass: typographic quotes -> ASCII quotes, '*' dropped.
    quote_table = str.maketrans({
        "\u2019": "'",
        "\u2018": "'",
        "\u201c": '"',
        "\u201d": '"',
        "*": None,
    })
    cleaned = text.translate(quote_table)
    single_spaced = re.sub(r"\s+", " ", cleaned)
    return single_spaced.strip().lower()
|
|
|
|
def _head(text_norm: str) -> str:
    """Return the head window: the leading ``_HEAD`` characters of an already-normalized answer."""
    window_end = _HEAD
    return text_norm[:window_end]
|
|
|
|
def _matches(haystack: str, phrases: tuple[str, ...]) -> list[str]:
    """Return the phrases that occur in ``haystack``, each at most once.

    Matching is a plain substring test, so both arguments should already be normalized the same way
    (lowercased, single-spaced).

    Args:
        haystack: Text to search.
        phrases: Candidate phrases, in priority/report order.

    Returns:
        The matching phrases in their order of first appearance in ``phrases``. A phrase listed
        more than once in ``phrases`` is reported once, so a duplicated entry cannot inflate
        match counts downstream.
    """
    # dict.fromkeys keeps first-seen order while dropping repeats.
    return list(dict.fromkeys(p for p in phrases if p in haystack))
|
|
|
|
def read_hedge(text) -> dict:
    """Detect epistemic-disclaimer language in a model's answer text (output-only, no model).

    Only the head window of the normalized answer is scanned. A STRONG phrase counts unless an
    affirmation guard suppresses it; SOFT phrases count only when no STRONG phrase survives.

    Args:
        text: The answer text. Non-string or blank input yields the "no hedge" certificate.

    Returns:
        A certificate dict::

            {"hedge": bool,          # True for a strong or soft hedge
             "strength": "strong" | "soft" | "none",
             "off_map": bool,        # True only for a strong hedge
             "band": "off-map" | "uncertain" | "grounded",
             "hedge_score": float,   # strong: 0.80..0.99, soft: 0.45..0.78, none: 0.0
             "matched": [str, ...],  # strong phrases if any survived, else the soft phrases
             "guarded": [str, ...]}  # every affirmation guard found in the head window
    """
    normalized = _norm(text)
    if not normalized:
        # Fail open: nothing readable means "grounded / no hedge", never an invented signal.
        return {"hedge": False, "strength": "none", "off_map": False, "band": "grounded",
                "hedge_score": 0.0, "matched": [], "guarded": []}

    window = _head(normalized)
    guard_hits = _matches(window, _STRONG_GUARDS)
    strong_hits = [
        phrase
        for phrase in _matches(window, _STRONG_PHRASES)
        if not _guarded(phrase, window, guard_hits)
    ]
    soft_hits = _matches(window, _SOFT_PHRASES)

    if strong_hits:
        # Base 0.80, +0.06 per extra strong phrase, +0.03 per soft phrase, capped at 0.99.
        strength, band, fired = "strong", "off-map", strong_hits
        score = round(min(0.99, 0.80 + 0.06 * (len(strong_hits) - 1) + 0.03 * len(soft_hits)), 3)
    elif soft_hits:
        # Base 0.45, +0.08 per extra soft phrase, capped below the strong tier at 0.78.
        strength, band, fired = "soft", "uncertain", soft_hits
        score = round(min(0.78, 0.45 + 0.08 * (len(soft_hits) - 1)), 3)
    else:
        strength, band, fired, score = "none", "grounded", [], 0.0

    return {
        "hedge": strength != "none",
        "strength": strength,
        "off_map": strength == "strong",
        "band": band,
        "hedge_score": score,
        "matched": fired,
        "guarded": guard_hits,
    }
|
|
|
|
def _guarded(phrase: str, head: str, guards: list[str]) -> bool:
    """Decide whether a strong ``phrase`` hit is fully explained by affirmation guards.

    A guard is relevant only when it textually contains ``phrase`` (the guard "there is no doubt"
    contains the strong phrase "there is no"). The hit is suppressed when every occurrence of
    ``phrase`` in ``head`` lies inside an occurrence of *some* relevant guard. Coverage is pooled
    across guards, so "there is no doubt ... there is no question ..." is suppressed even though
    neither guard alone covers both occurrences.

    Args:
        phrase: A strong phrase that matched in ``head``.
        head: The normalized head window.
        guards: Affirmation guards found in ``head``.

    Returns:
        True if the phrase should be ignored; False if no guard contains it or at least one
        occurrence stands outside every relevant guard.
    """
    relevant = [g for g in guards if phrase in g]
    if not relevant:
        return False
    spans = [span for g in relevant for span in _occurrence_spans(head, g)]
    return _covered(phrase, spans, head)


def _all_occurrences_inside(phrase: str, guard: str, head: str) -> bool:
    """True iff every occurrence of ``phrase`` in ``head`` lies inside an occurrence of ``guard``.

    Vacuously True when ``phrase`` does not occur in ``head``.
    """
    return _covered(phrase, _occurrence_spans(head, guard), head)


def _occurrence_spans(s: str, sub: str) -> list[tuple[int, int]]:
    """Return the half-open ``(start, end)`` span of every occurrence of ``sub`` in ``s``."""
    width = len(sub)
    return [(start, start + width) for start in _find_all(s, sub)]


def _covered(phrase: str, spans: list[tuple[int, int]], head: str) -> bool:
    """True iff each occurrence of ``phrase`` in ``head`` fits entirely within one of ``spans``."""
    width = len(phrase)
    return all(
        any(lo <= start and start + width <= hi for lo, hi in spans)
        for start in _find_all(head, phrase)
    )


def _find_all(s: str, sub: str) -> list[int]:
    """Return the start index of every occurrence of ``sub`` in ``s``, overlapping ones included."""
    starts = []
    pos = s.find(sub)
    while pos != -1:
        starts.append(pos)
        # Advance by one character (not len(sub)) so overlapping occurrences are found too.
        pos = s.find(sub, pos + 1)
    return starts
|
|
|
|
if __name__ == "__main__":
    # Demo: print one JSON certificate per sample. The last sample is the first command-line
    # argument when one is given, otherwise a built-in refusal sentence.
    import json
    import sys

    if len(sys.argv) > 1:
        cli_sample = sys.argv[1]
    else:
        cli_sample = "I couldn't find any information on that film."

    demo_texts = (
        "There is no record of a company called Brindlewick Cabinetry. It may be fictional.",
        "Microsoft was founded by Bill Gates and Paul Allen in 1975.",
        "I think the capital might be around the coast, but I'm not entirely sure.",
        "There is no doubt that William Shakespeare wrote Hamlet.",
        cli_sample,
    )
    for sample in demo_texts:
        record = {"text": sample[:60]}  # preview of the input, truncated to 60 characters
        record.update(read_hedge(sample))
        print(json.dumps(record))
|
|