| """Self-refusal detection (regex-based). |
| |
| Used as the post-gate in approach E: after the LLM produces a response, scan |
| its opening characters (`detect_refusal` defaults to the first 400) for |
| explicit-refusal patterns. If found, the model is |
| admitting it doesn't have the requested fact -> we either retrieve from |
| external corpus or surface the refusal honestly. |
| |
| The patterns target what R1-distill / Qwen-family chat models actually emit |
| when they encounter post-cutoff or fictional-entity questions under a strict |
| system prompt: |
| "I am sorry, I cannot answer that" |
| "I don't have information on..." |
| "That is after my training cutoff" |
| "As of my last update..." |
| "I'm not sure..." |
| |
| Tested refusal-trigger rate (4 out-of-corpus unknowns): |
| V0 loose prompt: 25% |
| V1 strict prompt: 50% |
| V2 cutoff-aware prompt: 75% <- recommended (see RECOMMENDED_SYSTEM) |
| V3 combined (strict + cutoff): 50% (over-specified, worse) |
| |
| Residual failure: topic-hijack cases (e.g. "Who won the 2024 Nobel Physics?" |
| gets rewritten to a fabricated 1986/1998 Nobel answer). Prompt engineering |
| alone cannot fully suppress this. Mitigations: short-latency heuristic, |
| topic-blocklist, answer-verification pass. |
| """ |
|
|
| import re |
|
|
|
|
# Explicit self-refusal phrases, matched case-insensitively.
#
# Apostrophes are written as ['\u2019]? so that both the ASCII form ("don't")
# and the typographic form ("don’t", U+2019) are recognised, as well as the
# apostrophe-less spelling the original pattern already tolerated ("dont").
# Additions use non-capturing groups only, so the numbering of the existing
# capture groups is unchanged.
_REFUSAL_RE = re.compile(
    # "I don't know", "I cannot answer", "I am unable to provide", ...
    # "am unable" optionally takes "to" so the natural phrasing
    # "I am unable to <verb>" matches; without it the verb would have to
    # follow "unable" directly.
    r"\b(I (am sorry|don['\u2019]?t|do not|am unable(?:\s+to)?|cannot|can['\u2019]?t)\s+"
    r"(know|have|recall|remember|answer|provide|find|see))"
    # "I'm not sure", "I am not aware", ...
    r"|\b(I(?:['\u2019]?m| am) not (sure|aware|certain))"
    # "no information on", "no data about", ...
    r"|\b(no (information|details|data) (on|about|regarding))"
    # References to the model's own training / knowledge horizon.
    r"|\b(as of my (training|last update|knowledge)|my training data|training cutoff)"
    r"|\bcannot answer that\b"
    # Subject-less variant, e.g. "we don't have data".
    r"|\bdon['\u2019]?t have (information|details|data)\b"
    r"|\bafter my (training|knowledge) cutoff\b"
    r"|\bI lack (information|data)\b",
    re.IGNORECASE,
)


def detect_refusal(text: str, scan_chars: int = 400) -> bool:
    """Return True if the start of `text` contains an explicit self-refusal phrase.

    Only the slice `text[:scan_chars]` is searched, so a refusal phrase that
    begins after that prefix is not reported.

    Args:
        text: Model response to inspect. A falsy value (empty string or
            None) is treated as "no refusal".
        scan_chars: Number of leading characters of `text` to search.

    Returns:
        True when the refusal pattern matches anywhere in the scanned prefix,
        False otherwise.
    """
    if not text:
        return False
    prefix = text[:scan_chars]
    return _REFUSAL_RE.search(prefix) is not None
|
|
|
|
| |
| |
# System prompt that states a March 2024 knowledge cutoff and instructs the
# model to reply with the fixed sentence "That is after my training cutoff."
# for anything later. Laid out one sentence per literal; adjacent literals are
# concatenated into a single string.
RECOMMENDED_SYSTEM = (
    "You are a helpful assistant. "
    "Your knowledge cutoff is March 2024. "
    "For any question about events, people, or facts after March 2024, "
    "you MUST explicitly say 'That is after my training cutoff.' "
    "Do not guess. "
    "Do not fabricate. "
    "If a Context is provided, use it to answer."
)
|
|