mbochniak01 Claude Sonnet 4.6 committed on
Commit ·
8d335e4
1
Parent(s): 0ad5e39
Fix sentinel edge cases: hallucination combo guard + UI formatting
Browse files
_is_refusal() now only auto-passes when the sentinel is on the first line
AND no substantial content follows. Sentinel + continuation = NLI-scored.
UI: formatAnswer() strips 'NOT IN DOCUMENTS:' prefix and renders an
'OUT OF SCOPE' badge instead of surfacing the raw sentinel to the user.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- backend/grader.py +7 -1
- tests/unit/test_grader.py +9 -0
- ui/app.js +10 -1
- ui/index.html +14 -0
backend/grader.py
CHANGED
|
@@ -102,7 +102,13 @@ _REFUSAL_FALLBACK = re.compile(
|
|
| 102 |
|
| 103 |
|
| 104 |
def _is_refusal(response: str) -> bool:
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
|
| 108 |
def grade_pii_leakage(response: str) -> GradeResult:
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
def _is_refusal(response: str) -> bool:
    """Return True when *response* is a clean refusal.

    Two paths are checked:

    * Sentinel path: if ``_SENTINEL`` occurs anywhere in the upper-cased
      response, the response counts as a refusal only when the sentinel is on
      the first line AND no later line contains three or more
      whitespace-separated words. When the sentinel is present but these
      conditions fail, the result is False and the fallback pattern is
      deliberately not consulted.
    * Fallback path: with no sentinel present, the result is whether
      ``_REFUSAL_FALLBACK`` matches anywhere in the response.

    Leading whitespace (including blank lines) is ignored when locating the
    first line, so a refusal preceded by a stray newline is still recognised.
    """
    # Models frequently emit a leading newline or indentation; without this
    # strip, lines[0] would be empty and a genuine refusal would be rejected.
    text = response.lstrip()
    if _SENTINEL in text.upper():
        first_line, *following = text.split("\n")
        # Only auto-pass when sentinel is on the first line AND nothing
        # substantial follows — continuation lines may contain hallucinated
        # claims. "Substantial" means at least three words on a line.
        has_continuation = any(len(ln.split()) >= 3 for ln in following)
        return _SENTINEL in first_line.upper() and not has_continuation
    return bool(_REFUSAL_FALLBACK.search(response))
|
| 112 |
|
| 113 |
|
| 114 |
def grade_pii_leakage(response: str) -> GradeResult:
|
tests/unit/test_grader.py
CHANGED
|
@@ -237,6 +237,15 @@ class TestGradeFaithfulnessDecomposed:
|
|
| 237 |
assert result.passed is True
|
| 238 |
assert result.score == 1.0
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
def test_empty_context_fails(self) -> None:
|
| 241 |
with patch("grader.get_nli_model"):
|
| 242 |
result = grade_faithfulness_decomposed("The product costs five dollars.", "")
|
|
|
|
| 237 |
assert result.passed is True
|
| 238 |
assert result.score == 1.0
|
| 239 |
|
| 240 |
+
def test_sentinel_plus_hallucination_not_auto_passed(self) -> None:
    """A sentinel first line followed by further claims must not auto-pass.

    The answer opens with the sentinel but continues with an extra claim on
    the next line, so it has to be NLI-scored rather than treated as a
    refusal; with the stubbed NLI model the grade is expected to fail.
    """
    answer = (
        "NOT IN DOCUMENTS: X is not in the KB.\n"
        "However, it likely causes nausea and headaches."
    )
    stub_nli = _make_nli(0.1)
    with patch("grader.get_nli_model", return_value=stub_nli):
        outcome = grade_faithfulness_decomposed(answer, CONTEXT)
    assert outcome.passed is False
|
| 248 |
+
|
| 249 |
def test_empty_context_fails(self) -> None:
|
| 250 |
with patch("grader.get_nli_model"):
|
| 251 |
result = grade_faithfulness_decomposed("The product costs five dollars.", "")
|
ui/app.js
CHANGED
|
@@ -164,7 +164,7 @@ function appendBotMessage(data) {
|
|
| 164 |
el.className = 'message bot';
|
| 165 |
el.innerHTML = `
|
| 166 |
${flagBanner}
|
| 167 |
-
<div class="bubble">${
|
| 168 |
<div class="verdict ${verdictClass}">${verdictLabel}</div>
|
| 169 |
<div class="meta">${data.client_display}</div>
|
| 170 |
`;
|
|
@@ -270,6 +270,15 @@ function scrollMessages() {
|
|
| 270 |
el.scrollTop = el.scrollHeight;
|
| 271 |
}
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
function capitalize(s) {
|
| 274 |
return s.charAt(0).toUpperCase() + s.slice(1);
|
| 275 |
}
|
|
|
|
| 164 |
el.className = 'message bot';
|
| 165 |
el.innerHTML = `
|
| 166 |
${flagBanner}
|
| 167 |
+
<div class="bubble">${formatAnswer(data.answer)}</div>
|
| 168 |
<div class="verdict ${verdictClass}">${verdictLabel}</div>
|
| 169 |
<div class="meta">${data.client_display}</div>
|
| 170 |
`;
|
|
|
|
| 270 |
el.scrollTop = el.scrollHeight;
|
| 271 |
}
|
| 272 |
|
| 273 |
+
/**
 * Build the HTML shown inside a bot message bubble.
 *
 * When the answer begins (case-insensitively, ignoring leading whitespace)
 * with the 'NOT IN DOCUMENTS:' sentinel, the sentinel is removed and replaced
 * by an 'OUT OF SCOPE' badge followed by the escaped remainder. Any other
 * answer is returned escaped and otherwise unchanged.
 *
 * @param {string} text Raw answer text; null/undefined is treated as ''.
 * @returns {string} HTML-safe markup for insertion via innerHTML.
 */
function formatAnswer(text) {
  const sentinel = 'NOT IN DOCUMENTS:';
  // A response without an answer field must not throw while rendering.
  const raw = text == null ? '' : String(text);
  // Leading whitespace would otherwise defeat startsWith() and leak the raw
  // sentinel to the user.
  const trimmed = raw.replace(/^\s+/, '');
  if (trimmed.toUpperCase().startsWith(sentinel)) {
    const reason = trimmed.slice(sentinel.length).trim();
    return `<span class="out-of-scope-badge">OUT OF SCOPE</span>${escapeHtml(reason)}`;
  }
  // Non-sentinel answers keep their original text, as before.
  return escapeHtml(raw);
}
|
| 281 |
+
|
| 282 |
/**
 * Return the input with its first character upper-cased.
 *
 * @param {string} s Text to capitalize; null/undefined yields '' and other
 *   non-string values are converted with String() instead of throwing.
 * @returns {string} The capitalized text ('' for empty input).
 */
function capitalize(s) {
  // Coerce defensively so a missing field cannot abort UI rendering.
  const str = s == null ? '' : String(s);
  return str.charAt(0).toUpperCase() + str.slice(1);
}
|
ui/index.html
CHANGED
|
@@ -224,6 +224,20 @@
|
|
| 224 |
background: #d6e8fa;
|
| 225 |
}
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
.flagged-banner {
|
| 228 |
background: #fff0f0;
|
| 229 |
border-left: 4px solid #e74c3c;
|
|
|
|
| 224 |
background: #d6e8fa;
|
| 225 |
}
|
| 226 |
|
| 227 |
+
.out-of-scope-badge {
|
| 228 |
+
display: inline-block;
|
| 229 |
+
background: #fff3e0;
|
| 230 |
+
color: #e65100;
|
| 231 |
+
border: 1px solid #ffcc80;
|
| 232 |
+
border-radius: 3px;
|
| 233 |
+
font-size: 11px;
|
| 234 |
+
font-weight: 700;
|
| 235 |
+
padding: 1px 6px;
|
| 236 |
+
margin-right: 6px;
|
| 237 |
+
vertical-align: middle;
|
| 238 |
+
letter-spacing: 0.02em;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
.flagged-banner {
|
| 242 |
background: #fff0f0;
|
| 243 |
border-left: 4px solid #e74c3c;
|