mbochniak01 Claude Sonnet 4.6 committed on
Commit
8d335e4
·
1 Parent(s): 0ad5e39

Fix sentinel edge cases: hallucination combo guard + UI formatting

Browse files

When the sentinel is present, _is_refusal() now auto-passes only if the sentinel is on
the first line AND no later line has three or more words. Sentinel + such a continuation = NLI-scored.

UI: formatAnswer() strips 'NOT IN DOCUMENTS:' prefix and renders an
'OUT OF SCOPE' badge instead of surfacing the raw sentinel to the user.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (4) hide show
  1. backend/grader.py +7 -1
  2. tests/unit/test_grader.py +9 -0
  3. ui/app.js +10 -1
  4. ui/index.html +14 -0
backend/grader.py CHANGED
@@ -102,7 +102,13 @@ _REFUSAL_FALLBACK = re.compile(
102
 
103
 
104
  def _is_refusal(response: str) -> bool:
105
- return _SENTINEL in response.upper() or bool(_REFUSAL_FALLBACK.search(response))
 
 
 
 
 
 
106
 
107
 
108
  def grade_pii_leakage(response: str) -> GradeResult:
 
102
 
103
 
104
  def _is_refusal(response: str) -> bool:
105
+ if _SENTINEL in response.upper():
106
+ lines = response.split("\n")
107
+ # Only auto-pass when sentinel is on the first line AND nothing substantial
108
+ # follows — continuation lines may contain hallucinated claims.
109
+ has_continuation = any(len(ln.split()) >= 3 for ln in lines[1:])
110
+ return _SENTINEL in lines[0].upper() and not has_continuation
111
+ return bool(_REFUSAL_FALLBACK.search(response))
112
 
113
 
114
  def grade_pii_leakage(response: str) -> GradeResult:
tests/unit/test_grader.py CHANGED
@@ -237,6 +237,15 @@ class TestGradeFaithfulnessDecomposed:
237
  assert result.passed is True
238
  assert result.score == 1.0
239
 
 
 
 
 
 
 
 
 
 
240
  def test_empty_context_fails(self) -> None:
241
  with patch("grader.get_nli_model"):
242
  result = grade_faithfulness_decomposed("The product costs five dollars.", "")
 
237
  assert result.passed is True
238
  assert result.score == 1.0
239
 
240
+ def test_sentinel_plus_hallucination_not_auto_passed(self) -> None:
241
+ # Sentinel on first line but additional claims follow — must be NLI-scored.
242
+ with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
243
+ result = grade_faithfulness_decomposed(
244
+ "NOT IN DOCUMENTS: X is not in the KB.\nHowever, it likely causes nausea and headaches.",
245
+ CONTEXT,
246
+ )
247
+ assert result.passed is False
248
+
249
  def test_empty_context_fails(self) -> None:
250
  with patch("grader.get_nli_model"):
251
  result = grade_faithfulness_decomposed("The product costs five dollars.", "")
ui/app.js CHANGED
@@ -164,7 +164,7 @@ function appendBotMessage(data) {
164
  el.className = 'message bot';
165
  el.innerHTML = `
166
  ${flagBanner}
167
- <div class="bubble">${escapeHtml(data.answer)}</div>
168
  <div class="verdict ${verdictClass}">${verdictLabel}</div>
169
  <div class="meta">${data.client_display}</div>
170
  `;
@@ -270,6 +270,15 @@ function scrollMessages() {
270
  el.scrollTop = el.scrollHeight;
271
  }
272
 
 
 
 
 
 
 
 
 
 
273
  function capitalize(s) {
274
  return s.charAt(0).toUpperCase() + s.slice(1);
275
  }
 
164
  el.className = 'message bot';
165
  el.innerHTML = `
166
  ${flagBanner}
167
+ <div class="bubble">${formatAnswer(data.answer)}</div>
168
  <div class="verdict ${verdictClass}">${verdictLabel}</div>
169
  <div class="meta">${data.client_display}</div>
170
  `;
 
270
  el.scrollTop = el.scrollHeight;
271
  }
272
 
273
+ function formatAnswer(text) {
274
+ const sentinel = 'NOT IN DOCUMENTS:';
275
+ if (text.toUpperCase().startsWith(sentinel)) {
276
+ const reason = text.slice(sentinel.length).trim();
277
+ return `<span class="out-of-scope-badge">OUT OF SCOPE</span>${escapeHtml(reason)}`;
278
+ }
279
+ return escapeHtml(text);
280
+ }
281
+
282
  function capitalize(s) {
283
  return s.charAt(0).toUpperCase() + s.slice(1);
284
  }
ui/index.html CHANGED
@@ -224,6 +224,20 @@
224
  background: #d6e8fa;
225
  }
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  .flagged-banner {
228
  background: #fff0f0;
229
  border-left: 4px solid #e74c3c;
 
224
  background: #d6e8fa;
225
  }
226
 
227
+ .out-of-scope-badge {
228
+ display: inline-block;
229
+ background: #fff3e0;
230
+ color: #e65100;
231
+ border: 1px solid #ffcc80;
232
+ border-radius: 3px;
233
+ font-size: 11px;
234
+ font-weight: 700;
235
+ padding: 1px 6px;
236
+ margin-right: 6px;
237
+ vertical-align: middle;
238
+ letter-spacing: 0.02em;
239
+ }
240
+
241
  .flagged-banner {
242
  background: #fff0f0;
243
  border-left: 4px solid #e74c3c;