mbochniak01 Claude Sonnet 4.6 committed on
Commit
0ad5e39
·
1 Parent(s): 7ee90da

Replace ad-hoc refusal regexes with NOT IN DOCUMENTS sentinel

Browse files

Prompt now instructs the model to use 'NOT IN DOCUMENTS: ...' for any
out-of-scope response. _is_refusal() checks for this sentinel first —
one deterministic check instead of an open-ended regex list.

Fallback patterns are retained for responses that predate the instruction.
Aligns with the NOT IN DOCUMENTS pattern from knowledge/problems/keeps-hallucinating.md.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

backend/grader.py CHANGED
@@ -89,10 +89,12 @@ TOKEN_BUDGET = 512
89
  RELEVANCY_THRESHOLD = 0.45
90
  FAITHFULNESS_THRESHOLD = 0.35
91
 
92
- _REFUSAL_PATTERNS = re.compile(
 
 
 
 
93
  r"(i (don't|do not|cannot|can't|'m not able to) (have|find|provide|answer)|"
94
- r"not able to (answer|provide|help)|"
95
- r"(falls?|is) outside (of )?(the )?(scope|knowledge base)|"
96
  r"not enough (information|context)|"
97
  r"the (context|provided) (does not|doesn't) (contain|include|mention))",
98
  re.IGNORECASE,
@@ -100,7 +102,7 @@ _REFUSAL_PATTERNS = re.compile(
100
 
101
 
102
  def _is_refusal(response: str) -> bool:
103
- return bool(_REFUSAL_PATTERNS.search(response))
104
 
105
 
106
  def grade_pii_leakage(response: str) -> GradeResult:
 
89
  RELEVANCY_THRESHOLD = 0.45
90
  FAITHFULNESS_THRESHOLD = 0.35
91
 
92
+ _SENTINEL = "NOT IN DOCUMENTS"
93
+
94
+ # Fallback patterns for responses that predate the sentinel instruction or
95
+ # where the model ignores the sentinel format.
96
+ _REFUSAL_FALLBACK = re.compile(
97
  r"(i (don't|do not|cannot|can't|'m not able to) (have|find|provide|answer)|"
 
 
98
  r"not enough (information|context)|"
99
  r"the (context|provided) (does not|doesn't) (contain|include|mention))",
100
  re.IGNORECASE,
 
102
 
103
 
104
  def _is_refusal(response: str) -> bool:
105
+ return _SENTINEL in response.upper() or bool(_REFUSAL_FALLBACK.search(response))
106
 
107
 
108
  def grade_pii_leakage(response: str) -> GradeResult:
backend/pipeline.py CHANGED
@@ -39,7 +39,11 @@ MIN_RETRIEVAL_SCORE = 0.1
39
  SYSTEM_PROMPT = """\
40
  You are a helpful assistant for {client_display} ({domain} domain).
41
  Answer the user's question using only the information in the provided context.
42
- Be concise. If the context does not contain enough information to answer, say so clearly rather than speculating.
 
 
 
 
43
 
44
  You MUST use the following terminology. These are the only acceptable terms — do not substitute synonyms:
45
  {term_list}"""
 
39
  SYSTEM_PROMPT = """\
40
  You are a helpful assistant for {client_display} ({domain} domain).
41
  Answer the user's question using only the information in the provided context.
42
+ Be concise.
43
+
44
+ If the context does not contain enough information to answer, respond with exactly:
45
+ NOT IN DOCUMENTS: [one sentence explaining what information is missing]
46
+ Do not speculate, infer, or use knowledge outside the provided context.
47
 
48
  You MUST use the following terminology. These are the only acceptable terms — do not substitute synonyms:
49
  {term_list}"""
tests/unit/test_grader.py CHANGED
@@ -223,16 +223,16 @@ class TestGradeFaithfulnessDecomposed:
223
  assert result.metadata["claims"][0]["supported"] is True
224
  assert result.metadata["claims"][1]["supported"] is False
225
 
226
- def test_refusal_auto_passes(self) -> None:
227
  result = grade_faithfulness_decomposed(
228
- "I don't have enough information to answer that.", CONTEXT
229
  )
230
  assert result.passed is True
231
  assert result.score == 1.0
232
 
233
- def test_refusal_not_able_to_auto_passes(self) -> None:
234
  result = grade_faithfulness_decomposed(
235
- "I'm not able to answer that as it falls outside the knowledge base.", CONTEXT
236
  )
237
  assert result.passed is True
238
  assert result.score == 1.0
 
223
  assert result.metadata["claims"][0]["supported"] is True
224
  assert result.metadata["claims"][1]["supported"] is False
225
 
226
+ def test_refusal_sentinel_auto_passes(self) -> None:
227
  result = grade_faithfulness_decomposed(
228
+ "NOT IN DOCUMENTS: The context does not contain information about this drug.", CONTEXT
229
  )
230
  assert result.passed is True
231
  assert result.score == 1.0
232
 
233
+ def test_refusal_fallback_auto_passes(self) -> None:
234
  result = grade_faithfulness_decomposed(
235
+ "I don't have enough information to answer that.", CONTEXT
236
  )
237
  assert result.passed is True
238
  assert result.score == 1.0