below-threshold committed on
Commit
cd30e2d
·
1 Parent(s): aef9f0f

Faithfulness: mean sentence scoring, strip chunk title prefix, lower threshold to 0.35

Browse files
Files changed (2) hide show
  1. backend/grader.py +15 -7
  2. eval/bot-answers.json +426 -0
backend/grader.py CHANGED
@@ -82,7 +82,7 @@ _PII_PATTERNS = [
82
 
83
  TOKEN_BUDGET = 512
84
  RELEVANCY_THRESHOLD = 0.45
85
- FAITHFULNESS_THRESHOLD = 0.7
86
 
87
  _REFUSAL_PATTERNS = re.compile(
88
  r"(i (don't|do not|cannot|can't) (have|find|provide|answer)|"
@@ -134,18 +134,26 @@ def grade_answer_relevancy(query: str, response: str) -> GradeResult:
134
  )
135
 
136
 
 
 
 
 
 
 
 
137
  def grade_faithfulness(response: str, context: str) -> GradeResult:
138
- """Sentence-level NLI faithfulness: weakest sentence score wins."""
139
  if _is_refusal(response):
140
  return GradeResult(
141
  metric="faithfulness", passed=True, score=1.0,
142
  detail="Refusal — no factual claims to verify",
143
  )
144
  model = get_nli_model()
145
- chunks = [c.strip() for c in context.split("\n\n") if c.strip()]
146
- if not chunks:
147
  return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
148
- sentences = [s.strip() for s in response.split(". ") if len(s.split()) >= 8]
 
149
  if not sentences:
150
  sentences = [response]
151
  sentence_scores = []
@@ -155,13 +163,13 @@ def grade_faithfulness(response: str, context: str) -> GradeResult:
155
  chunk_scores = model.predict(pairs, apply_softmax=True)
156
  best = float(max(s[1] for s in chunk_scores))
157
  sentence_scores.append(best)
158
- score = min(sentence_scores)
159
  passed = score >= FAITHFULNESS_THRESHOLD
160
  return GradeResult(
161
  metric="faithfulness",
162
  passed=passed,
163
  score=score,
164
- detail=f"NLI min-sentence entailment {score:.3f} (threshold: {FAITHFULNESS_THRESHOLD})",
165
  metadata={"sentence_scores": [round(s, 3) for s in sentence_scores]},
166
  )
167
 
 
82
 
83
  TOKEN_BUDGET = 512
84
  RELEVANCY_THRESHOLD = 0.45
85
+ FAITHFULNESS_THRESHOLD = 0.35
86
 
87
  _REFUSAL_PATTERNS = re.compile(
88
  r"(i (don't|do not|cannot|can't) (have|find|provide|answer)|"
 
134
  )
135
 
136
 
137
+ def _strip_chunk_title(chunk: str) -> str:
138
+ """Remove [Title] prefix added by _build_context before NLI scoring."""
139
+ if chunk.startswith("[") and "]\n" in chunk:
140
+ return chunk.split("]\n", 1)[1].strip()
141
+ return chunk
142
+
143
+
144
  def grade_faithfulness(response: str, context: str) -> GradeResult:
145
+ """NLI faithfulness: mean sentence entailment against best-matching context chunk."""
146
  if _is_refusal(response):
147
  return GradeResult(
148
  metric="faithfulness", passed=True, score=1.0,
149
  detail="Refusal — no factual claims to verify",
150
  )
151
  model = get_nli_model()
152
+ raw_chunks = [c.strip() for c in context.split("\n\n") if c.strip()]
153
+ if not raw_chunks:
154
  return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
155
+ chunks = [_strip_chunk_title(c) for c in raw_chunks]
156
+ sentences = [s.strip() for s in response.split(". ") if len(s.split()) >= 6]
157
  if not sentences:
158
  sentences = [response]
159
  sentence_scores = []
 
163
  chunk_scores = model.predict(pairs, apply_softmax=True)
164
  best = float(max(s[1] for s in chunk_scores))
165
  sentence_scores.append(best)
166
+ score = sum(sentence_scores) / len(sentence_scores) # mean — fairer than min
167
  passed = score >= FAITHFULNESS_THRESHOLD
168
  return GradeResult(
169
  metric="faithfulness",
170
  passed=passed,
171
  score=score,
172
+ detail=f"NLI mean-sentence entailment {score:.3f} (threshold: {FAITHFULNESS_THRESHOLD})",
173
  metadata={"sentence_scores": [round(s, 3) for s in sentence_scores]},
174
  )
175
 
eval/bot-answers.json ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": 1,
4
+ "client": "novamart",
5
+ "question": "What happens when a product runs out of stock?",
6
+ "error": "timeout"
7
+ },
8
+ {
9
+ "id": 2,
10
+ "client": "novamart",
11
+ "question": "How do I onboard a new merchant?",
12
+ "answer": "The information doesn't provide details on merchant onboarding. I can only provide a general statement, as the context only discusses supplier setup.",
13
+ "overall": "FAIL",
14
+ "flagged": true,
15
+ "scores": {
16
+ "pii_leakage": 1.0,
17
+ "token_budget": 1.0,
18
+ "answer_relevancy": 0.724,
19
+ "faithfulness": 0.0,
20
+ "chain_terminology": 1.0
21
+ },
22
+ "passed": {
23
+ "pii_leakage": true,
24
+ "token_budget": true,
25
+ "answer_relevancy": true,
26
+ "faithfulness": false,
27
+ "chain_terminology": true
28
+ }
29
+ },
30
+ {
31
+ "id": 3,
32
+ "client": "novamart",
33
+ "question": "What is an availability scan and when does it trigger?",
34
+ "answer": "I cannot confirm information about what an availability scan is.",
35
+ "overall": "FAIL",
36
+ "flagged": true,
37
+ "scores": {
38
+ "pii_leakage": 1.0,
39
+ "token_budget": 1.0,
40
+ "answer_relevancy": 0.876,
41
+ "faithfulness": 0.0,
42
+ "chain_terminology": 1.0
43
+ },
44
+ "passed": {
45
+ "pii_leakage": true,
46
+ "token_budget": true,
47
+ "answer_relevancy": true,
48
+ "faithfulness": false,
49
+ "chain_terminology": true
50
+ }
51
+ },
52
+ {
53
+ "id": 4,
54
+ "client": "novamart",
55
+ "question": "How do I update pricing for an item in the item registry?",
56
+ "answer": "To update pricing for an item in the item registry, you must submit a price update (pricing sync) through the pricing portal. Changes greater than 15% of the current price require approval, while smaller changes can be approved automatically. Approved changes will go live at the next scheduled sync window, which runs every four hours.",
57
+ "overall": "FAIL",
58
+ "flagged": true,
59
+ "scores": {
60
+ "pii_leakage": 1.0,
61
+ "token_budget": 1.0,
62
+ "answer_relevancy": 0.839,
63
+ "faithfulness": 0.0,
64
+ "chain_terminology": 1.0
65
+ },
66
+ "passed": {
67
+ "pii_leakage": true,
68
+ "token_budget": true,
69
+ "answer_relevancy": true,
70
+ "faithfulness": false,
71
+ "chain_terminology": true
72
+ }
73
+ },
74
+ {
75
+ "id": 5,
76
+ "client": "novamart",
77
+ "question": "What is a capability switch and how do I enable one?",
78
+ "answer": "A capability switch is also referred to as a feature flag. To enable one in production, you will need to get sign-off from both the product and engineering lead.",
79
+ "overall": "PASS",
80
+ "flagged": false,
81
+ "scores": {
82
+ "pii_leakage": 1.0,
83
+ "token_budget": 1.0,
84
+ "answer_relevancy": 0.799,
85
+ "faithfulness": 0.889,
86
+ "chain_terminology": 1.0
87
+ },
88
+ "passed": {
89
+ "pii_leakage": true,
90
+ "token_budget": true,
91
+ "answer_relevancy": true,
92
+ "faithfulness": true,
93
+ "chain_terminology": true
94
+ }
95
+ },
96
+ {
97
+ "id": 6,
98
+ "client": "shelfwise",
99
+ "question": "What triggers an out-of-stock alert?",
100
+ "answer": "A product's on-hand quantity dropping below its configured reorder point triggers an out-of-stock alert.",
101
+ "overall": "FAIL",
102
+ "flagged": true,
103
+ "scores": {
104
+ "pii_leakage": 1.0,
105
+ "token_budget": 1.0,
106
+ "answer_relevancy": 0.677,
107
+ "faithfulness": 0.026,
108
+ "chain_terminology": 1.0
109
+ },
110
+ "passed": {
111
+ "pii_leakage": true,
112
+ "token_budget": true,
113
+ "answer_relevancy": true,
114
+ "faithfulness": false,
115
+ "chain_terminology": true
116
+ }
117
+ },
118
+ {
119
+ "id": 7,
120
+ "client": "shelfwise",
121
+ "question": "How does planogram compliance work?",
122
+ "answer": "I couldn't find any information in the provided context about planogram compliance. It seems that the context only covers compliance reporting, store configuration, and supplier setup.",
123
+ "overall": "FAIL",
124
+ "flagged": true,
125
+ "scores": {
126
+ "pii_leakage": 1.0,
127
+ "token_budget": 1.0,
128
+ "answer_relevancy": 0.791,
129
+ "faithfulness": 0.0,
130
+ "chain_terminology": 1.0
131
+ },
132
+ "passed": {
133
+ "pii_leakage": true,
134
+ "token_budget": true,
135
+ "answer_relevancy": true,
136
+ "faithfulness": false,
137
+ "chain_terminology": true
138
+ }
139
+ },
140
+ {
141
+ "id": 8,
142
+ "client": "shelfwise",
143
+ "question": "How do I add a new store to the system?",
144
+ "answer": "Unfortunately, the provided context does not contain the information on how to add a new store to the system.",
145
+ "overall": "FAIL",
146
+ "flagged": true,
147
+ "scores": {
148
+ "pii_leakage": 1.0,
149
+ "token_budget": 1.0,
150
+ "answer_relevancy": 0.849,
151
+ "faithfulness": 0.0,
152
+ "chain_terminology": 1.0
153
+ },
154
+ "passed": {
155
+ "pii_leakage": true,
156
+ "token_budget": true,
157
+ "answer_relevancy": true,
158
+ "faithfulness": false,
159
+ "chain_terminology": true
160
+ }
161
+ },
162
+ {
163
+ "id": 9,
164
+ "client": "shelfwise",
165
+ "question": "How do I generate a compliance report for a store?",
166
+ "answer": "To generate a compliance report for a store, access must be granted to a user with the Audit role or higher.",
167
+ "overall": "FAIL",
168
+ "flagged": true,
169
+ "scores": {
170
+ "pii_leakage": 1.0,
171
+ "token_budget": 1.0,
172
+ "answer_relevancy": 0.759,
173
+ "faithfulness": 0.038,
174
+ "chain_terminology": 1.0
175
+ },
176
+ "passed": {
177
+ "pii_leakage": true,
178
+ "token_budget": true,
179
+ "answer_relevancy": true,
180
+ "faithfulness": false,
181
+ "chain_terminology": true
182
+ }
183
+ },
184
+ {
185
+ "id": 10,
186
+ "client": "shelfwise",
187
+ "question": "What happens if a feature toggle is disabled mid-deployment?",
188
+ "answer": "The context does not specify what happens if a feature toggle is disabled mid-deployment.",
189
+ "overall": "FAIL",
190
+ "flagged": true,
191
+ "scores": {
192
+ "pii_leakage": 1.0,
193
+ "token_budget": 1.0,
194
+ "answer_relevancy": 0.887,
195
+ "faithfulness": 0.0,
196
+ "chain_terminology": 1.0
197
+ },
198
+ "passed": {
199
+ "pii_leakage": true,
200
+ "token_budget": true,
201
+ "answer_relevancy": true,
202
+ "faithfulness": false,
203
+ "chain_terminology": true
204
+ }
205
+ },
206
+ {
207
+ "id": 11,
208
+ "client": "clinixone",
209
+ "question": "What is prior authorization and when is it required?",
210
+ "answer": "Prior authorization, also known as formulary pre-approval, is a requirement by a payer that a prescriber obtain approval before a specific drug is dispensed and covered. It is needed when a payer wants the prescriber to submit clinical justification for the specific drug.",
211
+ "overall": "FAIL",
212
+ "flagged": true,
213
+ "scores": {
214
+ "pii_leakage": 1.0,
215
+ "token_budget": 1.0,
216
+ "answer_relevancy": 0.735,
217
+ "faithfulness": 0.021,
218
+ "chain_terminology": 1.0
219
+ },
220
+ "passed": {
221
+ "pii_leakage": true,
222
+ "token_budget": true,
223
+ "answer_relevancy": true,
224
+ "faithfulness": false,
225
+ "chain_terminology": true
226
+ }
227
+ },
228
+ {
229
+ "id": 12,
230
+ "client": "clinixone",
231
+ "question": "What are contraindicated drug combinations and how are they flagged?",
232
+ "answer": "Contraindicated drug combinations occur when one drug affects the activity of another when both are administered together. \n\nThese combinations are flagged by alerting prescribers at point-of-care when a contraindicated combination is entered. The severity classification of interactions follows the clinical pharmacology guidelines maintained in the formulary interaction database.",
233
+ "overall": "FAIL",
234
+ "flagged": true,
235
+ "scores": {
236
+ "pii_leakage": 1.0,
237
+ "token_budget": 1.0,
238
+ "answer_relevancy": 0.802,
239
+ "faithfulness": 0.003,
240
+ "chain_terminology": 1.0
241
+ },
242
+ "passed": {
243
+ "pii_leakage": true,
244
+ "token_budget": true,
245
+ "answer_relevancy": true,
246
+ "faithfulness": false,
247
+ "chain_terminology": true
248
+ }
249
+ },
250
+ {
251
+ "id": 13,
252
+ "client": "clinixone",
253
+ "question": "How should adverse events be reported?",
254
+ "answer": "Adverse events must be reported to the regulatory authority within 15 days for serious unexpected events and 90 days for expected events.",
255
+ "overall": "FAIL",
256
+ "flagged": true,
257
+ "scores": {
258
+ "pii_leakage": 1.0,
259
+ "token_budget": 1.0,
260
+ "answer_relevancy": 0.676,
261
+ "faithfulness": 0.117,
262
+ "chain_terminology": 1.0
263
+ },
264
+ "passed": {
265
+ "pii_leakage": true,
266
+ "token_budget": true,
267
+ "answer_relevancy": true,
268
+ "faithfulness": false,
269
+ "chain_terminology": true
270
+ }
271
+ },
272
+ {
273
+ "id": 14,
274
+ "client": "clinixone",
275
+ "question": "What is a safety signal and how is it escalated?",
276
+ "answer": "A safety signal (pharmacovigilance alert) is a pattern of adverse events that suggests a previously unknown or incompletely documented causal relationship between a drug and an outcome. Adverse events must be reported to the regulatory authority within 15 days for serious unexpected events and 90 days for expected events.",
277
+ "overall": "FAIL",
278
+ "flagged": true,
279
+ "scores": {
280
+ "pii_leakage": 1.0,
281
+ "token_budget": 1.0,
282
+ "answer_relevancy": 0.644,
283
+ "faithfulness": 0.001,
284
+ "chain_terminology": 1.0
285
+ },
286
+ "passed": {
287
+ "pii_leakage": true,
288
+ "token_budget": true,
289
+ "answer_relevancy": true,
290
+ "faithfulness": false,
291
+ "chain_terminology": true
292
+ }
293
+ },
294
+ {
295
+ "id": 15,
296
+ "client": "clinixone",
297
+ "question": "Which drugs are contraindicated in patients with bleeding disorders?",
298
+ "answer": "The context does not mention any contraindication for patients with bleeding disorders.",
299
+ "overall": "PASS",
300
+ "flagged": false,
301
+ "scores": {
302
+ "pii_leakage": 1.0,
303
+ "token_budget": 1.0,
304
+ "answer_relevancy": 0.665,
305
+ "faithfulness": 1.0,
306
+ "chain_terminology": 1.0
307
+ },
308
+ "passed": {
309
+ "pii_leakage": true,
310
+ "token_budget": true,
311
+ "answer_relevancy": true,
312
+ "faithfulness": true,
313
+ "chain_terminology": true
314
+ }
315
+ },
316
+ {
317
+ "id": 16,
318
+ "client": "clinixone",
319
+ "question": "What adverse events are associated with drugs for allergy relief?",
320
+ "answer": "There is no information about drugs specifically for allergy relief in the context provided.",
321
+ "overall": "FAIL",
322
+ "flagged": true,
323
+ "scores": {
324
+ "pii_leakage": 1.0,
325
+ "token_budget": 1.0,
326
+ "answer_relevancy": 0.741,
327
+ "faithfulness": 0.0,
328
+ "chain_terminology": 1.0
329
+ },
330
+ "passed": {
331
+ "pii_leakage": true,
332
+ "token_budget": true,
333
+ "answer_relevancy": true,
334
+ "faithfulness": false,
335
+ "chain_terminology": true
336
+ }
337
+ },
338
+ {
339
+ "id": 17,
340
+ "client": "clinixone",
341
+ "question": "What drugs cause fatigue as a known adverse event?",
342
+ "answer": "Amoxstatin62, Amoxzole72, and Mecozole85 cause fatigue as a known adverse event.",
343
+ "overall": "FAIL",
344
+ "flagged": true,
345
+ "scores": {
346
+ "pii_leakage": 1.0,
347
+ "token_budget": 1.0,
348
+ "answer_relevancy": 0.715,
349
+ "faithfulness": 0.0,
350
+ "chain_terminology": 1.0
351
+ },
352
+ "passed": {
353
+ "pii_leakage": true,
354
+ "token_budget": true,
355
+ "answer_relevancy": true,
356
+ "faithfulness": false,
357
+ "chain_terminology": true
358
+ }
359
+ },
360
+ {
361
+ "id": 18,
362
+ "client": "pharmalink",
363
+ "question": "What is formulary pre-approval?",
364
+ "answer": "Formulary pre-approval is a prior authorization requirement by a payer that a prescriber obtain approval before a specific drug is dispensed and covered.",
365
+ "overall": "FAIL",
366
+ "flagged": true,
367
+ "scores": {
368
+ "pii_leakage": 1.0,
369
+ "token_budget": 1.0,
370
+ "answer_relevancy": 0.869,
371
+ "faithfulness": 0.003,
372
+ "chain_terminology": 1.0
373
+ },
374
+ "passed": {
375
+ "pii_leakage": true,
376
+ "token_budget": true,
377
+ "answer_relevancy": true,
378
+ "faithfulness": false,
379
+ "chain_terminology": true
380
+ }
381
+ },
382
+ {
383
+ "id": 19,
384
+ "client": "pharmalink",
385
+ "question": "What is a pharmacovigilance alert and when does it fire?",
386
+ "answer": "A pharmacovigilance alert, also known as a safety signal, is a pattern of adverse events that suggests a previously unknown or incompletely documented causal relationship between a drug and an outcome. \n\nIt suggests a possible causal relationship but does not confirm one.",
387
+ "overall": "FAIL",
388
+ "flagged": true,
389
+ "scores": {
390
+ "pii_leakage": 1.0,
391
+ "token_budget": 1.0,
392
+ "answer_relevancy": 0.811,
393
+ "faithfulness": 0.005,
394
+ "chain_terminology": 1.0
395
+ },
396
+ "passed": {
397
+ "pii_leakage": true,
398
+ "token_budget": true,
399
+ "answer_relevancy": true,
400
+ "faithfulness": false,
401
+ "chain_terminology": true
402
+ }
403
+ },
404
+ {
405
+ "id": 20,
406
+ "client": "pharmalink",
407
+ "question": "Which drugs have a prescriber warning about avoiding alcohol?",
408
+ "answer": "There's no information in the provided context regarding a prescriber warning about avoiding alcohol for any of the mentioned drugs (Zylozole96, Zylozepam79, Zylozole29).",
409
+ "overall": "FAIL",
410
+ "flagged": true,
411
+ "scores": {
412
+ "pii_leakage": 1.0,
413
+ "token_budget": 1.0,
414
+ "answer_relevancy": 0.792,
415
+ "faithfulness": 0.0,
416
+ "chain_terminology": 1.0
417
+ },
418
+ "passed": {
419
+ "pii_leakage": true,
420
+ "token_budget": true,
421
+ "answer_relevancy": true,
422
+ "faithfulness": false,
423
+ "chain_terminology": true
424
+ }
425
+ }
426
+ ]