Commit ·
7b3dadd
1
Parent(s): 6e6032f
Fix faithfulness: score per chunk, take max entailment
Browse files - backend/grader.py +8 -3
backend/grader.py
CHANGED
|
@@ -124,11 +124,16 @@ def grade_answer_relevancy(query: str, response: str) -> GradeResult:
|
|
| 124 |
|
| 125 |
|
| 126 |
def grade_faithfulness(response: str, context: str) -> GradeResult:
|
| 127 |
-
"""Score NLI entailment
|
| 128 |
model = get_nli_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
# NLI label order for cross-encoder/nli-deberta-v3-small: contradiction, entailment, neutral
|
| 130 |
-
|
| 131 |
-
|
|
|
|
| 132 |
passed = entailment_score >= FAITHFULNESS_THRESHOLD
|
| 133 |
return GradeResult(
|
| 134 |
metric="faithfulness",
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
def grade_faithfulness(response: str, context: str) -> GradeResult:
|
| 127 |
+
"""Score NLI entailment per context chunk; pass if any chunk entails the response."""
|
| 128 |
model = get_nli_model()
|
| 129 |
+
# Split on the double-newline separator used by _build_context in pipeline.py
|
| 130 |
+
chunks = [c.strip() for c in context.split("\n\n") if c.strip()]
|
| 131 |
+
if not chunks:
|
| 132 |
+
return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
|
| 133 |
# NLI label order for cross-encoder/nli-deberta-v3-small: contradiction, entailment, neutral
|
| 134 |
+
pairs = [(chunk, response) for chunk in chunks]
|
| 135 |
+
all_scores = model.predict(pairs, apply_softmax=True)
|
| 136 |
+
entailment_score = float(max(s[1] for s in all_scores))
|
| 137 |
passed = entailment_score >= FAITHFULNESS_THRESHOLD
|
| 138 |
return GradeResult(
|
| 139 |
metric="faithfulness",
|