mbochniak01 Claude Sonnet 4.6 committed on
Commit ·
5935cf6
1
Parent(s): e77a2f2
Fix Vectara label check and input format
Browse files

Two bugs:
1. Label check used 'in' so "inconsistent" matched "consistent" — always returned
raw score regardless of label. Fixed with startswith("factually consistent").
2. text_pair dict format causes T5 tokenizer to encode pairs incorrectly. Switched
to single concatenated string: "{chunk} {response}". T5 handles one sequence
cleanly; text_pair semantics are BERT-specific.
Also added debug logging of raw Vectara output for future calibration.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- Dockerfile +2 -1
- backend/grader.py +6 -4
Dockerfile
CHANGED
|
@@ -16,7 +16,8 @@ from sentence_transformers import SentenceTransformer; \
|
|
| 16 |
from transformers import T5Tokenizer, pipeline; \
|
| 17 |
SentenceTransformer('all-MiniLM-L6-v2'); \
|
| 18 |
tok = T5Tokenizer.from_pretrained('t5-small'); \
|
| 19 |
-
pipeline('text-classification', model='vectara/hallucination_evaluation_model', tokenizer=tok, trust_remote_code=True)
|
|
|
|
| 20 |
|
| 21 |
COPY knowledge/ ./knowledge/
|
| 22 |
COPY backend/ ./backend/
|
|
|
|
| 16 |
from transformers import T5Tokenizer, pipeline; \
|
| 17 |
SentenceTransformer('all-MiniLM-L6-v2'); \
|
| 18 |
tok = T5Tokenizer.from_pretrained('t5-small'); \
|
| 19 |
+
pipe = pipeline('text-classification', model='vectara/hallucination_evaluation_model', tokenizer=tok, trust_remote_code=True); \
|
| 20 |
+
pipe(['test document test response'])"
|
| 21 |
|
| 22 |
COPY knowledge/ ./knowledge/
|
| 23 |
COPY backend/ ./backend/
|
backend/grader.py
CHANGED
|
@@ -163,11 +163,13 @@ def grade_faithfulness(response: str, context: str) -> GradeResult:
|
|
| 163 |
if not raw_chunks:
|
| 164 |
return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
|
| 165 |
chunks = [_strip_chunk_title(c) for c in raw_chunks]
|
| 166 |
-
# Vectara HHEM v2:
|
| 167 |
-
|
| 168 |
-
|
|
|
|
|
|
|
| 169 |
scores = [
|
| 170 |
-
r["score"] if
|
| 171 |
for r in results
|
| 172 |
]
|
| 173 |
score = float(max(scores))
|
|
|
|
| 163 |
if not raw_chunks:
|
| 164 |
return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
|
| 165 |
chunks = [_strip_chunk_title(c) for c in raw_chunks]
|
| 166 |
+
# Vectara HHEM v2: single concatenated string avoids T5 text_pair encoding issues.
|
| 167 |
+
# Label "Factually Consistent" = faithful; use startswith to avoid "inconsistent" false match.
|
| 168 |
+
inputs = [f"{chunk} {response}" for chunk in chunks]
|
| 169 |
+
results = model(inputs)
|
| 170 |
+
log.debug("Vectara raw results: %s", results)
|
| 171 |
scores = [
|
| 172 |
+
r["score"] if r["label"].lower().startswith("factually consistent") else 1.0 - r["score"]
|
| 173 |
for r in results
|
| 174 |
]
|
| 175 |
score = float(max(scores))
|