mbochniak01 Claude Sonnet 4.6 committed on
Commit ·
a42a9e0
1
Parent(s): 1ea03d4
Load Vectara model via transformers pipeline, not CrossEncoder
Browse files
CrossEncoder conflicts with HHEMv2's custom architecture on newer
transformers (tied weights resolution mismatch). Vectara's own docs
recommend transformers.pipeline("text-classification").
Pipeline output: {"label": "Factually Consistent", "score": float}
Normalise: if label is inconsistent, faithfulness = 1 - score.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- Dockerfile +3 -2
- backend/grader.py +21 -9
Dockerfile
CHANGED
|
@@ -12,9 +12,10 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 12 |
|
| 13 |
# Pre-download models so first request isn't slow on HF Spaces
|
| 14 |
RUN python -c "\
|
| 15 |
-
from sentence_transformers import SentenceTransformer
|
|
|
|
| 16 |
SentenceTransformer('all-MiniLM-L6-v2'); \
|
| 17 |
-
|
| 18 |
|
| 19 |
COPY knowledge/ ./knowledge/
|
| 20 |
COPY backend/ ./backend/
|
|
|
|
| 12 |
|
| 13 |
# Pre-download models so first request isn't slow on HF Spaces
|
| 14 |
RUN python -c "\
|
| 15 |
+
from sentence_transformers import SentenceTransformer; \
|
| 16 |
+
from transformers import pipeline; \
|
| 17 |
SentenceTransformer('all-MiniLM-L6-v2'); \
|
| 18 |
+
pipeline('text-classification', model='vectara/hallucination_evaluation_model', trust_remote_code=True)"
|
| 19 |
|
| 20 |
COPY knowledge/ ./knowledge/
|
| 21 |
COPY backend/ ./backend/
|
backend/grader.py
CHANGED
|
@@ -5,16 +5,18 @@ Metrics:
|
|
| 5 |
pii_leakage — regex scan for PII patterns in response
|
| 6 |
token_budget — response within allowed token ceiling
|
| 7 |
answer_relevancy — cosine similarity between query and response embeddings
|
| 8 |
-
faithfulness —
|
| 9 |
chain_terminology — deterministic: client-specific terms used (via RosettaStone)
|
| 10 |
"""
|
| 11 |
|
| 12 |
import re
|
| 13 |
import logging
|
| 14 |
from dataclasses import dataclass, field
|
|
|
|
| 15 |
|
| 16 |
-
from sentence_transformers import SentenceTransformer
|
| 17 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 18 |
|
| 19 |
from config import EMBEDDER_MODEL
|
| 20 |
from rosetta import check_terminology
|
|
@@ -22,7 +24,7 @@ from rosetta import check_terminology
|
|
| 22 |
log = logging.getLogger(__name__)
|
| 23 |
|
| 24 |
_embedder: SentenceTransformer | None = None
|
| 25 |
-
_nli_model:
|
| 26 |
|
| 27 |
NLI_MODEL = "vectara/hallucination_evaluation_model"
|
| 28 |
|
|
@@ -35,11 +37,17 @@ def get_embedder() -> SentenceTransformer:
|
|
| 35 |
return _embedder
|
| 36 |
|
| 37 |
|
| 38 |
-
def get_nli_model() ->
|
| 39 |
-
"""Return the shared
|
| 40 |
global _nli_model
|
| 41 |
if _nli_model is None:
|
| 42 |
-
_nli_model =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
return _nli_model
|
| 44 |
|
| 45 |
|
|
@@ -153,9 +161,13 @@ def grade_faithfulness(response: str, context: str) -> GradeResult:
|
|
| 153 |
if not raw_chunks:
|
| 154 |
return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
|
| 155 |
chunks = [_strip_chunk_title(c) for c in raw_chunks]
|
| 156 |
-
# Vectara
|
| 157 |
-
pairs = [
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
score = float(max(scores))
|
| 160 |
passed = score >= FAITHFULNESS_THRESHOLD
|
| 161 |
return GradeResult(
|
|
|
|
| 5 |
pii_leakage — regex scan for PII patterns in response
|
| 6 |
token_budget — response within allowed token ceiling
|
| 7 |
answer_relevancy — cosine similarity between query and response embeddings
|
| 8 |
+
faithfulness — Vectara HHEM v2: RAG faithfulness probability per (doc, response) pair
|
| 9 |
chain_terminology — deterministic: client-specific terms used (via RosettaStone)
|
| 10 |
"""
|
| 11 |
|
| 12 |
import re
|
| 13 |
import logging
|
| 14 |
from dataclasses import dataclass, field
|
| 15 |
+
from typing import Any
|
| 16 |
|
| 17 |
+
from sentence_transformers import SentenceTransformer
|
| 18 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 19 |
+
from transformers import pipeline as hf_pipeline
|
| 20 |
|
| 21 |
from config import EMBEDDER_MODEL
|
| 22 |
from rosetta import check_terminology
|
|
|
|
| 24 |
log = logging.getLogger(__name__)
|
| 25 |
|
| 26 |
_embedder: SentenceTransformer | None = None
|
| 27 |
+
_nli_model: Any = None
|
| 28 |
|
| 29 |
NLI_MODEL = "vectara/hallucination_evaluation_model"
|
| 30 |
|
|
|
|
| 37 |
return _embedder
|
| 38 |
|
| 39 |
|
| 40 |
+
def get_nli_model() -> Any:
|
| 41 |
+
"""Return the shared Vectara faithfulness pipeline, loading it on first call."""
|
| 42 |
global _nli_model
|
| 43 |
if _nli_model is None:
|
| 44 |
+
_nli_model = hf_pipeline(
|
| 45 |
+
"text-classification",
|
| 46 |
+
model=NLI_MODEL,
|
| 47 |
+
trust_remote_code=True,
|
| 48 |
+
truncation=True,
|
| 49 |
+
max_length=512,
|
| 50 |
+
)
|
| 51 |
return _nli_model
|
| 52 |
|
| 53 |
|
|
|
|
| 161 |
if not raw_chunks:
|
| 162 |
return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
|
| 163 |
chunks = [_strip_chunk_title(c) for c in raw_chunks]
|
| 164 |
+
# Vectara HHEM v2: text-classification pipeline, label "Factually Consistent" = faithful
|
| 165 |
+
pairs = [{"text": chunk, "text_pair": response} for chunk in chunks]
|
| 166 |
+
results = model(pairs)
|
| 167 |
+
scores = [
|
| 168 |
+
r["score"] if "consistent" in r["label"].lower() else 1.0 - r["score"]
|
| 169 |
+
for r in results
|
| 170 |
+
]
|
| 171 |
score = float(max(scores))
|
| 172 |
passed = score >= FAITHFULNESS_THRESHOLD
|
| 173 |
return GradeResult(
|