Spaces:

below-threshold
/

ai-response-validator

Sleeping

mbochniak01 Claude Sonnet 4.6 committed on 21 days ago

Commit

a42a9e0

1 Parent(s): 1ea03d4

Load Vectara model via transformers pipeline, not CrossEncoder

CrossEncoder conflicts with HHEMv2's custom architecture on newer
transformers (tied weights resolution mismatch). Vectara's own docs
recommend transformers.pipeline("text-classification").

Pipeline output: {"label": "Factually Consistent", "score": float}
Normalise: if label is inconsistent, faithfulness = 1 - score.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

Dockerfile +3 -2
backend/grader.py +21 -9

Dockerfile CHANGED Viewed

@@ -12,9 +12,10 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Pre-download models so first request isn't slow on HF Spaces
 RUN python -c "\
-from sentence_transformers import SentenceTransformer, CrossEncoder; \
 SentenceTransformer('all-MiniLM-L6-v2'); \
-CrossEncoder('vectara/hallucination_evaluation_model', trust_remote_code=True)"
 COPY knowledge/ ./knowledge/
 COPY backend/   ./backend/

 # Pre-download models so first request isn't slow on HF Spaces
 RUN python -c "\
+from sentence_transformers import SentenceTransformer; \
+from transformers import pipeline; \
 SentenceTransformer('all-MiniLM-L6-v2'); \
+pipeline('text-classification', model='vectara/hallucination_evaluation_model', trust_remote_code=True)"
 COPY knowledge/ ./knowledge/
 COPY backend/   ./backend/

backend/grader.py CHANGED Viewed

@@ -5,16 +5,18 @@ Metrics:
   pii_leakage        — regex scan for PII patterns in response
   token_budget       — response within allowed token ceiling
   answer_relevancy   — cosine similarity between query and response embeddings
-  faithfulness       — NLI cross-encoder: is response grounded in retrieved context?
   chain_terminology  — deterministic: client-specific terms used (via RosettaStone)
 """
 import re
 import logging
 from dataclasses import dataclass, field
-from sentence_transformers import SentenceTransformer, CrossEncoder
 from sklearn.metrics.pairwise import cosine_similarity
 from config import EMBEDDER_MODEL
 from rosetta import check_terminology
@@ -22,7 +24,7 @@ from rosetta import check_terminology
 log = logging.getLogger(__name__)
 _embedder: SentenceTransformer | None = None
-_nli_model: CrossEncoder | None = None
 NLI_MODEL = "vectara/hallucination_evaluation_model"
@@ -35,11 +37,17 @@ def get_embedder() -> SentenceTransformer:
     return _embedder
-def get_nli_model() -> CrossEncoder:
-    """Return the shared NLI cross-encoder, loading it on first call."""
     global _nli_model
     if _nli_model is None:
-        _nli_model = CrossEncoder(NLI_MODEL, trust_remote_code=True)
     return _nli_model
@@ -153,9 +161,13 @@ def grade_faithfulness(response: str, context: str) -> GradeResult:
     if not raw_chunks:
         return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
     chunks = [_strip_chunk_title(c) for c in raw_chunks]
-    # Vectara model: predict((document, response)) → faithfulness probability in [0, 1]
-    pairs = [(chunk, response) for chunk in chunks]
-    scores = model.predict(pairs)
     score = float(max(scores))
     passed = score >= FAITHFULNESS_THRESHOLD
     return GradeResult(

   pii_leakage        — regex scan for PII patterns in response
   token_budget       — response within allowed token ceiling
   answer_relevancy   — cosine similarity between query and response embeddings
+  faithfulness       — Vectara HHEM v2: RAG faithfulness probability per (doc, response) pair
   chain_terminology  — deterministic: client-specific terms used (via RosettaStone)
 """
 import re
 import logging
 from dataclasses import dataclass, field
+from typing import Any
+from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
+from transformers import pipeline as hf_pipeline
 from config import EMBEDDER_MODEL
 from rosetta import check_terminology
 log = logging.getLogger(__name__)
 _embedder: SentenceTransformer | None = None
+_nli_model: Any = None
 NLI_MODEL = "vectara/hallucination_evaluation_model"
     return _embedder
+def get_nli_model() -> Any:
+    """Return the shared Vectara faithfulness pipeline, loading it on first call."""
     global _nli_model
     if _nli_model is None:
+        _nli_model = hf_pipeline(
+            "text-classification",
+            model=NLI_MODEL,
+            trust_remote_code=True,
+            truncation=True,
+            max_length=512,
+        )
     return _nli_model
     if not raw_chunks:
         return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
     chunks = [_strip_chunk_title(c) for c in raw_chunks]
+    # Vectara HHEM v2: text-classification pipeline, label "Factually Consistent" = faithful
+    pairs = [{"text": chunk, "text_pair": response} for chunk in chunks]
+    results = model(pairs)
+    scores = [
+        r["score"] if "consistent" in r["label"].lower() else 1.0 - r["score"]
+        for r in results
+    ]
     score = float(max(scores))
     passed = score >= FAITHFULNESS_THRESHOLD
     return GradeResult(