mbochniak01 Claude Sonnet 4.6 committed on
Commit
5935cf6
·
1 Parent(s): e77a2f2

Fix Vectara label check and input format

Browse files

Two bugs:
1. Label check used 'in', so the substring "consistent" also matched inside "inconsistent" —
the check was always true and returned the raw score regardless of label. Fixed with startswith("factually consistent").
2. text_pair dict format causes T5 tokenizer to encode pairs incorrectly. Switched
to single concatenated string: "{chunk} {response}". T5 handles one sequence
cleanly; text_pair semantics are BERT-specific.

Also added debug logging of raw Vectara output for future calibration.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. Dockerfile +2 -1
  2. backend/grader.py +6 -4
Dockerfile CHANGED
@@ -16,7 +16,8 @@ from sentence_transformers import SentenceTransformer; \
16
  from transformers import T5Tokenizer, pipeline; \
17
  SentenceTransformer('all-MiniLM-L6-v2'); \
18
  tok = T5Tokenizer.from_pretrained('t5-small'); \
19
- pipeline('text-classification', model='vectara/hallucination_evaluation_model', tokenizer=tok, trust_remote_code=True)"
 
20
 
21
  COPY knowledge/ ./knowledge/
22
  COPY backend/ ./backend/
 
16
  from transformers import T5Tokenizer, pipeline; \
17
  SentenceTransformer('all-MiniLM-L6-v2'); \
18
  tok = T5Tokenizer.from_pretrained('t5-small'); \
19
+ pipe = pipeline('text-classification', model='vectara/hallucination_evaluation_model', tokenizer=tok, trust_remote_code=True); \
20
+ pipe(['test document test response'])"
21
 
22
  COPY knowledge/ ./knowledge/
23
  COPY backend/ ./backend/
backend/grader.py CHANGED
@@ -163,11 +163,13 @@ def grade_faithfulness(response: str, context: str) -> GradeResult:
163
  if not raw_chunks:
164
  return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
165
  chunks = [_strip_chunk_title(c) for c in raw_chunks]
166
- # Vectara HHEM v2: text-classification pipeline, label "Factually Consistent" = faithful
167
- pairs = [{"text": chunk, "text_pair": response} for chunk in chunks]
168
- results = model(pairs)
 
 
169
  scores = [
170
- r["score"] if "consistent" in r["label"].lower() else 1.0 - r["score"]
171
  for r in results
172
  ]
173
  score = float(max(scores))
 
163
  if not raw_chunks:
164
  return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
165
  chunks = [_strip_chunk_title(c) for c in raw_chunks]
166
+ # Vectara HHEM v2: single concatenated string avoids T5 text_pair encoding issues.
167
+ # Label "Factually Consistent" = faithful; use startswith to avoid "inconsistent" false match.
168
+ inputs = [f"{chunk} {response}" for chunk in chunks]
169
+ results = model(inputs)
170
+ log.debug("Vectara raw results: %s", results)
171
  scores = [
172
+ r["score"] if r["label"].lower().startswith("factually consistent") else 1.0 - r["score"]
173
  for r in results
174
  ]
175
  score = float(max(scores))