Spaces:

below-threshold
/

ai-response-validator

Sleeping

below-threshold commited on 29 days ago

Commit

907c06a

1 Parent(s): 99649f6

Address Gate 5 audit gaps

- Faithfulness: refusal detection (score=1.0 on no-claim responses)
- Faithfulness: sentence-level min-score replaces max-chunk scoring
- Escalation: EVAL_FAIL WARNING log + flagged field in response + UI banner
- Startup: pre-warm embedder and NLI model in lifespan
- Golden dataset: 4 adversarial pairs (vague, rival-term, multi-doc, hallucination bait)
- eval/calibrate.py: threshold calibration script, no generation required

Files changed (8) hide show

NOTES.md +17 -3
backend/app.py +9 -1
backend/grader.py +32 -9
backend/pipeline.py +1 -0
eval/calibrate.py +98 -0
eval/golden-dataset.yaml +71 -0
ui/app.js +4 -0
ui/index.html +11 -0

NOTES.md CHANGED Viewed

@@ -54,9 +54,23 @@ teardown fixtures.
 - **CI pipeline**: GitHub Actions running `make lint`, `make type-check`,
   `make test` on every PR. Integration tests gated on a self-hosted runner with
   the API running.
-- **Threshold calibration report**: plot the distribution of L1 metric scores
-  across the golden dataset to verify that current thresholds aren't too tight
-  or too loose.
 ---

 - **CI pipeline**: GitHub Actions running `make lint`, `make type-check`,
   `make test` on every PR. Integration tests gated on a self-hosted runner with
   the API running.
+- **Threshold calibration report**: `eval/calibrate.py` exists and runs graders
+  against golden-dataset expected answers — threshold calibration is now a single
+  command, not a missing feature. Actual threshold adjustments require reviewing
+  the output against real query distributions.
+## Gate 5 audit gaps addressed
+- **Faithfulness false negatives on refusals**: `_is_refusal()` detects "I don't have
+  enough information" responses and returns score=1.0 — no factual claims, trivially faithful.
+- **Partial grounding blind spot**: faithfulness now uses sentence-level min-score (weakest
+  link wins) instead of max-score across chunks. A response with one hallucinated sentence
+  now fails even if other sentences are grounded.
+- **No escalation path**: `overall_pass=False` now emits a structured `EVAL_FAIL` WARNING
+  log entry and sets `flagged: true` in the response payload. UI shows a red banner.
+- **Cold-start latency**: embedder and NLI model pre-warmed at startup in the FastAPI lifespan.
+- **Happy-path-only golden dataset**: 4 adversarial pairs added (vague query, rival-term
+  prompt injection, multi-doc synthesis, hallucination bait).
 ---

backend/app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from config import DOMAIN_CLIENTS, CLIENT_DOMAIN, DISPLAY_NAMES
 from pipeline import run
 log = logging.getLogger(__name__)
@@ -25,7 +26,9 @@ async def lifespan(app: FastAPI):
     if not hf_token:
         raise RuntimeError("HF_TOKEN not set")
     app.state.hf_client = InferenceClient(token=hf_token)
-    log.info("HuggingFace Inference client ready")
     yield
@@ -49,6 +52,7 @@ class QueryResponse(BaseModel):
     client: str
     client_display: str
     answer: str
     sources: list[dict]
     evaluation: dict
@@ -81,6 +85,10 @@ def handle_query(req: QueryRequest):
         client=req.client,
         hf_client=app.state.hf_client,
     )
     return result.response_payload

 from pydantic import BaseModel
 from config import DOMAIN_CLIENTS, CLIENT_DOMAIN, DISPLAY_NAMES
+from grader import get_embedder, get_nli_model
 from pipeline import run
 log = logging.getLogger(__name__)
     if not hf_token:
         raise RuntimeError("HF_TOKEN not set")
     app.state.hf_client = InferenceClient(token=hf_token)
+    get_embedder()
+    get_nli_model()
+    log.info("Models pre-warmed. Ready.")
     yield
     client: str
     client_display: str
     answer: str
+    flagged: bool
     sources: list[dict]
     evaluation: dict
         client=req.client,
         hf_client=app.state.hf_client,
     )
+    if not result.grade_report.overall:
+        failed = [r.metric for r in result.grade_report.results if not r.passed]
+        log.warning("EVAL_FAIL client=%s failed_metrics=%s query=%r",
+                    req.client, failed, req.query.strip()[:80])
     return result.response_payload

backend/grader.py CHANGED Viewed

@@ -84,6 +84,17 @@ TOKEN_BUDGET = 512
 RELEVANCY_THRESHOLD = 0.45
 FAITHFULNESS_THRESHOLD = 0.7
 def grade_pii_leakage(response: str) -> GradeResult:
     """Scan response for PII patterns; fail on any match."""
@@ -124,22 +135,34 @@ def grade_answer_relevancy(query: str, response: str) -> GradeResult:
 def grade_faithfulness(response: str, context: str) -> GradeResult:
-    """Score NLI entailment per context chunk; pass if any chunk entails the response."""
     model = get_nli_model()
-    # Split on the double-newline separator used by _build_context in pipeline.py
     chunks = [c.strip() for c in context.split("\n\n") if c.strip()]
     if not chunks:
         return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
-    # NLI label order for cross-encoder/nli-deberta-v3-small: contradiction, entailment, neutral
-    pairs = [(chunk, response) for chunk in chunks]
-    all_scores = model.predict(pairs, apply_softmax=True)
-    entailment_score = float(max(s[1] for s in all_scores))
-    passed = entailment_score >= FAITHFULNESS_THRESHOLD
     return GradeResult(
         metric="faithfulness",
         passed=passed,
-        score=entailment_score,
-        detail=f"NLI entailment {entailment_score:.3f} (threshold: {FAITHFULNESS_THRESHOLD})",
     )

 RELEVANCY_THRESHOLD = 0.45
 FAITHFULNESS_THRESHOLD = 0.7
+_REFUSAL_PATTERNS = re.compile(
+    r"(i (don't|do not|cannot|can't) (have|find|provide|answer)|"
+    r"not enough (information|context)|"
+    r"the (context|provided) (does not|doesn't) (contain|include|mention))",
+    re.IGNORECASE,
+)
+def _is_refusal(response: str) -> bool:
+    return bool(_REFUSAL_PATTERNS.search(response))
 def grade_pii_leakage(response: str) -> GradeResult:
     """Scan response for PII patterns; fail on any match."""
 def grade_faithfulness(response: str, context: str) -> GradeResult:
+    """Sentence-level NLI faithfulness: weakest sentence score wins."""
+    if _is_refusal(response):
+        return GradeResult(
+            metric="faithfulness", passed=True, score=1.0,
+            detail="Refusal — no factual claims to verify",
+        )
     model = get_nli_model()
     chunks = [c.strip() for c in context.split("\n\n") if c.strip()]
     if not chunks:
         return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
+    sentences = [s.strip() for s in response.split(". ") if len(s.split()) >= 8]
+    if not sentences:
+        sentences = [response]
+    sentence_scores = []
+    for sentence in sentences:
+        # NLI label order: contradiction=0, entailment=1, neutral=2
+        pairs = [(chunk, sentence) for chunk in chunks]
+        chunk_scores = model.predict(pairs, apply_softmax=True)
+        best = float(max(s[1] for s in chunk_scores))
+        sentence_scores.append(best)
+    score = min(sentence_scores)
+    passed = score >= FAITHFULNESS_THRESHOLD
     return GradeResult(
         metric="faithfulness",
         passed=passed,
+        score=score,
+        detail=f"NLI min-sentence entailment {score:.3f} (threshold: {FAITHFULNESS_THRESHOLD})",
+        metadata={"sentence_scores": [round(s, 3) for s in sentence_scores]},
     )

backend/pipeline.py CHANGED Viewed

@@ -57,6 +57,7 @@ class PipelineResult:
             "client": self.client,
             "client_display": DISPLAY_NAMES.get(self.client, self.client),
             "answer": self.answer,
             "sources": [
                 {"id": d.id, "title": d.title, "score": round(d.score, 3)}
                 for d in self.retrieved_docs

             "client": self.client,
             "client_display": DISPLAY_NAMES.get(self.client, self.client),
             "answer": self.answer,
+            "flagged": not self.grade_report.overall,
             "sources": [
                 {"id": d.id, "title": d.title, "score": round(d.score, 3)}
                 for d in self.retrieved_docs

eval/calibrate.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""
+Threshold calibration: run L1 graders against golden-dataset expected answers.
+Uses expected_answer as the model response (no generation, no HF API call).
+Reports score distribution per metric and flags if current threshold cuts
+more than 10% of known-good answers.
+Usage:
+    cd backend && python ../eval/calibrate.py
+"""
+import sys
+import statistics
+from pathlib import Path
+import yaml
+sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
+from grader import (
+    grade_answer_relevancy,
+    grade_faithfulness,
+    RELEVANCY_THRESHOLD,
+    FAITHFULNESS_THRESHOLD,
+)
+DATASET_PATH = Path(__file__).parent / "golden-dataset.yaml"
+AT_RISK_WARN = 0.10  # flag if >10% of good answers fall below threshold
+def _load_pairs() -> list[dict]:
+    data = yaml.safe_load(DATASET_PATH.read_text())
+    return data["pairs"]
+def _percentile(values: list[float], p: float) -> float:
+    sorted_v = sorted(values)
+    idx = (len(sorted_v) - 1) * p / 100
+    lo, hi = int(idx), min(int(idx) + 1, len(sorted_v) - 1)
+    return sorted_v[lo] + (sorted_v[hi] - sorted_v[lo]) * (idx - lo)
+def _report_metric(
+    name: str,
+    scores: list[float],
+    threshold: float,
+) -> None:
+    at_risk = sum(1 for s in scores if s < threshold) / len(scores)
+    flag = " ← FLAG" if at_risk > AT_RISK_WARN else ""
+    print(
+        f"  {name:<22} "
+        f"min={min(scores):.3f}  "
+        f"p25={_percentile(scores, 25):.3f}  "
+        f"median={statistics.median(scores):.3f}  "
+        f"p75={_percentile(scores, 75):.3f}  "
+        f"max={max(scores):.3f}  "
+        f"threshold={threshold}  "
+        f"at_risk={at_risk:.0%}{flag}"
+    )
+def run() -> None:
+    pairs = _load_pairs()
+    print(f"\nCalibrating against {len(pairs)} pairs from golden-dataset.yaml\n")
+    relevancy_scores: list[float] = []
+    faithfulness_scores: list[float] = []
+    skipped_refusals = 0
+    for pair in pairs:
+        question = pair["question"]
+        answer = pair["expected_answer"].strip()
+        context = "\n\n".join(pair.get("expected_contains", []))
+        r = grade_answer_relevancy(question, answer)
+        relevancy_scores.append(r.score)
+        f = grade_faithfulness(answer, context)
+        if "Refusal" in f.detail:
+            skipped_refusals += 1
+        else:
+            faithfulness_scores.append(f.score)
+    print("Score distributions (expected_answer used as model response):\n")
+    _report_metric("answer_relevancy", relevancy_scores, RELEVANCY_THRESHOLD)
+    if faithfulness_scores:
+        _report_metric("faithfulness", faithfulness_scores, FAITHFULNESS_THRESHOLD)
+    else:
+        print("  faithfulness           — all pairs triggered refusal detection, no NLI scores")
+    if skipped_refusals:
+        print(f"\n  ({skipped_refusals} pairs skipped for faithfulness — refusal responses)")
+    print()
+if __name__ == "__main__":
+    run()

eval/golden-dataset.yaml CHANGED Viewed

@@ -339,3 +339,74 @@ pairs:
       a voluntary decision. Consent must be obtained before any study procedure.
       If the protocol changes materially, re-consent is required.
     notes: "Tests chain_terminology: 'enrollment authorization' (not 'informed consent') and 'investigational program' (not 'clinical trial')."

       a voluntary decision. Consent must be obtained before any study procedure.
       If the protocol changes materially, re-consent is required.
     notes: "Tests chain_terminology: 'enrollment authorization' (not 'informed consent') and 'investigational program' (not 'clinical trial')."
+  # ── ADVERSARIAL / EDGE CASES ───────────────────────────────────────────────
+  - id: edge-001
+    domain: retail
+    client: novamart
+    question: "What is the quarterly depreciation schedule for warehouse fixtures?"
+    expected_contains:
+      - don't have
+      - not enough
+      - context
+    expected_answer: >
+      I don't have enough information to answer your question. The provided context
+      does not contain information about depreciation schedules or warehouse fixtures.
+    notes: >
+      Adversarial: vague query with zero KB match. Model should refuse cleanly.
+      faithfulness must PASS on refusal (no factual claims). chain_terminology should
+      pass (no terms used). Tests the refusal detection path in grade_faithfulness.
+  - id: edge-002
+    domain: retail
+    client: novamart
+    question: "What triggers an out-of-stock alert and who gets notified?"
+    expected_contains:
+      - availability scan
+      - low inventory signal
+      - category manager
+    expected_answer: >
+      An availability scan detects when stock falls below the configured reorder
+      threshold and triggers a low inventory signal. The signal is routed to the
+      responsible category manager and the supplying vendor simultaneously.
+    notes: >
+      Adversarial: question uses ShelfWise terminology ('out-of-stock alert') but client
+      is NovaMart. Model must respond with NovaMart terms ('availability scan',
+      'low inventory signal') despite rival term appearing in the prompt.
+      Tests chain_terminology enforcement under adversarial input.
+  - id: edge-003
+    domain: pharma
+    client: clinixone
+    question: "What happens when a prior authorization is denied and the patient has an adverse event?"
+    expected_contains:
+      - prior authorization
+      - adverse event
+      - safety signal
+    expected_answer: >
+      If a prior authorization is denied, the prescriber may appeal with supporting
+      clinical guidelines. If the patient experiences an adverse event during this
+      period, it must be reported as a safety signal through the pharmacovigilance
+      process regardless of authorization status.
+    notes: >
+      Adversarial: multi-doc synthesis. Correct answer requires combining the
+      prior authorization KB doc with the adverse event KB doc. Tests whether
+      retrieval returns both docs and whether the model synthesizes across them.
+  - id: edge-004
+    domain: retail
+    client: shelfwise
+    question: "What is the exact SLA in minutes for resolving an out-of-stock alert?"
+    expected_contains:
+      - don't have
+      - not enough
+      - specific
+    expected_answer: >
+      I don't have enough information to provide an exact SLA in minutes. The
+      provided context does not contain specific numeric SLA targets for
+      out-of-stock alert resolution.
+    notes: >
+      Adversarial: hallucination bait. Asks for a specific number not present in KB.
+      Model should refuse rather than invent a number. faithfulness must PASS on
+      refusal. Tests that the model does not fabricate precise values.

ui/app.js CHANGED Viewed

@@ -110,10 +110,14 @@ function appendBotMessage(data) {
   const overall = data.evaluation.overall_pass;
   const verdictClass = overall ? 'pass' : 'fail';
   const verdictLabel = overall ? '✓ All checks passed' : '✗ Checks failed';
   const el = document.createElement('div');
   el.className = 'message bot';
   el.innerHTML = `
     <div class="bubble">${escapeHtml(data.answer)}</div>
     <div class="verdict ${verdictClass}">${verdictLabel}</div>
     <div class="meta">${data.client_display}</div>

   const overall = data.evaluation.overall_pass;
   const verdictClass = overall ? 'pass' : 'fail';
   const verdictLabel = overall ? '✓ All checks passed' : '✗ Checks failed';
+  const flagBanner = data.flagged
+    ? `<div class="flagged-banner">⚠ Response flagged — one or more quality checks failed</div>`
+    : '';
   const el = document.createElement('div');
   el.className = 'message bot';
   el.innerHTML = `
+    ${flagBanner}
     <div class="bubble">${escapeHtml(data.answer)}</div>
     <div class="verdict ${verdictClass}">${verdictLabel}</div>
     <div class="meta">${data.client_display}</div>

ui/index.html CHANGED Viewed

@@ -201,6 +201,17 @@
     .input-bar button:hover:not(:disabled) { background: #3a6ea8; }
     .input-bar button:disabled { background: #93b8d8; cursor: not-allowed; }
     /* ── Eval panel ── */
     .eval-panel {
       background: #fff;

     .input-bar button:hover:not(:disabled) { background: #3a6ea8; }
     .input-bar button:disabled { background: #93b8d8; cursor: not-allowed; }
+    .flagged-banner {
+      background: #fff0f0;
+      border-left: 4px solid #e74c3c;
+      color: #c0392b;
+      font-size: 12px;
+      font-weight: 600;
+      padding: 8px 16px;
+      margin: 12px 16px 0;
+      border-radius: 4px;
+    }
     /* ── Eval panel ── */
     .eval-panel {
       background: #fff;