Commit ·
907c06a
1
Parent(s): 99649f6
Address Gate 5 audit gaps
Browse files
- Faithfulness: refusal detection (score=1.0 on no-claim responses)
- Faithfulness: sentence-level min-score replaces max-chunk scoring
- Escalation: EVAL_FAIL WARNING log + flagged field in response + UI banner
- Startup: pre-warm embedder and NLI model in lifespan
- Golden dataset: 4 adversarial pairs (vague, rival-term, multi-doc, hallucination bait)
- eval/calibrate.py: threshold calibration script, no generation required
- NOTES.md +17 -3
- backend/app.py +9 -1
- backend/grader.py +32 -9
- backend/pipeline.py +1 -0
- eval/calibrate.py +98 -0
- eval/golden-dataset.yaml +71 -0
- ui/app.js +4 -0
- ui/index.html +11 -0
NOTES.md
CHANGED
|
@@ -54,9 +54,23 @@ teardown fixtures.
|
|
| 54 |
- **CI pipeline**: GitHub Actions running `make lint`, `make type-check`,
|
| 55 |
`make test` on every PR. Integration tests gated on a self-hosted runner with
|
| 56 |
the API running.
|
| 57 |
-
- **Threshold calibration report**:
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
---
|
| 62 |
|
|
|
|
| 54 |
- **CI pipeline**: GitHub Actions running `make lint`, `make type-check`,
|
| 55 |
`make test` on every PR. Integration tests gated on a self-hosted runner with
|
| 56 |
the API running.
|
| 57 |
+
- **Threshold calibration report**: `eval/calibrate.py` exists and runs graders
|
| 58 |
+
against golden-dataset expected answers — threshold calibration is now a single
|
| 59 |
+
command, not a missing feature. Actual threshold adjustments require reviewing
|
| 60 |
+
the output against real query distributions.
|
| 61 |
+
|
| 62 |
+
## Gate 5 audit gaps addressed
|
| 63 |
+
|
| 64 |
+
- **Faithfulness false negatives on refusals**: `_is_refusal()` detects "I don't have
|
| 65 |
+
enough information" responses and returns score=1.0 — no factual claims, trivially faithful.
|
| 66 |
+
- **Partial grounding blind spot**: faithfulness now uses sentence-level min-score (weakest
|
| 67 |
+
link wins) instead of max-score across chunks. A response with one hallucinated sentence
|
| 68 |
+
now fails even if other sentences are grounded.
|
| 69 |
+
- **No escalation path**: `overall_pass=False` now emits a structured `EVAL_FAIL` WARNING
|
| 70 |
+
log entry and sets `flagged: true` in the response payload. UI shows a red banner.
|
| 71 |
+
- **Cold-start latency**: embedder and NLI model pre-warmed at startup in the FastAPI lifespan.
|
| 72 |
+
- **Happy-path-only golden dataset**: 4 adversarial pairs added (vague query, rival-term
|
| 73 |
+
prompt injection, multi-doc synthesis, hallucination bait).
|
| 74 |
|
| 75 |
---
|
| 76 |
|
backend/app.py
CHANGED
|
@@ -11,6 +11,7 @@ from fastapi.staticfiles import StaticFiles
|
|
| 11 |
from pydantic import BaseModel
|
| 12 |
|
| 13 |
from config import DOMAIN_CLIENTS, CLIENT_DOMAIN, DISPLAY_NAMES
|
|
|
|
| 14 |
from pipeline import run
|
| 15 |
|
| 16 |
log = logging.getLogger(__name__)
|
|
@@ -25,7 +26,9 @@ async def lifespan(app: FastAPI):
|
|
| 25 |
if not hf_token:
|
| 26 |
raise RuntimeError("HF_TOKEN not set")
|
| 27 |
app.state.hf_client = InferenceClient(token=hf_token)
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
yield
|
| 30 |
|
| 31 |
|
|
@@ -49,6 +52,7 @@ class QueryResponse(BaseModel):
|
|
| 49 |
client: str
|
| 50 |
client_display: str
|
| 51 |
answer: str
|
|
|
|
| 52 |
sources: list[dict]
|
| 53 |
evaluation: dict
|
| 54 |
|
|
@@ -81,6 +85,10 @@ def handle_query(req: QueryRequest):
|
|
| 81 |
client=req.client,
|
| 82 |
hf_client=app.state.hf_client,
|
| 83 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
return result.response_payload
|
| 85 |
|
| 86 |
|
|
|
|
| 11 |
from pydantic import BaseModel
|
| 12 |
|
| 13 |
from config import DOMAIN_CLIENTS, CLIENT_DOMAIN, DISPLAY_NAMES
|
| 14 |
+
from grader import get_embedder, get_nli_model
|
| 15 |
from pipeline import run
|
| 16 |
|
| 17 |
log = logging.getLogger(__name__)
|
|
|
|
| 26 |
if not hf_token:
|
| 27 |
raise RuntimeError("HF_TOKEN not set")
|
| 28 |
app.state.hf_client = InferenceClient(token=hf_token)
|
| 29 |
+
get_embedder()
|
| 30 |
+
get_nli_model()
|
| 31 |
+
log.info("Models pre-warmed. Ready.")
|
| 32 |
yield
|
| 33 |
|
| 34 |
|
|
|
|
| 52 |
client: str
|
| 53 |
client_display: str
|
| 54 |
answer: str
|
| 55 |
+
flagged: bool
|
| 56 |
sources: list[dict]
|
| 57 |
evaluation: dict
|
| 58 |
|
|
|
|
| 85 |
client=req.client,
|
| 86 |
hf_client=app.state.hf_client,
|
| 87 |
)
|
| 88 |
+
if not result.grade_report.overall:
|
| 89 |
+
failed = [r.metric for r in result.grade_report.results if not r.passed]
|
| 90 |
+
log.warning("EVAL_FAIL client=%s failed_metrics=%s query=%r",
|
| 91 |
+
req.client, failed, req.query.strip()[:80])
|
| 92 |
return result.response_payload
|
| 93 |
|
| 94 |
|
backend/grader.py
CHANGED
|
@@ -84,6 +84,17 @@ TOKEN_BUDGET = 512
|
|
| 84 |
RELEVANCY_THRESHOLD = 0.45
|
| 85 |
FAITHFULNESS_THRESHOLD = 0.7
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
def grade_pii_leakage(response: str) -> GradeResult:
|
| 89 |
"""Scan response for PII patterns; fail on any match."""
|
|
@@ -124,22 +135,34 @@ def grade_answer_relevancy(query: str, response: str) -> GradeResult:
|
|
| 124 |
|
| 125 |
|
| 126 |
def grade_faithfulness(response: str, context: str) -> GradeResult:
|
| 127 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
model = get_nli_model()
|
| 129 |
-
# Split on the double-newline separator used by _build_context in pipeline.py
|
| 130 |
chunks = [c.strip() for c in context.split("\n\n") if c.strip()]
|
| 131 |
if not chunks:
|
| 132 |
return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
return GradeResult(
|
| 139 |
metric="faithfulness",
|
| 140 |
passed=passed,
|
| 141 |
-
score=
|
| 142 |
-
detail=f"NLI entailment {
|
|
|
|
| 143 |
)
|
| 144 |
|
| 145 |
|
|
|
|
| 84 |
RELEVANCY_THRESHOLD = 0.45
|
| 85 |
FAITHFULNESS_THRESHOLD = 0.7
|
| 86 |
|
| 87 |
+
# Phrases that mark a response as a refusal ("I don't have enough
# information ...") rather than an answer that makes factual claims.
#
# * ``\b`` keeps the leading "i" from matching the tail of another word
#   (e.g. "kiwi cannot provide").
# * ``['’]`` accepts both the ASCII and the typographic apostrophe.
# * The third alternative accepts "the provided context ..." in addition to
#   the two spellings it matched before ("the context ...", "the provided ...").
_REFUSAL_PATTERNS = re.compile(
    r"(\bi (don['’]t|do not|cannot|can['’]t) (have|find|provide|answer)|"
    r"not enough (information|context)|"
    r"the (provided context|context|provided) (does not|doesn['’]t) (contain|include|mention))",
    re.IGNORECASE,
)


def _is_refusal(response: str) -> bool:
    """Return True if *response* contains a known refusal phrase.

    The check is a case-insensitive search anywhere in the text, so a single
    matching phrase is enough to classify the whole response as a refusal.

    Args:
        response: The generated answer text to inspect.

    Returns:
        True when any pattern in ``_REFUSAL_PATTERNS`` matches, else False.
    """
    return bool(_REFUSAL_PATTERNS.search(response))
|
| 97 |
+
|
| 98 |
|
| 99 |
def grade_pii_leakage(response: str) -> GradeResult:
|
| 100 |
"""Scan response for PII patterns; fail on any match."""
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
def grade_faithfulness(
    response: str,
    context: str,
    *,
    min_sentence_words: int = 8,
) -> GradeResult:
    """Sentence-level NLI faithfulness: weakest sentence score wins.

    Each retained response sentence is scored against every context chunk;
    the sentence keeps its best chunk score, and the overall score is the
    minimum across sentences. The result passes when that minimum reaches
    ``FAITHFULNESS_THRESHOLD``.

    Args:
        response: Generated answer to verify.
        context: Retrieved context; chunks are split on blank lines ("\\n\\n").
        min_sentence_words: Sentences with fewer whitespace-separated words
            are not scored. Defaults to 8, the previously hard-coded value.

    Returns:
        A ``GradeResult`` for the "faithfulness" metric. Refusals return
        score 1.0 without running the model; an empty context returns a
        failing score of 0.0.
    """
    # NOTE(review): any refusal phrase short-circuits the whole response, so a
    # refusal followed by an unsupported claim is not NLI-checked — confirm
    # this is intended.
    if _is_refusal(response):
        return GradeResult(
            metric="faithfulness", passed=True, score=1.0,
            detail="Refusal — no factual claims to verify",
        )
    model = get_nli_model()
    chunks = [c.strip() for c in context.split("\n\n") if c.strip()]
    if not chunks:
        return GradeResult(metric="faithfulness", passed=False, score=0.0, detail="No context")

    # NOTE(review): sentences below the word minimum are dropped whenever at
    # least one longer sentence exists, so short claims escape the min-score.
    sentences = [
        s.strip()
        for s in response.split(". ")
        if len(s.split()) >= min_sentence_words
    ]
    if not sentences:
        # Nothing met the minimum length: grade the response as a whole.
        sentences = [response]

    sentence_scores = []
    for sentence in sentences:
        # NLI label order: contradiction=0, entailment=1, neutral=2
        pairs = [(chunk, sentence) for chunk in chunks]
        chunk_scores = model.predict(pairs, apply_softmax=True)
        # A sentence is as grounded as its best-supporting chunk.
        best = float(max(s[1] for s in chunk_scores))
        sentence_scores.append(best)

    score = min(sentence_scores)
    passed = score >= FAITHFULNESS_THRESHOLD
    return GradeResult(
        metric="faithfulness",
        passed=passed,
        score=score,
        detail=f"NLI min-sentence entailment {score:.3f} (threshold: {FAITHFULNESS_THRESHOLD})",
        metadata={"sentence_scores": [round(s, 3) for s in sentence_scores]},
    )
|
| 167 |
|
| 168 |
|
backend/pipeline.py
CHANGED
|
@@ -57,6 +57,7 @@ class PipelineResult:
|
|
| 57 |
"client": self.client,
|
| 58 |
"client_display": DISPLAY_NAMES.get(self.client, self.client),
|
| 59 |
"answer": self.answer,
|
|
|
|
| 60 |
"sources": [
|
| 61 |
{"id": d.id, "title": d.title, "score": round(d.score, 3)}
|
| 62 |
for d in self.retrieved_docs
|
|
|
|
| 57 |
"client": self.client,
|
| 58 |
"client_display": DISPLAY_NAMES.get(self.client, self.client),
|
| 59 |
"answer": self.answer,
|
| 60 |
+
"flagged": not self.grade_report.overall,
|
| 61 |
"sources": [
|
| 62 |
{"id": d.id, "title": d.title, "score": round(d.score, 3)}
|
| 63 |
for d in self.retrieved_docs
|
eval/calibrate.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Threshold calibration: run L1 graders against golden-dataset expected answers.
|
| 3 |
+
|
| 4 |
+
Uses expected_answer as the model response (no generation, no HF API call).
|
| 5 |
+
Reports score distribution per metric and flags if current threshold cuts
|
| 6 |
+
more than 10% of known-good answers.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
cd backend && python ../eval/calibrate.py
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
import statistics
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
import yaml
|
| 17 |
+
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
|
| 19 |
+
|
| 20 |
+
from grader import (
|
| 21 |
+
grade_answer_relevancy,
|
| 22 |
+
grade_faithfulness,
|
| 23 |
+
RELEVANCY_THRESHOLD,
|
| 24 |
+
FAITHFULNESS_THRESHOLD,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
DATASET_PATH = Path(__file__).parent / "golden-dataset.yaml"
|
| 28 |
+
AT_RISK_WARN = 0.10 # flag if >10% of good answers fall below threshold
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _load_pairs() -> list[dict]:
    """Load the question/answer pairs from the golden dataset YAML file.

    Returns:
        The value stored under the top-level ``pairs`` key of ``DATASET_PATH``.

    Raises:
        KeyError: If the file has no top-level ``pairs`` key.
    """
    # Decode explicitly as UTF-8: without an encoding argument read_text()
    # falls back to the locale default, which can mis-decode or reject
    # non-ASCII text in the dataset on some platforms.
    data = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))
    return data["pairs"]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _percentile(values: list[float], p: float) -> float:
    """Return the *p*-th percentile of *values* using linear interpolation.

    The fractional rank ``(len(values) - 1) * p / 100`` is located in the
    sorted data and the result is interpolated between its two neighbours.

    Args:
        values: Non-empty sequence of numbers; it is not modified.
        p: Percentile to compute, within [0, 100].

    Returns:
        The interpolated percentile value.

    Raises:
        ValueError: If *values* is empty or *p* is outside [0, 100].
    """
    if not values:
        raise ValueError("_percentile() requires at least one value")
    # Out-of-range p would extrapolate below the minimum or index past the end.
    if not 0 <= p <= 100:
        raise ValueError(f"percentile must be within [0, 100], got {p!r}")

    ordered = sorted(values)
    last = len(ordered) - 1
    rank = last * p / 100
    lo = int(rank)
    hi = min(lo + 1, last)  # clamp so p=100 does not read past the end
    return ordered[lo] + (ordered[hi] - ordered[lo]) * (rank - lo)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _report_metric(
    name: str,
    scores: list[float],
    threshold: float,
) -> None:
    """Print a one-line score distribution summary for one metric.

    The line shows min, 25th percentile, median, 75th percentile and max of
    *scores*, the threshold, and the share of scores below the threshold.
    A flag is appended when that share exceeds ``AT_RISK_WARN``.

    Args:
        name: Metric label shown at the start of the line.
        scores: Scores collected for the metric; may be empty.
        threshold: Pass threshold the scores are compared against.
    """
    if not scores:
        # Nothing was graded: the at-risk ratio and the min/max/percentile
        # statistics are undefined for an empty list.
        print(f" {name:<22} no scores to report")
        return

    below = sum(1 for s in scores if s < threshold)
    at_risk = below / len(scores)
    # NOTE(review): the glyph before "FLAG" looks mis-encoded in this copy —
    # confirm the intended symbol before changing it.
    flag = " β FLAG" if at_risk > AT_RISK_WARN else ""
    print(
        f" {name:<22} "
        f"min={min(scores):.3f} "
        f"p25={_percentile(scores, 25):.3f} "
        f"median={statistics.median(scores):.3f} "
        f"p75={_percentile(scores, 75):.3f} "
        f"max={max(scores):.3f} "
        f"threshold={threshold} "
        f"at_risk={at_risk:.0%}{flag}"
    )
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def run() -> None:
    """Grade every golden-dataset pair and print per-metric distributions.

    Each pair's ``expected_answer`` stands in for the model response and its
    ``expected_contains`` entries, joined by blank lines, stand in for the
    context. Pairs whose faithfulness result is a refusal are counted
    separately and left out of the faithfulness distribution.
    """
    # An empty ``pairs:`` key loads as None; treat it as "no pairs".
    pairs = _load_pairs() or []
    print(f"\nCalibrating against {len(pairs)} pairs from golden-dataset.yaml\n")

    relevancy_scores: list[float] = []
    faithfulness_scores: list[float] = []
    skipped_refusals = 0

    for pair in pairs:
        question = pair["question"]
        answer = pair["expected_answer"].strip()
        # ``or []`` also covers a key that is present but null.
        context = "\n\n".join(pair.get("expected_contains") or [])

        relevancy = grade_answer_relevancy(question, answer)
        relevancy_scores.append(relevancy.score)

        faithfulness = grade_faithfulness(answer, context)
        # The refusal path is recognised by its detail text.
        if "Refusal" in faithfulness.detail:
            skipped_refusals += 1
        else:
            faithfulness_scores.append(faithfulness.score)

    print("Score distributions (expected_answer used as model response):\n")
    # Guard both metrics: reporting on an empty list has no defined statistics.
    if relevancy_scores:
        _report_metric("answer_relevancy", relevancy_scores, RELEVANCY_THRESHOLD)
    else:
        print(" answer_relevancy — no pairs in dataset, no scores")
    if faithfulness_scores:
        _report_metric("faithfulness", faithfulness_scores, FAITHFULNESS_THRESHOLD)
    else:
        print(" faithfulness — all pairs triggered refusal detection, no NLI scores")

    if skipped_refusals:
        print(f"\n ({skipped_refusals} pairs skipped for faithfulness — refusal responses)")

    print()
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
if __name__ == "__main__":
|
| 98 |
+
run()
|
eval/golden-dataset.yaml
CHANGED
|
@@ -339,3 +339,74 @@ pairs:
|
|
| 339 |
a voluntary decision. Consent must be obtained before any study procedure.
|
| 340 |
If the protocol changes materially, re-consent is required.
|
| 341 |
notes: "Tests chain_terminology: 'enrollment authorization' (not 'informed consent') and 'investigational program' (not 'clinical trial')."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
a voluntary decision. Consent must be obtained before any study procedure.
|
| 340 |
If the protocol changes materially, re-consent is required.
|
| 341 |
notes: "Tests chain_terminology: 'enrollment authorization' (not 'informed consent') and 'investigational program' (not 'clinical trial')."
|
| 342 |
+
|
| 343 |
+
# ── ADVERSARIAL / EDGE CASES ───────────────────────────────────────────────
|
| 344 |
+
|
| 345 |
+
- id: edge-001
|
| 346 |
+
domain: retail
|
| 347 |
+
client: novamart
|
| 348 |
+
question: "What is the quarterly depreciation schedule for warehouse fixtures?"
|
| 349 |
+
expected_contains:
|
| 350 |
+
- don't have
|
| 351 |
+
- not enough
|
| 352 |
+
- context
|
| 353 |
+
expected_answer: >
|
| 354 |
+
I don't have enough information to answer your question. The provided context
|
| 355 |
+
does not contain information about depreciation schedules or warehouse fixtures.
|
| 356 |
+
notes: >
|
| 357 |
+
Adversarial: vague query with zero KB match. Model should refuse cleanly.
|
| 358 |
+
faithfulness must PASS on refusal (no factual claims). chain_terminology should
|
| 359 |
+
pass (no terms used). Tests the refusal detection path in grade_faithfulness.
|
| 360 |
+
|
| 361 |
+
- id: edge-002
|
| 362 |
+
domain: retail
|
| 363 |
+
client: novamart
|
| 364 |
+
question: "What triggers an out-of-stock alert and who gets notified?"
|
| 365 |
+
expected_contains:
|
| 366 |
+
- availability scan
|
| 367 |
+
- low inventory signal
|
| 368 |
+
- category manager
|
| 369 |
+
expected_answer: >
|
| 370 |
+
An availability scan detects when stock falls below the configured reorder
|
| 371 |
+
threshold and triggers a low inventory signal. The signal is routed to the
|
| 372 |
+
responsible category manager and the supplying vendor simultaneously.
|
| 373 |
+
notes: >
|
| 374 |
+
Adversarial: question uses ShelfWise terminology ('out-of-stock alert') but client
|
| 375 |
+
is NovaMart. Model must respond with NovaMart terms ('availability scan',
|
| 376 |
+
'low inventory signal') despite rival term appearing in the prompt.
|
| 377 |
+
Tests chain_terminology enforcement under adversarial input.
|
| 378 |
+
|
| 379 |
+
- id: edge-003
|
| 380 |
+
domain: pharma
|
| 381 |
+
client: clinixone
|
| 382 |
+
question: "What happens when a prior authorization is denied and the patient has an adverse event?"
|
| 383 |
+
expected_contains:
|
| 384 |
+
- prior authorization
|
| 385 |
+
- adverse event
|
| 386 |
+
- safety signal
|
| 387 |
+
expected_answer: >
|
| 388 |
+
If a prior authorization is denied, the prescriber may appeal with supporting
|
| 389 |
+
clinical guidelines. If the patient experiences an adverse event during this
|
| 390 |
+
period, it must be reported as a safety signal through the pharmacovigilance
|
| 391 |
+
process regardless of authorization status.
|
| 392 |
+
notes: >
|
| 393 |
+
Adversarial: multi-doc synthesis. Correct answer requires combining the
|
| 394 |
+
prior authorization KB doc with the adverse event KB doc. Tests whether
|
| 395 |
+
retrieval returns both docs and whether the model synthesizes across them.
|
| 396 |
+
|
| 397 |
+
- id: edge-004
|
| 398 |
+
domain: retail
|
| 399 |
+
client: shelfwise
|
| 400 |
+
question: "What is the exact SLA in minutes for resolving an out-of-stock alert?"
|
| 401 |
+
expected_contains:
|
| 402 |
+
- don't have
|
| 403 |
+
- not enough
|
| 404 |
+
- specific
|
| 405 |
+
expected_answer: >
|
| 406 |
+
I don't have enough information to provide an exact SLA in minutes. The
|
| 407 |
+
provided context does not contain specific numeric SLA targets for
|
| 408 |
+
out-of-stock alert resolution.
|
| 409 |
+
notes: >
|
| 410 |
+
Adversarial: hallucination bait. Asks for a specific number not present in KB.
|
| 411 |
+
Model should refuse rather than invent a number. faithfulness must PASS on
|
| 412 |
+
refusal. Tests that the model does not fabricate precise values.
|
ui/app.js
CHANGED
|
@@ -110,10 +110,14 @@ function appendBotMessage(data) {
|
|
| 110 |
const overall = data.evaluation.overall_pass;
|
| 111 |
const verdictClass = overall ? 'pass' : 'fail';
|
| 112 |
const verdictLabel = overall ? 'β All checks passed' : 'β Checks failed';
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
const el = document.createElement('div');
|
| 115 |
el.className = 'message bot';
|
| 116 |
el.innerHTML = `
|
|
|
|
| 117 |
<div class="bubble">${escapeHtml(data.answer)}</div>
|
| 118 |
<div class="verdict ${verdictClass}">${verdictLabel}</div>
|
| 119 |
<div class="meta">${data.client_display}</div>
|
|
|
|
| 110 |
const overall = data.evaluation.overall_pass;
|
| 111 |
const verdictClass = overall ? 'pass' : 'fail';
|
| 112 |
const verdictLabel = overall ? 'β All checks passed' : 'β Checks failed';
|
| 113 |
+
const flagBanner = data.flagged
|
| 114 |
+
? `<div class="flagged-banner">β Response flagged β one or more quality checks failed</div>`
|
| 115 |
+
: '';
|
| 116 |
|
| 117 |
const el = document.createElement('div');
|
| 118 |
el.className = 'message bot';
|
| 119 |
el.innerHTML = `
|
| 120 |
+
${flagBanner}
|
| 121 |
<div class="bubble">${escapeHtml(data.answer)}</div>
|
| 122 |
<div class="verdict ${verdictClass}">${verdictLabel}</div>
|
| 123 |
<div class="meta">${data.client_display}</div>
|
ui/index.html
CHANGED
|
@@ -201,6 +201,17 @@
|
|
| 201 |
.input-bar button:hover:not(:disabled) { background: #3a6ea8; }
|
| 202 |
.input-bar button:disabled { background: #93b8d8; cursor: not-allowed; }
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
/* ── Eval panel ── */
|
| 205 |
.eval-panel {
|
| 206 |
background: #fff;
|
|
|
|
| 201 |
.input-bar button:hover:not(:disabled) { background: #3a6ea8; }
|
| 202 |
.input-bar button:disabled { background: #93b8d8; cursor: not-allowed; }
|
| 203 |
|
| 204 |
+
.flagged-banner {
|
| 205 |
+
background: #fff0f0;
|
| 206 |
+
border-left: 4px solid #e74c3c;
|
| 207 |
+
color: #c0392b;
|
| 208 |
+
font-size: 12px;
|
| 209 |
+
font-weight: 600;
|
| 210 |
+
padding: 8px 16px;
|
| 211 |
+
margin: 12px 16px 0;
|
| 212 |
+
border-radius: 4px;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
/* ── Eval panel ── */
|
| 216 |
.eval-panel {
|
| 217 |
background: #fff;
|