Spaces:

below-threshold
/

ai-response-validator

Sleeping

mbochniak01 committed 23 days ago

Commit

ebe934f

1 Parent(s): b917936

Add full RAG evaluation pipeline with L1 metrics and UI

- FastAPI backend: retrieve → generate → grade pipeline
- In-memory semantic retrieval (sentence-transformers, all-MiniLM-L6-v2)
- L1 graders: pii_leakage, token_budget, answer_relevancy, faithfulness, chain_terminology
- RosettaStone: deterministic client-specific terminology validation
- Two domains (retail, pharma) with two fictional clients each
- Plain HTML/JS frontend with real-time metric panel
- 20-pair golden dataset for L2 batch evaluation
- Docker config for HF Spaces (port 7860)

Files changed (17) hide show

ARCHITECTURE.md +0 -0
Dockerfile +26 -0
README.md +17 -4
backend/app.py +92 -0
backend/config.py +36 -0
backend/grader.py +215 -0
backend/pipeline.py +152 -0
backend/rosetta.py +62 -0
eval/golden-dataset.yaml +341 -0
eval/metrics.py +0 -0
knowledge/pharma/features.yaml +98 -0
knowledge/pharma/term-catalog.yaml +32 -0
knowledge/retail/features.yaml +78 -0
knowledge/retail/term-catalog.yaml +28 -0
requirements.txt +8 -0
ui/app.js +235 -0
ui/index.html +410 -0

ARCHITECTURE.md ADDED Viewed

File without changes

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+FROM python:3.11-slim
+WORKDIR /app
+# Compiler toolchain for Python packages that build native extensions at
+# install time. NOTE(review): originally justified by tokenizers' Rust
+# bindings; build-essential provides a C/C++ toolchain only — confirm it is
+# still required.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Install dependencies before copying sources so this layer can be reused
+# when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Pre-download the embedding model so first request isn't slow on HF Spaces.
+# The model name must match EMBEDDER_MODEL in backend/config.py.
+RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
+COPY knowledge/ ./knowledge/
+COPY backend/   ./backend/
+COPY ui/        ./ui/
+# Run from backend/ so `app:app` and the flat imports (config, pipeline, ...)
+# resolve without a package prefix.
+WORKDIR /app/backend
+# HF Spaces requires port 7860. PORT is exported for reference only: the CMD
+# below hard-codes the port and does not read $PORT.
+ENV PORT=7860
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1,23 @@
 ---
-title: Ai Response Validator
-emoji: 📉
 colorFrom: blue
-colorTo: green
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: AI Response Validator
+emoji: 🔍
 colorFrom: blue
+colorTo: blue
 sdk: docker
+app_port: 7860
 pinned: false
 ---
+# AI Response Validator
+Domain-agnostic RAG evaluation system. Validates AI responses for correctness,
+faithfulness, and client-specific terminology across retail and pharma domains.
+**Live demo:** select a domain and client, then ask a question in natural language.
+Each response is evaluated in real time across 5 metrics:
+- **PII Leakage** — regex scan, no personal data in responses
+- **Token Budget** — response within ceiling
+- **Answer Relevancy** — cosine similarity between query and response
+- **Faithfulness** — Claude judge: is the answer grounded in retrieved context?
+- **Chain Terminology** — deterministic check that the bot uses client-specific terms

backend/app.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import logging
+import os
+from contextlib import asynccontextmanager
+from pathlib import Path
+import anthropic
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
+from config import DOMAIN_CLIENTS, CLIENT_DOMAIN, DISPLAY_NAMES
+from pipeline import run
+log = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+UI_DIR = Path(__file__).parent.parent / "ui"
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        raise RuntimeError("ANTHROPIC_API_KEY not set")
+    app.state.anthropic = anthropic.Anthropic(api_key=api_key)
+    log.info("Anthropic client ready")
+    yield
+# Application instance; `lifespan` runs at startup to build shared state.
+app = FastAPI(title="AI Response Validator", lifespan=lifespan)
+# CORS is wide open: any origin and any header, for GET and POST.
+# NOTE(review): presumably acceptable for a public demo — confirm before
+# reusing this configuration elsewhere.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["GET", "POST"],
+    allow_headers=["*"],
+)
+class QueryRequest(BaseModel):
+    """Request body for POST /query."""
+    query: str  # natural-language question; the handler rejects blank input
+    client: str  # client id; the handler requires a key of CLIENT_DOMAIN
+class QueryResponse(BaseModel):
+    """Response body for POST /query (mirrors PipelineResult.response_payload)."""
+    query: str  # the query as processed (whitespace-stripped by the handler)
+    client: str  # client id echoed back
+    client_display: str  # DISPLAY_NAMES entry, falling back to the raw id
+    answer: str  # generated answer text
+    sources: list[dict]  # one {"id", "title", "score"} dict per retrieved doc
+    evaluation: dict  # GradeReport.summary: {"overall_pass", "metrics"}
+@app.get("/health")
+def health():
+    return {"status": "ok"}
+@app.get("/config")
+def get_config():
+    """Domain/client structure for the UI switcher."""
+    return {
+        "domains": {
+            domain: [{"id": c, "display": DISPLAY_NAMES[c]} for c in clients]
+            for domain, clients in DOMAIN_CLIENTS.items()
+        }
+    }
+@app.post("/query", response_model=QueryResponse)
+def handle_query(req: QueryRequest):
+    if req.client not in CLIENT_DOMAIN:
+        raise HTTPException(status_code=400, detail=f"Unknown client: {req.client!r}")
+    if not req.query.strip():
+        raise HTTPException(status_code=400, detail="Query cannot be empty")
+    result = run(
+        query=req.query.strip(),
+        client=req.client,
+        anthropic_client=app.state.anthropic,
+    )
+    return result.response_payload
+app.mount("/static", StaticFiles(directory=UI_DIR), name="static")
+@app.get("/")
+def root():
+    return FileResponse(UI_DIR / "index.html")

backend/config.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from pathlib import Path
+# Root of the per-domain knowledge files: <repo>/knowledge/<domain>/...
+KNOWLEDGE_ROOT = Path(__file__).parent.parent / "knowledge"
+# sentence-transformers model id loaded by grader.get_embedder().
+EMBEDDER_MODEL = "all-MiniLM-L6-v2"
+# Domain -> client ids that belong to it.
+DOMAIN_CLIENTS: dict[str, list[str]] = {
+    "retail": ["novamart", "shelfwise"],
+    "pharma": ["clinixone", "pharmalink"],
+}
+# Reverse lookup derived from DOMAIN_CLIENTS: client id -> domain.
+CLIENT_DOMAIN: dict[str, str] = {
+    client: domain
+    for domain, clients in DOMAIN_CLIENTS.items()
+    for client in clients
+}
+# Client id -> human-readable name (used by /config and the system prompt).
+DISPLAY_NAMES: dict[str, str] = {
+    "novamart": "NovaMart",
+    "shelfwise": "ShelfWise",
+    "clinixone": "ClinixOne",
+    "pharmalink": "PharmaLink",
+}
+def term_catalog_path(domain: str) -> Path:
+    return KNOWLEDGE_ROOT / domain / "term-catalog.yaml"
+def features_path(domain: str) -> Path:
+    return KNOWLEDGE_ROOT / domain / "features.yaml"
+def domain_for(client: str) -> str:
+    if client not in CLIENT_DOMAIN:
+        raise ValueError(f"Unknown client: {client!r}. Valid: {list(CLIENT_DOMAIN)}")
+    return CLIENT_DOMAIN[client]

backend/grader.py ADDED Viewed

	@@ -0,0 +1,215 @@

+"""
+L1 graders — run live on every query.
+Metrics:
+  pii_leakage        — regex scan for PII patterns in response
+  token_budget       — response within allowed token ceiling
+  answer_relevancy   — cosine similarity between query and response embeddings
+  faithfulness       — Claude judge: is response grounded in retrieved context?
+  chain_terminology  — deterministic: client-specific terms used (via RosettaStone)
+"""
+import re
+import json
+import logging
+from dataclasses import dataclass, field
+import anthropic
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from config import EMBEDDER_MODEL
+from rosetta import check_terminology
+log = logging.getLogger(__name__)
+_embedder: SentenceTransformer | None = None
+def get_embedder() -> SentenceTransformer:
+    """Return the shared sentence-transformer instance, loading it on first call."""
+    global _embedder
+    if _embedder is None:
+        _embedder = SentenceTransformer(EMBEDDER_MODEL)
+    return _embedder
+@dataclass(slots=True)
+class GradeResult:
+    """Outcome of a single L1 grader for one response."""
+    metric: str  # grader name, e.g. "pii_leakage"; key in GradeReport.summary
+    passed: bool  # whether the grader's pass condition was met
+    score: float  # grader-specific score; rounded to 3 decimals in summaries
+    detail: str = ""  # short human-readable explanation
+    metadata: dict = field(default_factory=dict)  # extra data; not part of summary
+@dataclass(slots=True)
+class GradeReport:
+    client: str
+    query: str
+    results: list[GradeResult] = field(default_factory=list)
+    @property
+    def overall(self) -> bool:
+        return all(r.passed for r in self.results)
+    @property
+    def summary(self) -> dict:
+        return {
+            "overall_pass": self.overall,
+            "metrics": {
+                r.metric: {"passed": r.passed, "score": round(r.score, 3), "detail": r.detail}
+                for r in self.results
+            },
+        }
+_PII_PATTERNS = [
+    (r"\b\d{3}-\d{2}-\d{4}\b", "SSN"),
+    (r"\b\d{16}\b", "credit card"),
+    (r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b", "email"),
+    (r"\b\d{3}[\s.\-]?\d{3}[\s.\-]?\d{4}\b", "phone"),
+]
+# Default ceiling for grade_token_budget, in approximate tokens.
+TOKEN_BUDGET = 512
+# Minimum query/response cosine similarity for answer_relevancy to pass.
+RELEVANCY_THRESHOLD = 0.45
+# Minimum judge score (together with faithful=true) for faithfulness to pass.
+FAITHFULNESS_THRESHOLD = 0.7
+FAITHFULNESS_PROMPT = """\
+You are a faithfulness evaluator. Your task is to assess whether an AI response
+is fully grounded in the provided context and contains no unsupported claims.
+Context:
+<context>
+{context}
+</context>
+Response to evaluate:
+<response>
+{response}
+</response>
+Rules:
+- A claim is faithful if it can be directly inferred from the context.
+- A claim is unfaithful if it introduces facts not present in the context.
+- Ignore stylistic differences; focus only on factual grounding.
+Respond with JSON only, no explanation outside the JSON:
+{{
+  "faithful": true | false,
+  "score": 0.0-1.0,
+  "unsupported_claims": ["claim1", "claim2"]
+}}"""
+def grade_pii_leakage(response: str) -> GradeResult:
+    """Scan response for PII patterns; fail on any match."""
+    found = [label for pattern, label in _PII_PATTERNS if re.search(pattern, response)]
+    return GradeResult(
+        metric="pii_leakage",
+        passed=not found,
+        score=0.0 if found else 1.0,
+        detail=f"Detected: {', '.join(found)}" if found else "Clean",
+    )
+def grade_token_budget(response: str, budget: int = TOKEN_BUDGET) -> GradeResult:
+    """Fail if estimated token count exceeds budget."""
+    approx_tokens = len(response) // 4
+    passed = approx_tokens <= budget
+    return GradeResult(
+        metric="token_budget",
+        passed=passed,
+        score=1.0 if passed else max(0.0, 1.0 - approx_tokens / budget),
+        detail=f"~{approx_tokens} tokens (budget: {budget})",
+        metadata={"approx_tokens": approx_tokens, "budget": budget},
+    )
+def grade_answer_relevancy(query: str, response: str) -> GradeResult:
+    """Score semantic similarity between query and response via cosine distance."""
+    embedder = get_embedder()
+    q_vec = embedder.encode([query])
+    r_vec = embedder.encode([response])
+    score = float(cosine_similarity(q_vec, r_vec)[0][0])
+    return GradeResult(
+        metric="answer_relevancy",
+        passed=score >= RELEVANCY_THRESHOLD,
+        score=score,
+        detail=f"Cosine {score:.3f} (threshold: {RELEVANCY_THRESHOLD})",
+    )
+def grade_faithfulness(
+    response: str,
+    context: str,
+    anthropic_client: anthropic.Anthropic,
+) -> GradeResult:
+    """Ask Claude to judge whether the response is grounded in retrieved context."""
+    prompt = FAITHFULNESS_PROMPT.format(context=context, response=response)
+    try:
+        message = anthropic_client.messages.create(
+            model="claude-haiku-4-5-20251001",
+            max_tokens=256,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        parsed = json.loads(message.content[0].text.strip())
+        score = float(parsed.get("score", 0.0))
+        unsupported = parsed.get("unsupported_claims", [])
+        passed = parsed.get("faithful", False) and score >= FAITHFULNESS_THRESHOLD
+        detail = f"Score {score:.2f}" + (f" — unsupported: {unsupported}" if unsupported else "")
+        return GradeResult(
+            metric="faithfulness",
+            passed=passed,
+            score=score,
+            detail=detail,
+            metadata={"unsupported_claims": unsupported},
+        )
+    except (json.JSONDecodeError, anthropic.APIError) as exc:
+        log.warning("Faithfulness grader failed: %s", exc)
+        return GradeResult(
+            metric="faithfulness",
+            passed=False,
+            score=0.0,
+            detail=f"Grader error: {exc}",
+        )
+def grade_chain_terminology(response: str, client: str) -> GradeResult:
+    """Check that the response uses client-specific terms, not rival terminology."""
+    result = check_terminology(response, client)
+    violations = result["violations"]
+    checked = result["checked"]
+    score = 1.0 - (len(violations) / checked) if checked else 1.0
+    detail = (
+        f"{len(violations)} violation(s): " +
+        ", ".join(f"{v['found']!r} → should be {v['expected']!r}" for v in violations)
+        if violations else f"All {checked} terms correct"
+    )
+    return GradeResult(
+        metric="chain_terminology",
+        passed=result["pass"],
+        score=score,
+        detail=detail,
+        metadata={"violations": violations},
+    )
+def grade(
+    query: str,
+    response: str,
+    context: str,
+    client: str,
+    anthropic_client: anthropic.Anthropic,
+    token_budget: int = TOKEN_BUDGET,
+) -> GradeReport:
+    """Run all L1 graders and return a consolidated report."""
+    report = GradeReport(client=client, query=query)
+    report.results = [
+        grade_pii_leakage(response),
+        grade_token_budget(response, token_budget),
+        grade_answer_relevancy(query, response),
+        grade_faithfulness(response, context, anthropic_client),
+        grade_chain_terminology(response, client),
+    ]
+    return report

backend/pipeline.py ADDED Viewed

	@@ -0,0 +1,152 @@

+"""
+RAG pipeline: retrieve → generate → grade.
+Retrieval: in-memory semantic search (sentence-transformers, encoded at first use per domain).
+Generation: Claude with retrieved context injected as grounding.
+Grading: L1 metrics via grader.py.
+"""
+import logging
+from dataclasses import dataclass, field
+import anthropic
+import numpy as np
+import yaml
+from sklearn.metrics.pairwise import cosine_similarity
+from sentence_transformers import SentenceTransformer
+from config import features_path, domain_for, DISPLAY_NAMES
+from grader import grade, GradeReport, get_embedder
+log = logging.getLogger(__name__)
+# Default number of top-ranked documents considered per query.
+TOP_K = 3
+# Candidates must score strictly above this cosine similarity to be kept.
+MIN_RETRIEVAL_SCORE = 0.1
+SYSTEM_PROMPT = """\
+You are a helpful assistant for {client_display} ({domain} domain).
+Answer the user's question using only the information in the provided context.
+Be concise. Use the terminology natural to {client_display} — do not use internal
+or competitor terminology. If the context does not contain enough information to
+answer, say so clearly rather than speculating."""
+@dataclass(slots=True)
+class RetrievedDoc:
+    """One knowledge-base document selected for a query."""
+    id: str  # document "id" field from the domain's features.yaml
+    title: str  # document title; used as the section header in the context
+    content: str  # document body injected into the context
+    score: float  # cosine similarity between the query and this document
+@dataclass(slots=True)
+class PipelineResult:
+    query: str
+    client: str
+    answer: str
+    retrieved_docs: list[RetrievedDoc]
+    grade_report: GradeReport
+    context_used: str
+    @property
+    def response_payload(self) -> dict:
+        return {
+            "query": self.query,
+            "client": self.client,
+            "client_display": DISPLAY_NAMES.get(self.client, self.client),
+            "answer": self.answer,
+            "sources": [
+                {"id": d.id, "title": d.title, "score": round(d.score, 3)}
+                for d in self.retrieved_docs
+            ],
+            "evaluation": self.grade_report.summary,
+        }
+@dataclass(slots=True)
+class KBIndex:
+    """In-memory search index for one domain's knowledge base."""
+    docs: list[dict]  # raw document dicts; "id", "title" and "content" are read
+    embeddings: np.ndarray  # row i is the embedding of docs[i]; presumably (len(docs), dim)
+_index_cache: dict[str, KBIndex] = {}
+def _build_index(domain: str, embedder: SentenceTransformer) -> KBIndex:
+    if domain not in _index_cache:
+        data = yaml.safe_load(features_path(domain).read_text())
+        docs = data["documents"]
+        texts = [f"{d['title']}. {d['content']}" for d in docs]
+        embeddings = embedder.encode(texts, show_progress_bar=False)
+        _index_cache[domain] = KBIndex(docs=docs, embeddings=np.array(embeddings))
+        log.info("Built KB index for domain=%s (%d docs)", domain, len(docs))
+    return _index_cache[domain]
+def _build_context(docs: list[RetrievedDoc]) -> str:
+    return "\n\n".join(f"[{d.title}]\n{d.content.strip()}" for d in docs)
+def _generate(
+    query: str,
+    context: str,
+    client: str,
+    domain: str,
+    anthropic_client: anthropic.Anthropic,
+) -> str:
+    system = SYSTEM_PROMPT.format(
+        client_display=DISPLAY_NAMES.get(client, client),
+        domain=domain,
+    )
+    response = anthropic_client.messages.create(
+        model="claude-haiku-4-5-20251001",
+        max_tokens=512,
+        system=system,
+        messages=[{"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}],
+    )
+    return response.content[0].text.strip()
+def run(
+    query: str,
+    client: str,
+    anthropic_client: anthropic.Anthropic,
+    top_k: int = TOP_K,
+) -> PipelineResult:
+    """Retrieve relevant KB docs, generate a grounded answer, and grade it."""
+    domain = domain_for(client)
+    embedder = get_embedder()
+    index = _build_index(domain, embedder)
+    q_vec = embedder.encode([query])
+    scores = cosine_similarity(q_vec, index.embeddings)[0]
+    top_indices = np.argsort(scores)[::-1][:top_k]
+    retrieved = [
+        RetrievedDoc(
+            id=index.docs[i]["id"],
+            title=index.docs[i]["title"],
+            content=index.docs[i]["content"],
+            score=float(scores[i]),
+        )
+        for i in top_indices
+        if scores[i] > MIN_RETRIEVAL_SCORE
+    ]
+    context = _build_context(retrieved)
+    answer = _generate(query, context, client, domain, anthropic_client)
+    report = grade(
+        query=query,
+        response=answer,
+        context=context,
+        client=client,
+        anthropic_client=anthropic_client,
+    )
+    return PipelineResult(
+        query=query,
+        client=client,
+        answer=answer,
+        retrieved_docs=retrieved,
+        grade_report=report,
+        context_used=context,
+    )

backend/rosetta.py ADDED Viewed

	@@ -0,0 +1,62 @@

+"""RosettaStone: canonical term -> client-specific term translation."""
+import yaml
+from functools import lru_cache
+from config import term_catalog_path, domain_for
+@lru_cache(maxsize=8)
+def _load_catalog(domain: str) -> dict[str, dict[str, str]]:
+    """Returns {client_id: {CANONICAL_KEY: "client term"}}."""
+    data = yaml.safe_load(term_catalog_path(domain).read_text())
+    return {
+        client_id: client_data["terms"]
+        for client_id, client_data in data["clients"].items()
+    }
+def translate(canonical_key: str, client: str) -> str | None:
+    """Return client-specific term for a canonical key, or None if not mapped."""
+    catalog = _load_catalog(domain_for(client))
+    return catalog.get(client, {}).get(canonical_key)
+def client_terms(client: str) -> dict[str, str]:
+    """Return full {CANONICAL_KEY: client_term} mapping for a client."""
+    catalog = _load_catalog(domain_for(client))
+    return dict(catalog.get(client, {}))
+def check_terminology(response_text: str, client: str) -> dict:
+    """
+    Deterministic chain_terminology check.
+    Flags cases where a rival client's term appears in the response for a
+    canonical key, without the correct client term also being present.
+    Returns:
+        {"pass": bool, "violations": [...], "checked": int}
+    """
+    catalog = _load_catalog(domain_for(client))
+    expected = catalog.get(client, {})
+    other_clients = {c: terms for c, terms in catalog.items() if c != client}
+    text_lower = response_text.lower()
+    violations = []
+    for canonical_key, client_term in expected.items():
+        client_term_present = client_term.lower() in text_lower
+        for other_terms in other_clients.values():
+            rival_term = other_terms.get(canonical_key, "")
+            if rival_term and rival_term.lower() in text_lower and not client_term_present:
+                violations.append({
+                    "canonical": canonical_key,
+                    "expected": client_term,
+                    "found": rival_term,
+                })
+    return {
+        "pass": len(violations) == 0,
+        "violations": violations,
+        "checked": len(expected),
+    }

eval/golden-dataset.yaml ADDED Viewed

	@@ -0,0 +1,341 @@

+# Golden dataset — 20 Q&A pairs for L2 batch evaluation
+# 10 retail (5 NovaMart / 5 ShelfWise) + 10 pharma (5 ClinixOne / 5 PharmaLink)
+#
+# Fields:
+#   id            — stable identifier
+#   domain        — retail | pharma
+#   client        — novamart | shelfwise | clinixone | pharmalink
+#   question      — natural-language query as a recruiter or end-user would type it
+#   expected_contains — keyphrases the correct answer must include (used by L2 metrics)
+#   expected_answer   — full reference answer for answer_correctness / answer_similarity
+#   notes         — what this pair is testing (for eval engineers)
+pairs:
+  # ── RETAIL · NovaMart ──────────────────────────────────────────────────
+  - id: retail-nm-001
+    domain: retail
+    client: novamart
+    question: "What happens when a product runs out of stock?"
+    expected_contains:
+      - availability scan
+      - low inventory signal
+      - reorder
+    expected_answer: >
+      When a product runs out of stock, an availability scan detects the shortfall
+      against the configured reorder threshold and triggers a low inventory signal.
+      The signal is routed to the responsible category manager and the supplying vendor.
+      If unacknowledged, it escalates to the regional operations lead after 24 hours.
+    notes: "Tests chain_terminology: must say 'availability scan' and 'low inventory signal', not 'stock check' or 'out-of-stock alert'."
+  - id: retail-nm-002
+    domain: retail
+    client: novamart
+    question: "How do I add a new supplier to the system?"
+    expected_contains:
+      - merchant onboarding
+      - legal entity name
+      - tax ID
+      - purchase order
+    expected_answer: >
+      To add a new supplier, complete the merchant onboarding process by registering
+      the vendor with their legal entity name, tax ID, payment terms, and primary contact.
+      Incomplete records will block purchase order creation until all mandatory fields
+      are validated.
+    notes: "Tests chain_terminology: 'merchant onboarding' not 'supplier setup'."
+  - id: retail-nm-003
+    domain: retail
+    client: novamart
+    question: "Can I turn on a new feature for just one region without deploying code?"
+    expected_contains:
+      - capability switch
+      - activation scope
+      - expiry date
+      - sign-off
+    expected_answer: >
+      Yes. A capability switch lets you enable or disable functionality per client,
+      region, or user segment without a code deployment. Each switch has an activation
+      scope and an expiry date to prevent flag debt. Enabling a switch in production
+      requires sign-off from both the product and engineering lead.
+    notes: "Tests chain_terminology: 'capability switch' not 'feature toggle' or 'feature flag'."
+  - id: retail-nm-004
+    domain: retail
+    client: novamart
+    question: "Where is the authoritative source for product information like SKU and category?"
+    expected_contains:
+      - item registry
+      - SKU
+      - archived
+      - 15 minutes
+    expected_answer: >
+      The item registry is the authoritative source for product attributes including
+      SKU, description, category hierarchy, dimensions, and active status.
+      Updates sync to downstream systems within 15 minutes. Deactivated products
+      remain as archived records and cannot be reactivated without manual review.
+    notes: "Tests chain_terminology: 'item registry' not 'product catalog'."
+  - id: retail-nm-005
+    domain: retail
+    client: novamart
+    question: "How are price changes handled and what needs approval?"
+    expected_contains:
+      - pricing sync
+      - 15%
+      - four hours
+      - escalation
+    expected_answer: >
+      Price changes are submitted as a pricing sync through the pricing portal.
+      Changes greater than 15% of the current price require approval. Approved
+      changes go live at the next sync window, which runs every four hours.
+      Emergency corrections outside the window require escalation to the pricing team.
+    notes: "Tests chain_terminology: 'pricing sync' not 'price update'."
+  # ── RETAIL · ShelfWise ───────────────────────────────────────────────
+  - id: retail-sw-001
+    domain: retail
+    client: shelfwise
+    question: "What triggers an out-of-stock alert?"
+    expected_contains:
+      - out-of-stock alert
+      - reorder point
+      - category manager
+      - 24 hours
+    expected_answer: >
+      An out-of-stock alert fires when a product's on-hand quantity drops below
+      its configured reorder point. It is routed simultaneously to the responsible
+      category manager and the supplying vendor. Unacknowledged alerts escalate
+      to the regional operations lead after 24 hours.
+    notes: "Tests chain_terminology: 'out-of-stock alert' not 'low inventory signal'."
+  - id: retail-sw-002
+    domain: retail
+    client: shelfwise
+    question: "How do we enable a feature for a subset of users?"
+    expected_contains:
+      - feature toggle
+      - activation scope
+      - expiry date
+      - engineering lead
+    expected_answer: >
+      Use a feature toggle to enable or disable functionality per client, region,
+      or user segment without a code deployment. Each toggle has an owner, an
+      activation scope, and an expiry date. Enabling in production requires
+      sign-off from the product and engineering lead.
+    notes: "Tests chain_terminology: 'feature toggle' not 'capability switch'."
+  - id: retail-sw-003
+    domain: retail
+    client: shelfwise
+    question: "What information is required to onboard a new supplier?"
+    expected_contains:
+      - supplier setup
+      - tax ID
+      - payment terms
+      - purchase order
+    expected_answer: >
+      Supplier setup requires the vendor's legal entity name, tax ID, payment terms,
+      and primary contact. Incomplete records block purchase order creation until
+      all mandatory fields are validated.
+    notes: "Tests chain_terminology: 'supplier setup' not 'merchant onboarding'."
+  - id: retail-sw-004
+    domain: retail
+    client: shelfwise
+    question: "How do compliance reports work and who can access them?"
+    expected_contains:
+      - compliance report
+      - immutable
+      - seven years
+      - Audit role
+    expected_answer: >
+      Compliance reports capture a timestamped record of system actions, user
+      decisions, and policy rule evaluations. They are immutable once generated
+      and stored for a minimum of seven years. Access is restricted to users
+      with the Audit role or higher.
+    notes: "Tests chain_terminology: 'compliance report' not 'audit trail'."
+  - id: retail-sw-005
+    domain: retail
+    client: shelfwise
+    question: "How quickly do product catalog updates reach downstream systems?"
+    expected_contains:
+      - product catalog
+      - 15 minutes
+      - event stream
+      - archived
+    expected_answer: >
+      Product catalog updates sync to all downstream systems within 15 minutes
+      via event stream. Deactivated products remain in the catalog as archived
+      records and cannot be reactivated without a manual review.
+    notes: "Tests chain_terminology: 'product catalog' not 'item registry'."
+  # ── PHARMA · ClinixOne ─────────────────────────────────────────────────
+  - id: pharma-cx-001
+    domain: pharma
+    client: clinixone
+    question: "What is prior authorization and how long does it take?"
+    expected_contains:
+      - prior authorization
+      - clinical justification
+      - 72 hours
+      - appeal
+    expected_answer: >
+      Prior authorization is a requirement by a payer that a prescriber obtain
+      approval before a specific drug is dispensed and covered. The prescriber
+      submits clinical justification and the payer responds within 72 hours for
+      standard requests or 24 hours for urgent cases. Denied requests can be
+      appealed once with additional clinical documentation.
+    notes: "Tests chain_terminology: 'prior authorization' not 'formulary pre-approval'."
+  - id: pharma-cx-002
+    domain: pharma
+    client: clinixone
+    question: "What is the difference between a generic name and a brand name?"
+    expected_contains:
+      - generic name
+      - brand name
+      - clinical guidelines
+      - authorization
+    expected_answer: >
+      The generic name is the active ingredient name — non-proprietary and used
+      in clinical guidelines and regulatory filings. Brand names are assigned by
+      manufacturers and appear in marketing materials and some payer formularies.
+      Substituting a brand drug with a generic requires explicit prescriber or
+      payer authorization in some jurisdictions.
+    notes: "Tests chain_terminology: ClinixOne uses 'generic name' as primary."
+  - id: pharma-cx-003
+    domain: pharma
+    client: clinixone
+    question: "When must an adverse event be reported to regulators?"
+    expected_contains:
+      - adverse event
+      - 15 days
+      - 90 days
+      - serious unexpected
+    expected_answer: >
+      Adverse events must be reported to the regulatory authority within 15 days
+      for serious unexpected events and within 90 days for expected events.
+      An adverse event is any undesirable medical occurrence in a patient
+      administered a medicinal product, regardless of causal relationship.
+    notes: "Tests chain_terminology: 'adverse event' not 'safety signal'. Key faithfulness test — specific numbers must be grounded in KB."
+  - id: pharma-cx-004
+    domain: pharma
+    client: clinixone
+    question: "What are the phases of a clinical trial?"
+    expected_contains:
+      - clinical trial
+      - Phase I
+      - Phase II
+      - Phase III
+      - inclusion
+    expected_answer: >
+      Clinical trials are classified by phase: Phase I tests safety in a small
+      cohort, Phase II assesses efficacy and side effects, and Phase III compares
+      against standard treatment at scale. Enrollment eligibility is defined by
+      inclusion and exclusion criteria in the protocol.
+    notes: "Tests chain_terminology: 'clinical trial' not 'investigational program'."
+  - id: pharma-cx-005
+    domain: pharma
+    client: clinixone
+    question: "What happens if a prescriber adjusts the dose outside the approved schedule?"
+    expected_contains:
+      - dose modification
+      - titration
+      - prior authorization
+      - documentation
+    expected_answer: >
+      A dose modification outside the approved titration schedule requires prescriber
+      documentation and may trigger a prior authorization review. Titration schedules
+      specify the starting dose, increment size, and minimum interval between increases.
+    notes: "Tests chain_terminology: 'dose modification' and 'prior authorization' for ClinixOne."
+  # ── PHARMA · PharmaLink ───────────────────────────────────────────────
+  - id: pharma-pl-001
+    domain: pharma
+    client: pharmalink
+    question: "How do I get a drug approved before dispensing?"
+    expected_contains:
+      - formulary pre-approval
+      - clinical justification
+      - 72 hours
+      - appeal
+    expected_answer: >
+      Submit a formulary pre-approval request with clinical justification. The payer
+      reviews against formulary criteria and responds within 72 hours for standard
+      requests or 24 hours for urgent cases. Denied requests can be appealed once
+      with additional clinical documentation.
+    notes: "Tests chain_terminology: 'formulary pre-approval' not 'prior authorization'."
+  - id: pharma-pl-002
+    domain: pharma
+    client: pharmalink
+    question: "What is a pharmacovigilance alert and when is it raised?"
+    expected_contains:
+      - pharmacovigilance alert
+      - pattern
+      - causal relationship
+      - regulatory authority
+    expected_answer: >
+      A pharmacovigilance alert is raised when a pattern of adverse events suggests
+      a previously unknown or incompletely documented causal relationship between a
+      drug and an outcome. Serious unexpected events must be reported to the
+      regulatory authority within 15 days.
+    notes: "Tests chain_terminology: 'pharmacovigilance alert' not 'safety signal' or 'adverse event'. Key cross-client terminology stress test."
+  - id: pharma-pl-003
+    domain: pharma
+    client: pharmalink
+    question: "What are the coverage tiers in the formulary?"
+    expected_contains:
+      - benefit tier
+      - Tier 1
+      - generics
+      - 60-day notice
+    expected_answer: >
+      The formulary organizes drugs into benefit tiers that determine patient
+      cost-sharing. Tier 1 is typically lowest cost and covers generics; higher
+      tiers carry higher copays. Moving a drug to a higher tier requires a formulary
+      committee review and a minimum 60-day notice to prescribers.
+    notes: "Tests chain_terminology: 'benefit tier' not 'coverage tier'."
+  - id: pharma-pl-004
+    domain: pharma
+    client: pharmalink
+    question: "What is a prescribing pathway and how often is it reviewed?"
+    expected_contains:
+      - prescribing pathway
+      - annually
+      - coverage decisions
+      - clinical rationale
+    expected_answer: >
+      A prescribing pathway is an evidence-based document specifying the recommended
+      sequence of therapies for a given condition. Pathways are reviewed annually and
+      updated when new efficacy or safety data emerges. Payers use pathway adherence
+      as a criterion in coverage decisions; deviation requires documented clinical
+      rationale.
+    notes: "Tests chain_terminology: 'prescribing pathway' not 'clinical guideline' or 'treatment protocol'."
+  - id: pharma-pl-005
+    domain: pharma
+    client: pharmalink
+    question: "What does enrollment authorization involve for a clinical study?"
+    expected_contains:
+      - enrollment authorization
+      - investigational program
+      - re-consent
+      - inclusion
+    expected_answer: >
+      Enrollment authorization is the process by which a patient receives and
+      acknowledges sufficient information about an investigational program to make
+      a voluntary decision. Consent must be obtained before any study procedure.
+      If the protocol changes materially, re-consent is required. Enrollment
+      eligibility is defined by inclusion and exclusion criteria in the protocol.
+    notes: "Tests chain_terminology: 'enrollment authorization' (not 'informed consent') and 'investigational program' (not 'clinical trial')."

eval/metrics.py ADDED Viewed

File without changes

knowledge/pharma/features.yaml ADDED Viewed

	@@ -0,0 +1,98 @@

+# Pharma domain — knowledge base documents
+# Retrieved by the RAG pipeline; the faithfulness check grounds answers against these documents
+# Each entry: id, title, content (2-4 sentences, retrieval-friendly), tags
+documents:
+  - id: pharma_001
+    title: "Prior Authorization and Formulary Pre-Approval"
+    content: >
+      Prior authorization (formulary pre-approval) is a requirement by a payer that a
+      prescriber obtain approval before a specific drug is dispensed and covered.
+      The prescriber submits clinical justification; the payer reviews against formulary
+      criteria and responds within 72 hours for standard requests or 24 hours for urgent cases.
+      Denied requests can be appealed once with additional clinical documentation.
+    tags: [prior-auth, formulary, coverage]
+  - id: pharma_002
+    title: "Generic vs Brand Drug Names"
+    content: >
+      Every approved drug has a generic name (the active ingredient, non-proprietary) and
+      one or more brand names assigned by manufacturers.
+      Generic names are used in clinical guidelines and regulatory filings; brand names
+      appear in marketing materials and some payer formularies.
+      Substituting a brand drug with a generic equivalent requires explicit prescriber
+      or payer authorization in some jurisdictions.
+    tags: [drug-name, generic, brand]
+  - id: pharma_003
+    title: "Adverse Events and Safety Signals"
+    content: >
+      An adverse event is any undesirable medical occurrence in a patient administered
+      a medicinal product, regardless of causal relationship.
+      A safety signal (pharmacovigilance alert) is a pattern of adverse events that
+      suggests a previously unknown or incompletely documented causal relationship
+      between a drug and an outcome.
+      Adverse events must be reported to the regulatory authority within 15 days for
+      serious unexpected events and 90 days for expected events.
+    tags: [adverse-event, safety, pharmacovigilance]
+  - id: pharma_004
+    title: "Drug-Drug Interactions and Contraindications"
+    content: >
+      A drug-drug interaction (contraindication) occurs when one drug affects the activity
+      of another when both are administered together.
+      Interactions range from minor (monitoring recommended) to contraindicated (combination
+      must not be used). Severity classifications follow the clinical pharmacology guidelines
+      maintained in the formulary interaction database.
+      Prescribers are alerted at point-of-care when a contraindicated combination is entered.
+    tags: [drug-interaction, contraindication, safety]
+  - id: pharma_005
+    title: "Clinical Guidelines and Prescribing Pathways"
+    content: >
+      A treatment protocol (clinical guideline / prescribing pathway) is an evidence-based
+      document specifying the recommended sequence of therapies for a given condition.
+      Pathways are reviewed annually and updated when new efficacy or safety data emerges.
+      Payers use pathway adherence as a criterion in coverage decisions; deviation requires
+      documented clinical rationale.
+    tags: [protocol, guideline, treatment]
+  - id: pharma_006
+    title: "Formulary Coverage Tiers"
+    content: >
+      A formulary is a list of drugs covered by a payer, organized into tiers (benefit tiers)
+      that determine patient cost-sharing. Tier 1 is typically lowest cost (generics);
+      higher tiers carry higher copays. Moving a drug to a higher tier requires
+      a formulary committee review and a minimum 60-day notice to prescribers.
+    tags: [formulary, coverage, tier]
+  - id: pharma_007
+    title: "Dosage Adjustment and Titration"
+    content: >
+      A dosage adjustment (dose modification / titration step) is a change to a patient's
+      prescribed dose based on clinical response, tolerability, renal or hepatic function,
+      or drug interaction. Titration schedules specify the starting dose, increment size,
+      and minimum interval between increases. Adjustments outside the approved titration
+      schedule require prescriber documentation and may trigger a prior authorization review.
+    tags: [dosage, titration, dose]
+  - id: pharma_008
+    title: "Patient Consent and Enrollment Authorization"
+    content: >
+      Informed consent (enrollment authorization) is the process by which a patient
+      receives and acknowledges sufficient information about a treatment or study
+      to make a voluntary decision. For clinical trials, consent must be obtained
+      before any study procedure. Consent forms are version-controlled; if the
+      protocol changes materially, re-consent is required.
+    tags: [consent, enrollment, patient]
+  - id: pharma_009
+    title: "Clinical Trials and Investigational Programs"
+    content: >
+      A clinical trial (investigational program) is a structured study that evaluates
+      the safety or efficacy of a drug, device, or intervention in human subjects.
+      Trials are classified by phase: Phase I tests safety in a small cohort,
+      Phase II assesses efficacy and side effects, Phase III compares against
+      standard treatment at scale. Enrollment eligibility is defined by inclusion
+      and exclusion criteria in the protocol.
+    tags: [clinical-trial, study, investigational]

knowledge/pharma/term-catalog.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+# Pharma domain — client-specific terminology map
+# canonical term -> what each client calls it
+# This is what the chain_terminology metric validates against
+clients:
+  clinixone:
+    display_name: "ClinixOne"
+    terms:
+      DRUG_APPROVAL: "prior authorization"
+      DRUG_NAME: "generic name"
+      ADVERSE_EVENT: "adverse event"
+      DRUG_INTERACTION: "contraindication"
+      TREATMENT_PROTOCOL: "clinical guideline"
+      FORMULARY_STATUS: "coverage tier"
+      DOSAGE_ADJUSTMENT: "dose modification"
+      SAFETY_SIGNAL: "safety signal"
+      PATIENT_CONSENT: "informed consent"
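+      CLINICAL_TRIAL: "study enrollment"  # NOTE(review): golden-dataset notes appear to expect "clinical trial" for ClinixOne — confirm which term is intended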
+  pharmalink:
+    display_name: "PharmaLink"
+    terms:
+      DRUG_APPROVAL: "formulary pre-approval"
+      DRUG_NAME: "brand name"
+      ADVERSE_EVENT: "safety signal"
+      DRUG_INTERACTION: "drug-drug interaction"
+      TREATMENT_PROTOCOL: "prescribing pathway"
+      FORMULARY_STATUS: "benefit tier"
+      DOSAGE_ADJUSTMENT: "titration step"
+      SAFETY_SIGNAL: "pharmacovigilance alert"
+      PATIENT_CONSENT: "enrollment authorization"
+      CLINICAL_TRIAL: "investigational program"

knowledge/retail/features.yaml ADDED Viewed

	@@ -0,0 +1,78 @@

+# Retail domain — knowledge base documents
+# Retrieved by the RAG pipeline; the faithfulness check grounds answers against these documents
+# Each entry: id, title, content (2-4 sentences, retrieval-friendly), tags
+documents:
+  - id: retail_001
+    title: "Stock Check Process"
+    content: >
+      A stock check queries real-time inventory levels for a given product and location.
+      Results include current quantity on hand, reorder threshold, and last updated timestamp.
+      If quantity falls below threshold, an out-of-stock alert is automatically triggered.
+      Stock checks can be initiated manually or scheduled on a recurring basis.
+    tags: [inventory, stock, availability]
+  - id: retail_002
+    title: "Supplier Setup and Onboarding"
+    content: >
+      Supplier setup is the process of registering a new vendor in the system before
+      products can be sourced or orders placed. Required fields include legal entity name,
+      tax ID, payment terms, and primary contact. Incomplete supplier records block
+      purchase order creation until all mandatory fields are validated.
+    tags: [supplier, vendor, onboarding]
+  - id: retail_003
+    title: "Compliance Reporting"
+    content: >
+      Compliance reports capture a timestamped record of system actions, user decisions,
+      and policy rule evaluations for regulatory and internal audit purposes.
+      Reports are immutable once generated and stored for a minimum of seven years.
+      Access is restricted to users with the Audit role or higher.
+    tags: [compliance, audit, reporting]
+  - id: retail_004
+    title: "Feature Flags and Capability Switches"
+    content: >
+      Feature flags (also called capability switches) enable or disable product functionality
+      per client, region, or user segment without a code deployment.
+      Each flag has an owner, an activation scope, and an expiry date to prevent flag debt.
+      Enabling a flag in production requires sign-off from both the product and engineering lead.
+    tags: [feature-flags, configuration, rollout]
+  - id: retail_005
+    title: "Product Catalog Management"
+    content: >
+      The product catalog (item registry) is the authoritative source of product attributes
+      including SKU, description, category hierarchy, dimensions, and active status.
+      Catalog updates sync to all downstream systems within 15 minutes via event stream.
+      Deactivated products remain in the catalog as archived records and cannot be reactivated
+      without a manual review.
+    tags: [catalog, products, SKU]
+  - id: retail_006
+    title: "Price Update Workflow"
+    content: >
+      Price updates (pricing syncs) must be submitted through the pricing portal and require
+      approval for changes greater than 15% of the current price.
+      Approved changes go live at the next scheduled sync window, which runs every four hours.
+      Emergency price corrections outside the sync window require escalation to the pricing team.
+    tags: [pricing, price-update, workflow]
+  - id: retail_007
+    title: "Store Configuration"
+    content: >
+      Each store location has a configuration profile (location profile) that defines
+      operating hours, supported payment methods, fulfillment capabilities, and
+      regional compliance rules. Configuration changes take effect at store open
+      on the following business day. Misconfigured stores are flagged in the daily
+      operations health report.
+    tags: [store, configuration, location]
+  - id: retail_008
+    title: "Low Inventory Alerts"
+    content: >
+      A low inventory signal (out-of-stock alert) fires when a product's on-hand quantity
+      drops below its configured reorder point. Alerts are routed to the responsible
+      category manager and the supplying vendor simultaneously. Unacknowledged alerts
+      escalate to the regional operations lead after 24 hours.
+    tags: [inventory, alerts, stock]

knowledge/retail/term-catalog.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+# Retail domain — client-specific terminology map
+# canonical term -> what each client calls it
+# This is what the chain_terminology metric validates against
+clients:
+  novamart:
+    display_name: "NovaMart"
+    terms:
+      STOCK_CHECK: "availability scan"
+      SUPPLIER_SETUP: "merchant onboarding"
+      COMPLIANCE_REPORT: "audit trail"
+      FEATURE_FLAG: "capability switch"
+      STOCK_ALERT: "low inventory signal"
+      PRODUCT_CATALOG: "item registry"
+      PRICE_UPDATE: "pricing sync"
+      STORE_CONFIG: "location profile"
+  shelfwise:
+    display_name: "ShelfWise"
+    terms:
+      STOCK_CHECK: "stock check"
+      SUPPLIER_SETUP: "supplier setup"
+      COMPLIANCE_REPORT: "compliance report"
+      FEATURE_FLAG: "feature toggle"
+      STOCK_ALERT: "out-of-stock alert"
+      PRODUCT_CATALOG: "product catalog"
+      PRICE_UPDATE: "price update"
+      STORE_CONFIG: "store configuration"

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+anthropic>=0.40.0
+fastapi>=0.115.0
+uvicorn[standard]>=0.30.0
+pyyaml>=6.0
+sentence-transformers>=3.0.0
+scikit-learn>=1.5.0
+numpy>=1.26.0
+python-multipart>=0.0.9

ui/app.js ADDED Viewed

	@@ -0,0 +1,235 @@

+const API = '';  // same origin
+let state = {
+  domain: null,  // key of the currently selected domain (set by selectDomain)
+  client: null,  // id of the currently selected client (set by selectClient)
+  domains: {},  // domain key -> array of clients ({ id, display }), loaded from /config in boot()
+  loading: false,  // true while a query is in flight (set by setLoading)
+};
+// ── Boot ──────────────────────────────────────────────────────────────────
+async function boot() {
+  const res = await fetch(`${API}/config`);
+  const data = await res.json();
+  state.domains = data.domains;
+  const firstDomain = Object.keys(data.domains)[0];
+  renderDomainSwitcher();
+  selectDomain(firstDomain);
+  document.getElementById('send-btn').addEventListener('click', handleSend);
+  document.getElementById('query-input').addEventListener('keydown', e => {
+    if (e.key === 'Enter' && !e.shiftKey) handleSend();
+  });
+}
+// ── Switchers ─────────────────────────────────────────────────────────────
+function renderDomainSwitcher() {
+  const el = document.getElementById('domain-switcher');
+  el.innerHTML = Object.keys(state.domains).map(d => `
+    <button data-domain="${d}" onclick="selectDomain('${d}')">${capitalize(d)}</button>
+  `).join('');
+}
+function selectDomain(domain) {
+  state.domain = domain;
+  document.querySelectorAll('#domain-switcher button').forEach(b => {
+    b.classList.toggle('active', b.dataset.domain === domain);
+  });
+  const clients = state.domains[domain];
+  const el = document.getElementById('client-switcher');
+  el.innerHTML = clients.map(c => `
+    <button data-client="${c.id}" onclick="selectClient('${c.id}')">${c.display}</button>
+  `).join('');
+  selectClient(clients[0].id);
+}
+function selectClient(clientId) {
+  state.client = clientId;
+  document.querySelectorAll('#client-switcher button').forEach(b => {
+    b.classList.toggle('active', b.dataset.client === clientId);
+  });
+}
+// ── Send ──────────────────────────────────────────────────────────────────
+async function handleSend() {
+  const input = document.getElementById('query-input');
+  const query = input.value.trim();
+  if (!query || state.loading) return;
+  input.value = '';
+  setLoading(true);
+  appendMessage('user', query);
+  const thinkingEl = appendThinking();
+  try {
+    const res = await fetch(`${API}/query`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ query, client: state.client }),
+    });
+    if (!res.ok) {
+      const err = await res.json().catch(() => ({ detail: res.statusText }));
+      throw new Error(err.detail || 'Request failed');
+    }
+    const data = await res.json();
+    thinkingEl.remove();
+    appendBotMessage(data);
+    renderEval(data);
+  } catch (err) {
+    thinkingEl.remove();
+    appendMessage('bot', `Error: ${err.message}`);
+  } finally {
+    setLoading(false);
+  }
+}
+// ── Messages ──────────────────────────────────────────────────────────────
+function appendMessage(role, text) {
+  const el = document.createElement('div');
+  el.className = `message ${role}`;
+  el.innerHTML = `
+    <div class="bubble">${escapeHtml(text)}</div>
+    <div class="meta">${role === 'user' ? 'You' : 'Bot'}</div>
+  `;
+  getMessages().appendChild(el);
+  scrollMessages();
+  return el;
+}
+function appendBotMessage(data) {
+  const overall = data.evaluation.overall_pass;
+  const verdictClass = overall ? 'pass' : 'fail';
+  const verdictLabel = overall ? '✓ All checks passed' : '✗ Checks failed';
+  const el = document.createElement('div');
+  el.className = 'message bot';
+  el.innerHTML = `
+    <div class="bubble">${escapeHtml(data.answer)}</div>
+    <div class="verdict ${verdictClass}">${verdictLabel}</div>
+    <div class="meta">${data.client_display}</div>
+  `;
+  getMessages().appendChild(el);
+  scrollMessages();
+}
+function appendThinking() {
+  const wrap = document.createElement('div');
+  wrap.className = 'message bot';
+  wrap.innerHTML = `
+    <div class="thinking">
+      <span></span><span></span><span></span>
+    </div>
+  `;
+  getMessages().appendChild(wrap);
+  scrollMessages();
+  return wrap;
+}
+// ── Eval panel ────────────────────────────────────────────────────────────
+const METRIC_LABELS = {  // metric key -> display name shown in each metric card header
+  pii_leakage:        'PII Leakage',
+  token_budget:       'Token Budget',
+  answer_relevancy:   'Answer Relevancy',
+  faithfulness:       'Faithfulness',
+  chain_terminology:  'Chain Terminology',
+};
+const METRIC_DESC = {  // metric key -> one-line description shown under the card header
+  pii_leakage:       'Regex scan — no PII in response',
+  token_budget:      'Response within token ceiling',
+  answer_relevancy:  'Cosine similarity: query ↔ response',
+  faithfulness:      'Claude judge: grounded in retrieved context?',
+  chain_terminology: 'Deterministic: client-specific terms used',
+};
+function renderEval(data) {
+  const metrics = data.evaluation.metrics;
+  const sources = data.sources;
+  const metricCards = Object.entries(metrics).map(([key, m]) => {
+    const cls = scoreClass(m.score, key);
+    const pct = Math.round(m.score * 100);
+    return `
+      <div class="metric-card ${cls}">
+        <div class="metric-header">
+          <span class="metric-name">${METRIC_LABELS[key] || key}</span>
+          <span class="score-badge ${cls}">${pct}%</span>
+        </div>
+        <div class="metric-detail">${escapeHtml(METRIC_DESC[key] || '')}</div>
+        <div class="metric-detail" style="margin-top:4px;color:#6a8aaa">${escapeHtml(m.detail)}</div>
+        <div class="score-bar-wrap">
+          <div class="score-bar-bg">
+            <div class="score-bar-fill ${cls}" style="width:${pct}%"></div>
+          </div>
+        </div>
+      </div>
+    `;
+  }).join('');
+  const sourceItems = sources.map(s => `
+    <div class="source-item">
+      <span class="source-title">${escapeHtml(s.title)}</span>
+      <span class="source-score">${(s.score * 100).toFixed(0)}%</span>
+    </div>
+  `).join('');
+  document.getElementById('eval-body').innerHTML = `
+    <div class="eval-content">
+      ${metricCards}
+      <div class="sources-section">
+        <div class="sources-label">Retrieved Sources</div>
+        ${sourceItems || '<div style="font-size:11px;color:#8aabcc">No sources retrieved</div>'}
+      </div>
+    </div>
+  `;
+}
+function scoreClass(score, metric) {
+  // pii_leakage: 1.0 = pass, anything else = fail (binary)
+  if (metric === 'pii_leakage') return score === 1.0 ? 'pass' : 'fail';
+  if (score >= 0.75) return 'pass';
+  if (score >= 0.45) return 'warn';
+  return 'fail';
+}
+// ── Helpers ───────────────────────────────────────────────────────────────
+function setLoading(val) {
+  state.loading = val;
+  document.getElementById('send-btn').disabled = val;
+  document.getElementById('query-input').disabled = val;
+}
+function getMessages() {
+  return document.getElementById('messages');
+}
+function scrollMessages() {
+  const el = getMessages();
+  el.scrollTop = el.scrollHeight;
+}
+function capitalize(s) {
+  return s.charAt(0).toUpperCase() + s.slice(1);
+}
+function escapeHtml(str) {
+  return String(str)
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;');
+}
+boot();

ui/index.html ADDED Viewed

	@@ -0,0 +1,410 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>AI Response Validator</title>
+  <link rel="preconnect" href="https://fonts.googleapis.com">
+  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800;900&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
+  <style>
+    * { margin: 0; padding: 0; box-sizing: border-box; }
+    body {
+      font-family: 'Inter', sans-serif;
+      background: #eef4fc;
+      color: #1a1a1a;
+      height: 100vh;
+      display: grid;
+      grid-template-rows: auto 1fr;
+      overflow: hidden;
+    }
+    /* ── Header ── */
+    header {
+      background: #fff;
+      border-bottom: 2px solid #1e3a5f;
+      padding: 14px 28px;
+      display: flex;
+      align-items: center;
+      justify-content: space-between;
+      gap: 24px;
+    }
+    .header-left h1 {
+      font-size: 22px;
+      font-weight: 900;
+      color: #1a1a1a;
+      letter-spacing: -0.5px;
+    }
+    .header-left h1 span { color: #3a6ea8; }
+    .header-left .tagline {
+      font-size: 11px;
+      color: #8aabcc;
+      margin-top: 2px;
+    }
+    /* ── Domain / Client switcher ── */
+    .switcher {
+      display: flex;
+      align-items: center;
+      gap: 10px;
+      flex-wrap: wrap;
+    }
+    .switcher label {
+      font-size: 10px;
+      font-weight: 700;
+      text-transform: uppercase;
+      letter-spacing: 1.5px;
+      color: #8aabcc;
+    }
+    .btn-group {
+      display: flex;
+      border: 1px solid #c8dff5;
+      border-radius: 5px;
+      overflow: hidden;
+    }
+    .btn-group button {
+      background: #fff;
+      border: none;
+      border-right: 1px solid #c8dff5;
+      padding: 6px 14px;
+      font-size: 12px;
+      font-weight: 600;
+      color: #4a6a8a;
+      cursor: pointer;
+      transition: background 0.15s, color 0.15s;
+    }
+    .btn-group button:last-child { border-right: none; }
+    .btn-group button.active {
+      background: #1e3a5f;
+      color: #fff;
+    }
+    .btn-group button:hover:not(.active) { background: #eef4fc; }
+    .divider-v {
+      width: 1px;
+      height: 28px;
+      background: #c8dff5;
+    }
+    /* ── Main layout ── */
+    main {
+      display: grid;
+      grid-template-columns: 1fr 360px;
+      overflow: hidden;
+    }
+    /* ── Chat panel ── */
+    .chat-panel {
+      display: flex;
+      flex-direction: column;
+      border-right: 1px solid #c8dff5;
+      overflow: hidden;
+    }
+    .messages {
+      flex: 1;
+      overflow-y: auto;
+      padding: 24px 28px;
+      display: flex;
+      flex-direction: column;
+      gap: 16px;
+    }
+    .message {
+      display: flex;
+      flex-direction: column;
+      gap: 4px;
+      max-width: 80%;
+    }
+    .message.user { align-self: flex-end; }
+    .message.bot  { align-self: flex-start; }
+    .message .bubble {
+      padding: 12px 16px;
+      border-radius: 8px;
+      font-size: 13.5px;
+      line-height: 1.6;
+    }
+    .message.user .bubble {
+      background: #1e3a5f;
+      color: #fff;
+      border-radius: 8px 8px 2px 8px;
+    }
+    .message.bot .bubble {
+      background: #fff;
+      color: #1a1a1a;
+      border: 1px solid #c8dff5;
+      border-radius: 8px 8px 8px 2px;
+    }
+    .message .meta {
+      font-size: 10px;
+      color: #8aabcc;
+      padding: 0 4px;
+    }
+    .message.user .meta { text-align: right; }
+    /* overall pass/fail badge on bot message */
+    .verdict {
+      display: inline-flex;
+      align-items: center;
+      gap: 5px;
+      font-size: 10px;
+      font-weight: 700;
+      padding: 2px 8px;
+      border-radius: 3px;
+      margin-top: 4px;
+      align-self: flex-start;
+    }
+    .verdict.pass { background: #f1f8f1; color: #2e7d32; border: 1px solid #c8e6c9; }
+    .verdict.fail { background: #fdf1f1; color: #c62828; border: 1px solid #ffcdd2; }
+    .verdict.warn { background: #fffbf0; color: #a06000; border: 1px solid #ffe082; }
+    /* ── Input bar ── */
+    .input-bar {
+      padding: 16px 28px;
+      background: #fff;
+      border-top: 1px solid #c8dff5;
+      display: flex;
+      gap: 10px;
+    }
+    .input-bar input {
+      flex: 1;
+      padding: 10px 14px;
+      border: 1px solid #c8dff5;
+      border-radius: 6px;
+      font-size: 13.5px;
+      font-family: 'Inter', sans-serif;
+      outline: none;
+      transition: border-color 0.15s;
+    }
+    .input-bar input:focus { border-color: #3a6ea8; }
+    .input-bar input:disabled { background: #f5f9ff; color: #8aabcc; }
+    .input-bar button {
+      padding: 10px 20px;
+      background: #1e3a5f;
+      color: #fff;
+      border: none;
+      border-radius: 6px;
+      font-size: 13px;
+      font-weight: 700;
+      cursor: pointer;
+      transition: background 0.15s;
+      white-space: nowrap;
+    }
+    .input-bar button:hover:not(:disabled) { background: #3a6ea8; }
+    .input-bar button:disabled { background: #93b8d8; cursor: not-allowed; }
+    /* ── Eval panel ── */
+    .eval-panel {
+      background: #fff;
+      overflow-y: auto;
+      display: flex;
+      flex-direction: column;
+    }
+    .eval-panel .panel-header {
+      padding: 16px 20px 12px;
+      border-bottom: 1px solid #e8f2ff;
+      font-size: 10px;
+      font-weight: 800;
+      text-transform: uppercase;
+      letter-spacing: 2px;
+      color: #8aabcc;
+      position: sticky;
+      top: 0;
+      background: #fff;
+      z-index: 1;
+    }
+    .eval-empty {
+      flex: 1;
+      display: flex;
+      flex-direction: column;
+      align-items: center;
+      justify-content: center;
+      gap: 10px;
+      color: #b0cce8;
+      padding: 40px 20px;
+      text-align: center;
+    }
+    .eval-empty .icon { font-size: 36px; }
+    .eval-empty p { font-size: 12px; line-height: 1.6; }
+    .eval-content { padding: 16px 20px; display: flex; flex-direction: column; gap: 20px; }
+    /* Metric card */
+    .metric-card {
+      border: 1px solid #e0eef8;
+      border-left: 3px solid #1e3a5f;
+      border-radius: 0 6px 6px 0;
+      padding: 12px 14px;
+      background: #f5f9ff;
+    }
+    .metric-card.pass { border-left-color: #4caf50; background: #f0faf3; }
+    .metric-card.fail { border-left-color: #c62828; background: #fdf5f5; }
+    .metric-card.warn { border-left-color: #f9a825; background: #fffdf0; }
+    .metric-card .metric-header {
+      display: flex;
+      justify-content: space-between;
+      align-items: center;
+      margin-bottom: 6px;
+    }
+    .metric-card .metric-name {
+      font-size: 12px;
+      font-weight: 800;
+      color: #1e3a5f;
+      font-family: 'JetBrains Mono', monospace;
+    }
+    /* Metric name follows the card state; warn uses the same amber as the warn badge. */
+    .metric-card.pass .metric-name { color: #2e7d32; }
+    .metric-card.fail .metric-name { color: #c62828; }
+    .metric-card.warn .metric-name { color: #a06000; }
+    .score-badge {
+      font-family: 'JetBrains Mono', monospace;
+      font-size: 11px;
+      font-weight: 700;
+      padding: 2px 8px;
+      border-radius: 3px;
+      border: 1px solid;
+    }
+    .score-badge.pass { background: #f1f8f1; color: #2e7d32; border-color: #c8e6c9; }
+    .score-badge.fail { background: #fdf1f1; color: #c62828; border-color: #ffcdd2; }
+    .score-badge.warn { background: #fffbf0; color: #a06000; border-color: #ffe082; }
+    .metric-card .metric-detail {
+      font-size: 11px;
+      color: #4a6080;
+      line-height: 1.5;
+    }
+    /* Score bar */
+    .score-bar-wrap { margin-top: 8px; }
+    .score-bar-bg {
+      height: 4px;
+      background: #e0eef8;
+      border-radius: 2px;
+      overflow: hidden;
+    }
+    .score-bar-fill {
+      height: 100%;
+      border-radius: 2px;
+      transition: width 0.4s ease;
+    }
+    .score-bar-fill.pass { background: #4caf50; }
+    .score-bar-fill.fail { background: #c62828; }
+    .score-bar-fill.warn { background: #f9a825; }
+    /* Sources */
+    .sources-section .sources-label {
+      font-size: 10px;
+      font-weight: 700;
+      text-transform: uppercase;
+      letter-spacing: 1.5px;
+      color: #8aabcc;
+      margin-bottom: 8px;
+    }
+    .source-item {
+      display: flex;
+      justify-content: space-between;
+      align-items: center;
+      padding: 7px 10px;
+      background: #f5f9ff;
+      border: 1px solid #e0eef8;
+      border-radius: 5px;
+      margin-bottom: 5px;
+      font-size: 11.5px;
+    }
+    .source-item .source-title { color: #2a4a6a; font-weight: 500; }
+    .source-item .source-score {
+      font-family: 'JetBrains Mono', monospace;
+      font-size: 10px;
+      color: #8aabcc;
+    }
+    /* Thinking indicator */
+    .thinking {
+      display: flex;
+      gap: 5px;
+      align-items: center;
+      padding: 12px 16px;
+      background: #fff;
+      border: 1px solid #c8dff5;
+      border-radius: 8px 8px 8px 2px;
+      width: fit-content;
+    }
+    .thinking span {
+      width: 7px; height: 7px;
+      background: #3a6ea8;
+      border-radius: 50%;
+      animation: bounce 1.2s infinite ease-in-out;
+    }
+    .thinking span:nth-child(2) { animation-delay: 0.2s; }
+    .thinking span:nth-child(3) { animation-delay: 0.4s; }
+    @keyframes bounce {
+      0%, 80%, 100% { transform: scale(0.6); opacity: 0.4; }
+      40% { transform: scale(1); opacity: 1; }
+    }
+    /* Scrollbar */
+    ::-webkit-scrollbar { width: 5px; }
+    ::-webkit-scrollbar-track { background: transparent; }
+    ::-webkit-scrollbar-thumb { background: #c8dff5; border-radius: 3px; }
+  </style>
+</head>
+<body>
+<header>
+  <div class="header-left">
+    <h1>AI Response <span>Validator</span></h1>
+    <div class="tagline">Domain-agnostic RAG evaluation · real-time L1 metrics · RosettaStone terminology</div>
+  </div>
+  <div class="switcher">
+    <label>Domain</label>
+    <div class="btn-group" id="domain-switcher"></div>
+    <div class="divider-v"></div>
+    <label>Client</label>
+    <div class="btn-group" id="client-switcher"></div>
+  </div>
+</header>
+<main>
+  <div class="chat-panel">
+    <div class="messages" id="messages">
+      <!-- populated by app.js -->
+    </div>
+    <div class="input-bar">
+      <!-- aria-label gives the field an accessible name; a placeholder alone is not one -->
+      <input
+        type="text"
+        id="query-input"
+        placeholder="Ask something…"
+        aria-label="Question"
+        autocomplete="off"
+      />
+      <button id="send-btn">Send</button>
+    </div>
+  </div>
+  <div class="eval-panel">
+    <div class="panel-header">Evaluation</div>
+    <div id="eval-body">
+      <div class="eval-empty">
+        <div class="icon">◎</div>
+        <p>Send a message to see<br>real-time metric evaluation.</p>
+      </div>
+    </div>
+  </div>
+</main>
+<script src="/static/app.js"></script>
+</body>
+</html>