mbochniak01 committed on
Commit ·
ebe934f
1
Parent(s): b917936
Add full RAG evaluation pipeline with L1 metrics and UI
Browse files
- FastAPI backend: retrieve → generate → grade pipeline
- In-memory semantic retrieval (sentence-transformers, all-MiniLM-L6-v2)
- L1 graders: pii_leakage, token_budget, answer_relevancy, faithfulness, chain_terminology
- RosettaStone: deterministic client-specific terminology validation
- Two domains (retail, pharma) with two fictional clients each
- Plain HTML/JS frontend with real-time metric panel
- 20-pair golden dataset for L2 batch evaluation
- Docker config for HF Spaces (port 7860)
- ARCHITECTURE.md +0 -0
- Dockerfile +26 -0
- README.md +17 -4
- backend/app.py +92 -0
- backend/config.py +36 -0
- backend/grader.py +215 -0
- backend/pipeline.py +152 -0
- backend/rosetta.py +62 -0
- eval/golden-dataset.yaml +341 -0
- eval/metrics.py +0 -0
- knowledge/pharma/features.yaml +98 -0
- knowledge/pharma/term-catalog.yaml +32 -0
- knowledge/retail/features.yaml +78 -0
- knowledge/retail/term-catalog.yaml +28 -0
- requirements.txt +8 -0
- ui/app.js +235 -0
- ui/index.html +410 -0
ARCHITECTURE.md
ADDED
|
File without changes
|
Dockerfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# Hugging Face Docker Spaces run the container as UID 1000, not root. Create
# that user up front so anything written at build time (notably the model
# cache below) is readable and writable by the runtime user.
RUN useradd --create-home --uid 1000 user

WORKDIR /app

# System deps for sentence-transformers (tokenizers uses Rust bindings)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Everything from here on runs as the unprivileged runtime user, and the
# Hugging Face cache lives in that user's home directory.
USER user
ENV HOME=/home/user \
    HF_HOME=/home/user/.cache/huggingface

# Pre-download the embedding model so first request isn't slow on HF Spaces
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

COPY --chown=user:user knowledge/ ./knowledge/
COPY --chown=user:user backend/ ./backend/
COPY --chown=user:user ui/ ./ui/

WORKDIR /app/backend

# HF Spaces requires port 7860
ENV PORT=7860
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,23 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: blue
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: AI Response Validator
|
| 3 |
+
emoji: 🔍
|
| 4 |
colorFrom: blue
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# AI Response Validator
|
| 12 |
+
|
| 13 |
+
Domain-agnostic RAG evaluation system. Validates AI responses for correctness,
|
| 14 |
+
faithfulness, and client-specific terminology across retail and pharma domains.
|
| 15 |
+
|
| 16 |
+
**Live demo:** select a domain and client, then ask a question in natural language.
|
| 17 |
+
Each response is evaluated in real time across 5 metrics:
|
| 18 |
+
|
| 19 |
+
- **PII Leakage** — regex scan, no personal data in responses
|
| 20 |
+
- **Token Budget** — response within ceiling
|
| 21 |
+
- **Answer Relevancy** — cosine similarity between query and response
|
| 22 |
+
- **Faithfulness** — Claude judge: is the answer grounded in retrieved context?
|
| 23 |
+
- **Chain Terminology** — deterministic check that the bot uses client-specific terms
|
backend/app.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
from contextlib import asynccontextmanager
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import anthropic
|
| 7 |
+
from fastapi import FastAPI, HTTPException
|
| 8 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
+
from fastapi.responses import FileResponse
|
| 10 |
+
from fastapi.staticfiles import StaticFiles
|
| 11 |
+
from pydantic import BaseModel
|
| 12 |
+
|
| 13 |
+
from config import DOMAIN_CLIENTS, CLIENT_DOMAIN, DISPLAY_NAMES
|
| 14 |
+
from pipeline import run
|
| 15 |
+
|
| 16 |
+
log = logging.getLogger(__name__)
|
| 17 |
+
logging.basicConfig(level=logging.INFO)
|
| 18 |
+
|
| 19 |
+
UI_DIR = Path(__file__).parent.parent / "ui"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Build the shared Anthropic client once, before the app serves requests.

    The client is stored on ``app.state.anthropic`` for request handlers.

    Raises:
        RuntimeError: if ``ANTHROPIC_API_KEY`` is unset or empty.
    """
    key = os.environ.get("ANTHROPIC_API_KEY")
    if not key:
        raise RuntimeError("ANTHROPIC_API_KEY not set")

    client = anthropic.Anthropic(api_key=key)
    app.state.anthropic = client
    log.info("Anthropic client ready")
    yield
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
app = FastAPI(title="AI Response Validator", lifespan=lifespan)

# Permissive CORS: any origin may call the GET/POST endpoints with any headers.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_methods": ["GET", "POST"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class QueryRequest(BaseModel):
    """Request body for ``POST /query``."""

    # Free-text question; the handler rejects it if blank after stripping.
    query: str
    # Client id; the handler rejects it unless it is a key of CLIENT_DOMAIN.
    client: str
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class QueryResponse(BaseModel):
    """Response body for ``POST /query``."""

    query: str
    client: str
    client_display: str
    answer: str
    # One entry per retrieved document.
    sources: list[dict]
    # Grading summary for the answer.
    evaluation: dict
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@app.get("/health")
def health():
    """Liveness probe: always returns a fixed ``ok`` status."""
    payload = {"status": "ok"}
    return payload
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@app.get("/config")
def get_config():
    """Domain/client structure for the UI switcher.

    Returns:
        ``{"domains": {domain: [{"id": ..., "display": ...}, ...]}}`` built
        from ``DOMAIN_CLIENTS`` and ``DISPLAY_NAMES``.
    """
    domains = {}
    for domain_name, client_ids in DOMAIN_CLIENTS.items():
        domains[domain_name] = [
            {"id": client_id, "display": DISPLAY_NAMES[client_id]}
            for client_id in client_ids
        ]
    return {"domains": domains}
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@app.post("/query", response_model=QueryResponse)
def handle_query(req: QueryRequest):
    """Validate the request, run the RAG pipeline, and return its payload.

    Raises:
        HTTPException: 400 when the client id is unknown or the query is
            blank after stripping whitespace.
    """
    if req.client not in CLIENT_DOMAIN:
        raise HTTPException(status_code=400, detail=f"Unknown client: {req.client!r}")

    cleaned_query = req.query.strip()
    if not cleaned_query:
        raise HTTPException(status_code=400, detail="Query cannot be empty")

    outcome = run(
        query=cleaned_query,
        client=req.client,
        anthropic_client=app.state.anthropic,
    )
    return outcome.response_payload
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
app.mount("/static", StaticFiles(directory=UI_DIR), name="static")
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@app.get("/")
def root():
    """Serve the UI entry page (``index.html`` inside ``UI_DIR``)."""
    index_page = UI_DIR / "index.html"
    return FileResponse(index_page)
|
backend/config.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
# "knowledge" directory, located two levels up from this file.
KNOWLEDGE_ROOT = Path(__file__).parent.parent / "knowledge"
# Sentence-transformers model name used for embeddings.
EMBEDDER_MODEL = "all-MiniLM-L6-v2"

# Domain -> client ids belonging to that domain.
DOMAIN_CLIENTS: dict[str, list[str]] = {
    "retail": ["novamart", "shelfwise"],
    "pharma": ["clinixone", "pharmalink"],
}

# Reverse lookup (client id -> domain), derived from DOMAIN_CLIENTS.
CLIENT_DOMAIN: dict[str, str] = {
    member: domain_name
    for domain_name, members in DOMAIN_CLIENTS.items()
    for member in members
}

# Client id -> human-readable name.
DISPLAY_NAMES: dict[str, str] = {
    "novamart": "NovaMart",
    "shelfwise": "ShelfWise",
    "clinixone": "ClinixOne",
    "pharmalink": "PharmaLink",
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def term_catalog_path(domain: str) -> Path:
    """Return the path of ``term-catalog.yaml`` under the domain's knowledge folder."""
    domain_dir = KNOWLEDGE_ROOT / domain
    return domain_dir / "term-catalog.yaml"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def features_path(domain: str) -> Path:
    """Return the path of ``features.yaml`` under the domain's knowledge folder."""
    domain_dir = KNOWLEDGE_ROOT / domain
    return domain_dir / "features.yaml"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def domain_for(client: str) -> str:
    """Return the domain a client belongs to.

    Raises:
        ValueError: if ``client`` is not a key of ``CLIENT_DOMAIN``.
    """
    if client in CLIENT_DOMAIN:
        return CLIENT_DOMAIN[client]
    known = list(CLIENT_DOMAIN)
    raise ValueError(f"Unknown client: {client!r}. Valid: {known}")
|
backend/grader.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
L1 graders — run live on every query.
|
| 3 |
+
|
| 4 |
+
Metrics:
|
| 5 |
+
pii_leakage — regex scan for PII patterns in response
|
| 6 |
+
token_budget — response within allowed token ceiling
|
| 7 |
+
answer_relevancy — cosine similarity between query and response embeddings
|
| 8 |
+
faithfulness — Claude judge: is response grounded in retrieved context?
|
| 9 |
+
chain_terminology — deterministic: client-specific terms used (via RosettaStone)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
import json
|
| 14 |
+
import logging
|
| 15 |
+
from dataclasses import dataclass, field
|
| 16 |
+
|
| 17 |
+
import anthropic
|
| 18 |
+
from sentence_transformers import SentenceTransformer
|
| 19 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 20 |
+
|
| 21 |
+
from config import EMBEDDER_MODEL
|
| 22 |
+
from rosetta import check_terminology
|
| 23 |
+
|
| 24 |
+
log = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# Process-wide embedder instance, created on first use by get_embedder().
_embedder: SentenceTransformer | None = None


def get_embedder() -> SentenceTransformer:
    """Return the shared embedder, constructing it from EMBEDDER_MODEL on first call."""
    global _embedder
    if _embedder is not None:
        return _embedder
    _embedder = SentenceTransformer(EMBEDDER_MODEL)
    return _embedder
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@dataclass(slots=True)
|
| 38 |
+
class GradeResult:
|
| 39 |
+
metric: str
|
| 40 |
+
passed: bool
|
| 41 |
+
score: float
|
| 42 |
+
detail: str = ""
|
| 43 |
+
metadata: dict = field(default_factory=dict)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@dataclass(slots=True)
class GradeReport:
    """All grader results collected for one query/client pair."""

    client: str
    query: str
    results: list[GradeResult] = field(default_factory=list)

    @property
    def overall(self) -> bool:
        """True when every result passed (also true when there are no results)."""
        for result in self.results:
            if not result.passed:
                return False
        return True

    @property
    def summary(self) -> dict:
        """Return ``{"overall_pass": bool, "metrics": {name: {...}}}``.

        Each metric entry carries ``passed``, ``score`` (rounded to three
        decimals) and ``detail``.
        """
        per_metric = {}
        for result in self.results:
            per_metric[result.metric] = {
                "passed": result.passed,
                "score": round(result.score, 3),
                "detail": result.detail,
            }
        return {"overall_pass": self.overall, "metrics": per_metric}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
_PII_PATTERNS = [
|
| 68 |
+
(r"\b\d{3}-\d{2}-\d{4}\b", "SSN"),
|
| 69 |
+
(r"\b\d{16}\b", "credit card"),
|
| 70 |
+
(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b", "email"),
|
| 71 |
+
(r"\b\d{3}[\s.\-]?\d{3}[\s.\-]?\d{4}\b", "phone"),
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
TOKEN_BUDGET = 512
|
| 75 |
+
RELEVANCY_THRESHOLD = 0.45
|
| 76 |
+
FAITHFULNESS_THRESHOLD = 0.7
|
| 77 |
+
|
| 78 |
+
FAITHFULNESS_PROMPT = """\
|
| 79 |
+
You are a faithfulness evaluator. Your task is to assess whether an AI response
|
| 80 |
+
is fully grounded in the provided context and contains no unsupported claims.
|
| 81 |
+
|
| 82 |
+
Context:
|
| 83 |
+
<context>
|
| 84 |
+
{context}
|
| 85 |
+
</context>
|
| 86 |
+
|
| 87 |
+
Response to evaluate:
|
| 88 |
+
<response>
|
| 89 |
+
{response}
|
| 90 |
+
</response>
|
| 91 |
+
|
| 92 |
+
Rules:
|
| 93 |
+
- A claim is faithful if it can be directly inferred from the context.
|
| 94 |
+
- A claim is unfaithful if it introduces facts not present in the context.
|
| 95 |
+
- Ignore stylistic differences; focus only on factual grounding.
|
| 96 |
+
|
| 97 |
+
Respond with JSON only, no explanation outside the JSON:
|
| 98 |
+
{{
|
| 99 |
+
"faithful": true | false,
|
| 100 |
+
"score": 0.0-1.0,
|
| 101 |
+
"unsupported_claims": ["claim1", "claim2"]
|
| 102 |
+
}}"""
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def grade_pii_leakage(response: str) -> GradeResult:
    """Fail the response if any pattern in ``_PII_PATTERNS`` matches it.

    The score is 1.0 when nothing matches and 0.0 otherwise; the detail lists
    the labels of the matching patterns.
    """
    hits = []
    for regex, label in _PII_PATTERNS:
        if re.search(regex, response):
            hits.append(label)

    if hits:
        return GradeResult(
            metric="pii_leakage",
            passed=False,
            score=0.0,
            detail=f"Detected: {', '.join(hits)}",
        )
    return GradeResult(
        metric="pii_leakage",
        passed=True,
        score=1.0,
        detail="Clean",
    )
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def grade_token_budget(response: str, budget: int = TOKEN_BUDGET) -> GradeResult:
    """Fail if the estimated token count exceeds ``budget``.

    Tokens are approximated as ``len(response) // 4``.

    Args:
        response: Text to measure.
        budget: Maximum allowed approximate token count.

    Returns:
        A ``token_budget`` result. The score is 1.0 when within budget and
        ``budget / approx_tokens`` when over it, so it decays towards 0 the
        further the response overshoots.
    """
    approx_tokens = len(response) // 4
    passed = approx_tokens <= budget
    if passed:
        score = 1.0
    elif approx_tokens:
        # The previous form, 1 - approx_tokens / budget, is negative whenever
        # the budget is exceeded, so it always clamped to 0.0 (and divided by
        # zero for budget == 0). The ratio below is in [0, 1) on this branch.
        score = max(0.0, budget / approx_tokens)
    else:
        # Only reachable with a negative budget and an empty response.
        score = 0.0
    return GradeResult(
        metric="token_budget",
        passed=passed,
        score=score,
        detail=f"~{approx_tokens} tokens (budget: {budget})",
        metadata={"approx_tokens": approx_tokens, "budget": budget},
    )
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def grade_answer_relevancy(query: str, response: str) -> GradeResult:
    """Score the cosine similarity between the query and response embeddings.

    Passes when the similarity is at least ``RELEVANCY_THRESHOLD``.
    """
    model = get_embedder()
    query_vec = model.encode([query])
    response_vec = model.encode([response])
    similarity = float(cosine_similarity(query_vec, response_vec)[0][0])
    is_relevant = similarity >= RELEVANCY_THRESHOLD
    return GradeResult(
        metric="answer_relevancy",
        passed=is_relevant,
        score=similarity,
        detail=f"Cosine {similarity:.3f} (threshold: {RELEVANCY_THRESHOLD})",
    )
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def grade_faithfulness(
    response: str,
    context: str,
    anthropic_client: anthropic.Anthropic,
) -> GradeResult:
    """Ask Claude to judge whether the response is grounded in retrieved context.

    Args:
        response: The generated answer to evaluate.
        context: The retrieved context the answer should be grounded in.
        anthropic_client: Client used to call the judge model.

    Returns:
        A ``faithfulness`` result. It passes only when the judge reports
        ``"faithful": true`` and a score of at least
        ``FAITHFULNESS_THRESHOLD``. Any API failure or unusable judge output
        yields a failed result with score 0.0 and the error in ``detail``.
    """
    prompt = FAITHFULNESS_PROMPT.format(context=context, response=response)
    try:
        message = anthropic_client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = message.content[0].text.strip()
        # Judges often wrap the JSON in a Markdown code fence despite the
        # "JSON only" instruction; unwrap it rather than failing the metric.
        fenced = re.fullmatch(r"```(?:json)?\s*(.*?)\s*```", raw, flags=re.DOTALL)
        if fenced:
            raw = fenced.group(1)
        parsed = json.loads(raw)
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")

        # Keep the score inside the range the prompt asks for.
        score = min(1.0, max(0.0, float(parsed.get("score", 0.0))))
        unsupported = parsed.get("unsupported_claims") or []
        # Require a real JSON boolean so that `passed` is always a bool.
        passed = parsed.get("faithful") is True and score >= FAITHFULNESS_THRESHOLD
        detail = f"Score {score:.2f}" + (f" — unsupported: {unsupported}" if unsupported else "")
        return GradeResult(
            metric="faithfulness",
            passed=passed,
            score=score,
            detail=detail,
            metadata={"unsupported_claims": unsupported},
        )
    except (
        json.JSONDecodeError,
        anthropic.APIError,
        ValueError,       # non-numeric score or non-object JSON
        TypeError,        # score of an unconvertible type (e.g. null)
        IndexError,       # empty content list
        AttributeError,   # first content block has no .text
    ) as exc:
        log.warning("Faithfulness grader failed: %s", exc)
        return GradeResult(
            metric="faithfulness",
            passed=False,
            score=0.0,
            detail=f"Grader error: {exc}",
        )
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def grade_chain_terminology(response: str, client: str) -> GradeResult:
    """Check that the response uses client-specific terms, not rival terminology.

    Args:
        response: The generated answer to check.
        client: Client id whose terminology is expected.

    Returns:
        A ``chain_terminology`` result. The score is the fraction of checked
        canonical keys with no violation (1.0 when nothing was checked);
        ``metadata["violations"]`` holds the raw violation records.
    """
    result = check_terminology(response, client)
    violations = result["violations"]
    checked = result["checked"]
    # Several rival clients can each trigger a violation for the same
    # canonical key, so count distinct keys: the score then stays in [0, 1].
    violated_keys = {v["canonical"] for v in violations}
    score = 1.0 - (len(violated_keys) / checked) if checked else 1.0
    detail = (
        f"{len(violations)} violation(s): " +
        ", ".join(f"{v['found']!r} → should be {v['expected']!r}" for v in violations)
        if violations else f"All {checked} terms correct"
    )
    return GradeResult(
        metric="chain_terminology",
        passed=result["pass"],
        score=score,
        detail=detail,
        metadata={"violations": violations},
    )
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def grade(
    query: str,
    response: str,
    context: str,
    client: str,
    anthropic_client: anthropic.Anthropic,
    token_budget: int = TOKEN_BUDGET,
) -> GradeReport:
    """Run every L1 grader on the response and bundle the results.

    Results appear in this order: pii_leakage, token_budget,
    answer_relevancy, faithfulness, chain_terminology.
    """
    outcomes = [
        grade_pii_leakage(response),
        grade_token_budget(response, token_budget),
        grade_answer_relevancy(query, response),
        grade_faithfulness(response, context, anthropic_client),
        grade_chain_terminology(response, client),
    ]
    return GradeReport(client=client, query=query, results=outcomes)
|
backend/pipeline.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG pipeline: retrieve → generate → grade.
|
| 3 |
+
|
| 4 |
+
Retrieval: in-memory semantic search (sentence-transformers, encoded at first use per domain).
|
| 5 |
+
Generation: Claude with retrieved context injected as grounding.
|
| 6 |
+
Grading: L1 metrics via grader.py.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
|
| 12 |
+
import anthropic
|
| 13 |
+
import numpy as np
|
| 14 |
+
import yaml
|
| 15 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 16 |
+
from sentence_transformers import SentenceTransformer
|
| 17 |
+
|
| 18 |
+
from config import features_path, domain_for, DISPLAY_NAMES
|
| 19 |
+
from grader import grade, GradeReport, get_embedder
|
| 20 |
+
|
| 21 |
+
log = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
TOP_K = 3
|
| 24 |
+
MIN_RETRIEVAL_SCORE = 0.1
|
| 25 |
+
|
| 26 |
+
SYSTEM_PROMPT = """\
|
| 27 |
+
You are a helpful assistant for {client_display} ({domain} domain).
|
| 28 |
+
Answer the user's question using only the information in the provided context.
|
| 29 |
+
Be concise. Use the terminology natural to {client_display} — do not use internal
|
| 30 |
+
or competitor terminology. If the context does not contain enough information to
|
| 31 |
+
answer, say so clearly rather than speculating."""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass(slots=True)
|
| 35 |
+
class RetrievedDoc:
|
| 36 |
+
id: str
|
| 37 |
+
title: str
|
| 38 |
+
content: str
|
| 39 |
+
score: float
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass(slots=True)
class PipelineResult:
    """Everything produced by one pipeline run."""

    query: str
    client: str
    answer: str
    retrieved_docs: list[RetrievedDoc]
    grade_report: GradeReport
    # The context string that was passed to generation and grading.
    context_used: str

    @property
    def response_payload(self) -> dict:
        """Return the JSON-ready dict sent back to the API caller.

        Source scores are rounded to three decimals; the display name falls
        back to the raw client id when it has no entry in ``DISPLAY_NAMES``.
        """
        sources = []
        for doc in self.retrieved_docs:
            sources.append({"id": doc.id, "title": doc.title, "score": round(doc.score, 3)})

        display_name = DISPLAY_NAMES.get(self.client, self.client)
        return {
            "query": self.query,
            "client": self.client,
            "client_display": display_name,
            "answer": self.answer,
            "sources": sources,
            "evaluation": self.grade_report.summary,
        }
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@dataclass(slots=True)
|
| 67 |
+
class KBIndex:
|
| 68 |
+
docs: list[dict]
|
| 69 |
+
embeddings: np.ndarray
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Per-domain index cache so each domain is loaded and embedded only once.
_index_cache: dict[str, KBIndex] = {}


def _build_index(domain: str, embedder: SentenceTransformer) -> KBIndex:
    """Return the KB index for ``domain``, building and caching it on first use.

    Args:
        domain: Domain whose features file should be indexed.
        embedder: Model used to encode ``"<title>. <content>"`` per document.

    Returns:
        The cached ``KBIndex`` for the domain.
    """
    cached = _index_cache.get(domain)
    if cached is not None:
        return cached

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    raw = features_path(domain).read_text(encoding="utf-8")
    data = yaml.safe_load(raw)
    docs = data["documents"]
    texts = [f"{d['title']}. {d['content']}" for d in docs]
    embeddings = embedder.encode(texts, show_progress_bar=False)
    index = KBIndex(docs=docs, embeddings=np.array(embeddings))
    _index_cache[domain] = index
    log.info("Built KB index for domain=%s (%d docs)", domain, len(docs))
    return index
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def _build_context(docs: list[RetrievedDoc]) -> str:
    """Render docs as ``[title]`` + stripped content sections separated by blank lines."""
    sections = []
    for doc in docs:
        sections.append(f"[{doc.title}]\n{doc.content.strip()}")
    return "\n\n".join(sections)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _generate(
    query: str,
    context: str,
    client: str,
    domain: str,
    anthropic_client: anthropic.Anthropic,
) -> str:
    """Ask Claude to answer ``query`` from ``context`` and return the stripped text.

    The system prompt is ``SYSTEM_PROMPT`` filled in with the client's display
    name (falling back to the raw client id) and the domain.
    """
    display_name = DISPLAY_NAMES.get(client, client)
    system_prompt = SYSTEM_PROMPT.format(client_display=display_name, domain=domain)
    user_turn = {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
    reply = anthropic_client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=512,
        system=system_prompt,
        messages=[user_turn],
    )
    first_block = reply.content[0]
    return first_block.text.strip()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def run(
    query: str,
    client: str,
    anthropic_client: anthropic.Anthropic,
    top_k: int = TOP_K,
) -> PipelineResult:
    """Retrieve KB docs for the query, generate an answer from them, and grade it.

    Args:
        query: User question.
        client: Client id; determines the domain and the terminology check.
        anthropic_client: Client used for generation and the faithfulness judge.
        top_k: Maximum number of documents to retrieve.

    Returns:
        A ``PipelineResult`` with the answer, retrieved docs, grade report
        and the context string that was used.
    """
    domain = domain_for(client)
    embedder = get_embedder()
    index = _build_index(domain, embedder)

    # Rank every document in the domain by similarity to the query.
    query_embedding = embedder.encode([query])
    similarities = cosine_similarity(query_embedding, index.embeddings)[0]
    best_first = np.argsort(similarities)[::-1][:top_k]

    retrieved = []
    for pos in best_first:
        # Keep only documents scoring above the retrieval floor.
        if similarities[pos] > MIN_RETRIEVAL_SCORE:
            retrieved.append(
                RetrievedDoc(
                    id=index.docs[pos]["id"],
                    title=index.docs[pos]["title"],
                    content=index.docs[pos]["content"],
                    score=float(similarities[pos]),
                )
            )

    context = _build_context(retrieved)
    answer = _generate(query, context, client, domain, anthropic_client)
    report = grade(
        query=query,
        response=answer,
        context=context,
        client=client,
        anthropic_client=anthropic_client,
    )

    return PipelineResult(
        query=query,
        client=client,
        answer=answer,
        retrieved_docs=retrieved,
        grade_report=report,
        context_used=context,
    )
|
backend/rosetta.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RosettaStone: canonical term -> client-specific term translation."""
|
| 2 |
+
|
| 3 |
+
import yaml
|
| 4 |
+
from functools import lru_cache
|
| 5 |
+
|
| 6 |
+
from config import term_catalog_path, domain_for
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@lru_cache(maxsize=8)
def _load_catalog(domain: str) -> dict[str, dict[str, str]]:
    """Load the term catalog for ``domain`` (cached per domain).

    Returns:
        ``{client_id: {CANONICAL_KEY: "client term"}}``, taken from the
        ``clients.<client_id>.terms`` entries of the catalog file.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    raw = term_catalog_path(domain).read_text(encoding="utf-8")
    data = yaml.safe_load(raw)
    return {
        client_id: client_data["terms"]
        for client_id, client_data in data["clients"].items()
    }
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def translate(canonical_key: str, client: str) -> str | None:
    """Look up the client's own term for ``canonical_key``; ``None`` when unmapped."""
    domain = domain_for(client)
    terms = _load_catalog(domain).get(client, {})
    return terms.get(canonical_key)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def client_terms(client: str) -> dict[str, str]:
    """Return a copy of the client's ``{CANONICAL_KEY: client_term}`` mapping."""
    domain = domain_for(client)
    terms = _load_catalog(domain).get(client, {})
    return dict(terms)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def check_terminology(response_text: str, client: str) -> dict:
    """
    Deterministic chain_terminology check.

    For each canonical key the client defines, a violation is recorded when
    another client's term for that key occurs in the response (case-insensitive
    substring match) and the client's own term does not.

    Returns:
        {"pass": bool, "violations": [...], "checked": int}
    """
    catalog = _load_catalog(domain_for(client))
    own_terms = catalog.get(client, {})
    rival_catalogs = [terms for name, terms in catalog.items() if name != client]
    haystack = response_text.lower()

    violations = []
    for key, own_term in own_terms.items():
        own_term_missing = own_term.lower() not in haystack
        for rival_terms in rival_catalogs:
            rival_term = rival_terms.get(key, "")
            if not rival_term:
                continue
            if rival_term.lower() in haystack and own_term_missing:
                violations.append({
                    "canonical": key,
                    "expected": own_term,
                    "found": rival_term,
                })

    return {
        "pass": not violations,
        "violations": violations,
        "checked": len(own_terms),
    }
|
eval/golden-dataset.yaml
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Golden dataset — 20 Q&A pairs for L2 batch evaluation
|
| 2 |
+
# 10 retail (5 NovaMart / 5 ShelfWise) + 10 pharma (5 ClinixOne / 5 PharmaLink)
|
| 3 |
+
#
|
| 4 |
+
# Fields:
|
| 5 |
+
# id — stable identifier
|
| 6 |
+
# domain — retail | pharma
|
| 7 |
+
# client — novamart | shelfwise | clinixone | pharmalink
|
| 8 |
+
# question — natural-language query as a recruiter or end-user would type it
|
| 9 |
+
# expected_contains — keyphrases the correct answer must include (used by L2 metrics)
|
| 10 |
+
# expected_answer — full reference answer for answer_correctness / answer_similarity
|
| 11 |
+
# notes — what this pair is testing (for eval engineers)
|
| 12 |
+
|
| 13 |
+
pairs:
|
| 14 |
+
|
| 15 |
+
# ── RETAIL · NovaMart ──────────────────────────────────────────────────
|
| 16 |
+
|
| 17 |
+
- id: retail-nm-001
|
| 18 |
+
domain: retail
|
| 19 |
+
client: novamart
|
| 20 |
+
question: "What happens when a product runs out of stock?"
|
| 21 |
+
expected_contains:
|
| 22 |
+
- availability scan
|
| 23 |
+
- low inventory signal
|
| 24 |
+
- reorder
|
| 25 |
+
expected_answer: >
|
| 26 |
+
When a product runs out of stock, an availability scan detects the shortfall
|
| 27 |
+
against the configured reorder threshold and triggers a low inventory signal.
|
| 28 |
+
The signal is routed to the responsible category manager and the supplying vendor.
|
| 29 |
+
If unacknowledged, it escalates to the regional operations lead after 24 hours.
|
| 30 |
+
notes: "Tests chain_terminology: must say 'availability scan' and 'low inventory signal', not 'stock check' or 'out-of-stock alert'."
|
| 31 |
+
|
| 32 |
+
- id: retail-nm-002
|
| 33 |
+
domain: retail
|
| 34 |
+
client: novamart
|
| 35 |
+
question: "How do I add a new supplier to the system?"
|
| 36 |
+
expected_contains:
|
| 37 |
+
- merchant onboarding
|
| 38 |
+
- legal entity name
|
| 39 |
+
- tax ID
|
| 40 |
+
- purchase order
|
| 41 |
+
expected_answer: >
|
| 42 |
+
To add a new supplier, complete the merchant onboarding process by registering
|
| 43 |
+
the vendor with their legal entity name, tax ID, payment terms, and primary contact.
|
| 44 |
+
Incomplete records will block purchase order creation until all mandatory fields
|
| 45 |
+
are validated.
|
| 46 |
+
notes: "Tests chain_terminology: 'merchant onboarding' not 'supplier setup'."
|
| 47 |
+
|
| 48 |
+
- id: retail-nm-003
|
| 49 |
+
domain: retail
|
| 50 |
+
client: novamart
|
| 51 |
+
question: "Can I turn on a new feature for just one region without deploying code?"
|
| 52 |
+
expected_contains:
|
| 53 |
+
- capability switch
|
| 54 |
+
- activation scope
|
| 55 |
+
- expiry date
|
| 56 |
+
- sign-off
|
| 57 |
+
expected_answer: >
|
| 58 |
+
Yes. A capability switch lets you enable or disable functionality per client,
|
| 59 |
+
region, or user segment without a code deployment. Each switch has an activation
|
| 60 |
+
scope and an expiry date to prevent flag debt. Enabling a switch in production
|
| 61 |
+
requires sign-off from both the product and engineering lead.
|
| 62 |
+
notes: "Tests chain_terminology: 'capability switch' not 'feature toggle' or 'feature flag'."
|
| 63 |
+
|
| 64 |
+
- id: retail-nm-004
|
| 65 |
+
domain: retail
|
| 66 |
+
client: novamart
|
| 67 |
+
question: "Where is the authoritative source for product information like SKU and category?"
|
| 68 |
+
expected_contains:
|
| 69 |
+
- item registry
|
| 70 |
+
- SKU
|
| 71 |
+
- archived
|
| 72 |
+
- 15 minutes
|
| 73 |
+
expected_answer: >
|
| 74 |
+
The item registry is the authoritative source for product attributes including
|
| 75 |
+
SKU, description, category hierarchy, dimensions, and active status.
|
| 76 |
+
Updates sync to downstream systems within 15 minutes. Deactivated products
|
| 77 |
+
remain as archived records and cannot be reactivated without manual review.
|
| 78 |
+
notes: "Tests chain_terminology: 'item registry' not 'product catalog'."
|
| 79 |
+
|
| 80 |
+
- id: retail-nm-005
|
| 81 |
+
domain: retail
|
| 82 |
+
client: novamart
|
| 83 |
+
question: "How are price changes handled and what needs approval?"
|
| 84 |
+
expected_contains:
|
| 85 |
+
- pricing sync
|
| 86 |
+
- 15%
|
| 87 |
+
- four hours
|
| 88 |
+
- escalation
|
| 89 |
+
expected_answer: >
|
| 90 |
+
Price changes are submitted as a pricing sync through the pricing portal.
|
| 91 |
+
Changes greater than 15% of the current price require approval. Approved
|
| 92 |
+
changes go live at the next sync window, which runs every four hours.
|
| 93 |
+
Emergency corrections outside the window require escalation to the pricing team.
|
| 94 |
+
notes: "Tests chain_terminology: 'pricing sync' not 'price update'."
|
| 95 |
+
|
| 96 |
+
# ── RETAIL · ShelfWise ───────────────────────────────────────────────
|
| 97 |
+
|
| 98 |
+
- id: retail-sw-001
|
| 99 |
+
domain: retail
|
| 100 |
+
client: shelfwise
|
| 101 |
+
question: "What triggers an out-of-stock alert?"
|
| 102 |
+
expected_contains:
|
| 103 |
+
- out-of-stock alert
|
| 104 |
+
- reorder point
|
| 105 |
+
- category manager
|
| 106 |
+
- 24 hours
|
| 107 |
+
expected_answer: >
|
| 108 |
+
An out-of-stock alert fires when a product's on-hand quantity drops below
|
| 109 |
+
its configured reorder point. It is routed simultaneously to the responsible
|
| 110 |
+
category manager and the supplying vendor. Unacknowledged alerts escalate
|
| 111 |
+
to the regional operations lead after 24 hours.
|
| 112 |
+
notes: "Tests chain_terminology: 'out-of-stock alert' not 'low inventory signal'."
|
| 113 |
+
|
| 114 |
+
- id: retail-sw-002
|
| 115 |
+
domain: retail
|
| 116 |
+
client: shelfwise
|
| 117 |
+
question: "How do we enable a feature for a subset of users?"
|
| 118 |
+
expected_contains:
|
| 119 |
+
- feature toggle
|
| 120 |
+
- activation scope
|
| 121 |
+
- expiry date
|
| 122 |
+
- engineering lead
|
| 123 |
+
expected_answer: >
|
| 124 |
+
Use a feature toggle to enable or disable functionality per client, region,
|
| 125 |
+
or user segment without a code deployment. Each toggle has an owner, an
|
| 126 |
+
activation scope, and an expiry date. Enabling in production requires
|
| 127 |
+
sign-off from the product and engineering lead.
|
| 128 |
+
notes: "Tests chain_terminology: 'feature toggle' not 'capability switch'."
|
| 129 |
+
|
| 130 |
+
- id: retail-sw-003
|
| 131 |
+
domain: retail
|
| 132 |
+
client: shelfwise
|
| 133 |
+
question: "What information is required to onboard a new supplier?"
|
| 134 |
+
expected_contains:
|
| 135 |
+
- supplier setup
|
| 136 |
+
- tax ID
|
| 137 |
+
- payment terms
|
| 138 |
+
- purchase order
|
| 139 |
+
expected_answer: >
|
| 140 |
+
Supplier setup requires the vendor's legal entity name, tax ID, payment terms,
|
| 141 |
+
and primary contact. Incomplete records block purchase order creation until
|
| 142 |
+
all mandatory fields are validated.
|
| 143 |
+
notes: "Tests chain_terminology: 'supplier setup' not 'merchant onboarding'."
|
| 144 |
+
|
| 145 |
+
- id: retail-sw-004
|
| 146 |
+
domain: retail
|
| 147 |
+
client: shelfwise
|
| 148 |
+
question: "How do compliance reports work and who can access them?"
|
| 149 |
+
expected_contains:
|
| 150 |
+
- compliance report
|
| 151 |
+
- immutable
|
| 152 |
+
- seven years
|
| 153 |
+
- Audit role
|
| 154 |
+
expected_answer: >
|
| 155 |
+
Compliance reports capture a timestamped record of system actions, user
|
| 156 |
+
decisions, and policy rule evaluations. They are immutable once generated
|
| 157 |
+
and stored for a minimum of seven years. Access is restricted to users
|
| 158 |
+
with the Audit role or higher.
|
| 159 |
+
notes: "Tests chain_terminology: 'compliance report' not 'audit trail'."
|
| 160 |
+
|
| 161 |
+
- id: retail-sw-005
|
| 162 |
+
domain: retail
|
| 163 |
+
client: shelfwise
|
| 164 |
+
question: "How quickly do product catalog updates reach downstream systems?"
|
| 165 |
+
expected_contains:
|
| 166 |
+
- product catalog
|
| 167 |
+
- 15 minutes
|
| 168 |
+
- event stream
|
| 169 |
+
- archived
|
| 170 |
+
expected_answer: >
|
| 171 |
+
Product catalog updates sync to all downstream systems within 15 minutes
|
| 172 |
+
via event stream. Deactivated products remain in the catalog as archived
|
| 173 |
+
records and cannot be reactivated without a manual review.
|
| 174 |
+
notes: "Tests chain_terminology: 'product catalog' not 'item registry'."
|
| 175 |
+
|
| 176 |
+
# ── PHARMA · ClinixOne ─────────────────────────────────────────────────
|
| 177 |
+
|
| 178 |
+
- id: pharma-cx-001
|
| 179 |
+
domain: pharma
|
| 180 |
+
client: clinixone
|
| 181 |
+
question: "What is prior authorization and how long does it take?"
|
| 182 |
+
expected_contains:
|
| 183 |
+
- prior authorization
|
| 184 |
+
- clinical justification
|
| 185 |
+
- 72 hours
|
| 186 |
+
- appeal
|
| 187 |
+
expected_answer: >
|
| 188 |
+
Prior authorization is a requirement by a payer that a prescriber obtain
|
| 189 |
+
approval before a specific drug is dispensed and covered. The prescriber
|
| 190 |
+
submits clinical justification and the payer responds within 72 hours for
|
| 191 |
+
standard requests or 24 hours for urgent cases. Denied requests can be
|
| 192 |
+
appealed once with additional clinical documentation.
|
| 193 |
+
notes: "Tests chain_terminology: 'prior authorization' not 'formulary pre-approval'."
|
| 194 |
+
|
| 195 |
+
- id: pharma-cx-002
|
| 196 |
+
domain: pharma
|
| 197 |
+
client: clinixone
|
| 198 |
+
question: "What is the difference between a generic name and a brand name?"
|
| 199 |
+
expected_contains:
|
| 200 |
+
- generic name
|
| 201 |
+
- brand name
|
| 202 |
+
- clinical guidelines
|
| 203 |
+
- authorization
|
| 204 |
+
expected_answer: >
|
| 205 |
+
The generic name is the active ingredient name — non-proprietary and used
|
| 206 |
+
in clinical guidelines and regulatory filings. Brand names are assigned by
|
| 207 |
+
manufacturers and appear in marketing materials and some payer formularies.
|
| 208 |
+
Substituting a brand drug with a generic requires explicit prescriber or
|
| 209 |
+
payer authorization in some jurisdictions.
|
| 210 |
+
notes: "Tests chain_terminology: ClinixOne uses 'generic name' as primary."
|
| 211 |
+
|
| 212 |
+
- id: pharma-cx-003
|
| 213 |
+
domain: pharma
|
| 214 |
+
client: clinixone
|
| 215 |
+
question: "When must an adverse event be reported to regulators?"
|
| 216 |
+
expected_contains:
|
| 217 |
+
- adverse event
|
| 218 |
+
- 15 days
|
| 219 |
+
- 90 days
|
| 220 |
+
- serious unexpected
|
| 221 |
+
expected_answer: >
|
| 222 |
+
Adverse events must be reported to the regulatory authority within 15 days
|
| 223 |
+
for serious unexpected events and within 90 days for expected events.
|
| 224 |
+
An adverse event is any undesirable medical occurrence in a patient
|
| 225 |
+
administered a medicinal product, regardless of causal relationship.
|
| 226 |
+
notes: "Tests chain_terminology: 'adverse event' not 'safety signal'. Key faithfulness test — specific numbers must be grounded in KB."
|
| 227 |
+
|
| 228 |
+
- id: pharma-cx-004
|
| 229 |
+
domain: pharma
|
| 230 |
+
client: clinixone
|
| 231 |
+
question: "What are the phases of a clinical trial?"
|
| 232 |
+
expected_contains:
|
| 233 |
+
- clinical trial
|
| 234 |
+
- Phase I
|
| 235 |
+
- Phase II
|
| 236 |
+
- Phase III
|
| 237 |
+
- inclusion
|
| 238 |
+
expected_answer: >
|
| 239 |
+
Clinical trials are classified by phase: Phase I tests safety in a small
|
| 240 |
+
cohort, Phase II assesses efficacy and side effects, and Phase III compares
|
| 241 |
+
against standard treatment at scale. Enrollment eligibility is defined by
|
| 242 |
+
inclusion and exclusion criteria in the protocol.
|
| 243 |
+
notes: "Tests chain_terminology: 'clinical trial' not 'investigational program'."
|
| 244 |
+
|
| 245 |
+
- id: pharma-cx-005
|
| 246 |
+
domain: pharma
|
| 247 |
+
client: clinixone
|
| 248 |
+
question: "What happens if a prescriber adjusts the dose outside the approved schedule?"
|
| 249 |
+
expected_contains:
|
| 250 |
+
- dose modification
|
| 251 |
+
- titration
|
| 252 |
+
- prior authorization
|
| 253 |
+
- documentation
|
| 254 |
+
expected_answer: >
|
| 255 |
+
A dose modification outside the approved titration schedule requires prescriber
|
| 256 |
+
documentation and may trigger a prior authorization review. Titration schedules
|
| 257 |
+
specify the starting dose, increment size, and minimum interval between increases.
|
| 258 |
+
notes: "Tests chain_terminology: 'dose modification' and 'prior authorization' for ClinixOne."
|
| 259 |
+
|
| 260 |
+
# ── PHARMA · PharmaLink ───────────────────────────────────────────────
|
| 261 |
+
|
| 262 |
+
- id: pharma-pl-001
|
| 263 |
+
domain: pharma
|
| 264 |
+
client: pharmalink
|
| 265 |
+
question: "How do I get a drug approved before dispensing?"
|
| 266 |
+
expected_contains:
|
| 267 |
+
- formulary pre-approval
|
| 268 |
+
- clinical justification
|
| 269 |
+
- 72 hours
|
| 270 |
+
- appeal
|
| 271 |
+
expected_answer: >
|
| 272 |
+
Submit a formulary pre-approval request with clinical justification. The payer
|
| 273 |
+
reviews against formulary criteria and responds within 72 hours for standard
|
| 274 |
+
requests or 24 hours for urgent cases. Denied requests can be appealed once
|
| 275 |
+
with additional clinical documentation.
|
| 276 |
+
notes: "Tests chain_terminology: 'formulary pre-approval' not 'prior authorization'."
|
| 277 |
+
|
| 278 |
+
- id: pharma-pl-002
|
| 279 |
+
domain: pharma
|
| 280 |
+
client: pharmalink
|
| 281 |
+
question: "What is a pharmacovigilance alert and when is it raised?"
|
| 282 |
+
expected_contains:
|
| 283 |
+
- pharmacovigilance alert
|
| 284 |
+
- pattern
|
| 285 |
+
- causal relationship
|
| 286 |
+
- regulatory authority
|
| 287 |
+
expected_answer: >
|
| 288 |
+
A pharmacovigilance alert is raised when a pattern of adverse events suggests
|
| 289 |
+
a previously unknown or incompletely documented causal relationship between a
|
| 290 |
+
drug and an outcome. Serious unexpected events must be reported to the
|
| 291 |
+
regulatory authority within 15 days.
|
| 292 |
+
notes: "Tests chain_terminology: 'pharmacovigilance alert' not 'safety signal' or 'adverse event'. Key cross-client terminology stress test."
|
| 293 |
+
|
| 294 |
+
- id: pharma-pl-003
|
| 295 |
+
domain: pharma
|
| 296 |
+
client: pharmalink
|
| 297 |
+
question: "What are the coverage tiers in the formulary?"
|
| 298 |
+
expected_contains:
|
| 299 |
+
- benefit tier
|
| 300 |
+
- Tier 1
|
| 301 |
+
- generics
|
| 302 |
+
- 60-day notice
|
| 303 |
+
expected_answer: >
|
| 304 |
+
The formulary organizes drugs into benefit tiers that determine patient
|
| 305 |
+
cost-sharing. Tier 1 is typically lowest cost and covers generics; higher
|
| 306 |
+
tiers carry higher copays. Moving a drug to a higher tier requires a formulary
|
| 307 |
+
committee review and a minimum 60-day notice to prescribers.
|
| 308 |
+
notes: "Tests chain_terminology: 'benefit tier' not 'coverage tier'."
|
| 309 |
+
|
| 310 |
+
- id: pharma-pl-004
|
| 311 |
+
domain: pharma
|
| 312 |
+
client: pharmalink
|
| 313 |
+
question: "What is a prescribing pathway and how often is it reviewed?"
|
| 314 |
+
expected_contains:
|
| 315 |
+
- prescribing pathway
|
| 316 |
+
- annually
|
| 317 |
+
- coverage decisions
|
| 318 |
+
- clinical rationale
|
| 319 |
+
expected_answer: >
|
| 320 |
+
A prescribing pathway is an evidence-based document specifying the recommended
|
| 321 |
+
sequence of therapies for a given condition. Pathways are reviewed annually and
|
| 322 |
+
updated when new efficacy or safety data emerges. Payers use pathway adherence
|
| 323 |
+
as a criterion in coverage decisions; deviation requires documented clinical
|
| 324 |
+
rationale.
|
| 325 |
+
notes: "Tests chain_terminology: 'prescribing pathway' not 'clinical guideline' or 'treatment protocol'."
|
| 326 |
+
|
| 327 |
+
- id: pharma-pl-005
|
| 328 |
+
domain: pharma
|
| 329 |
+
client: pharmalink
|
| 330 |
+
question: "What does enrollment authorization involve for a clinical study?"
|
| 331 |
+
expected_contains:
|
| 332 |
+
- enrollment authorization
|
| 333 |
+
- investigational program
|
| 334 |
+
- re-consent
|
| 335 |
+
- voluntary decision
|
| 336 |
+
expected_answer: >
|
| 337 |
+
Enrollment authorization is the process by which a patient receives and
|
| 338 |
+
acknowledges sufficient information about an investigational program to make
|
| 339 |
+
a voluntary decision. Consent must be obtained before any study procedure.
|
| 340 |
+
If the protocol changes materially, re-consent is required.
|
| 341 |
+
notes: "Tests chain_terminology: 'enrollment authorization' (not 'informed consent') and 'investigational program' (not 'clinical trial')."
|
eval/metrics.py
ADDED
|
File without changes
|
knowledge/pharma/features.yaml
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pharma domain — knowledge base documents
|
| 2 |
+
# Retrieved by the RAG pipeline and used as the grounding context in the faithfulness check
|
| 3 |
+
# Each entry: id, title, content (2-4 sentences, retrieval-friendly), tags
|
| 4 |
+
|
| 5 |
+
documents:
|
| 6 |
+
- id: pharma_001
|
| 7 |
+
title: "Prior Authorization and Formulary Pre-Approval"
|
| 8 |
+
content: >
|
| 9 |
+
Prior authorization (formulary pre-approval) is a requirement by a payer that a
|
| 10 |
+
prescriber obtain approval before a specific drug is dispensed and covered.
|
| 11 |
+
The prescriber submits clinical justification; the payer reviews against formulary
|
| 12 |
+
criteria and responds within 72 hours for standard requests or 24 hours for urgent cases.
|
| 13 |
+
Denied requests can be appealed once with additional clinical documentation.
|
| 14 |
+
tags: [prior-auth, formulary, coverage]
|
| 15 |
+
|
| 16 |
+
- id: pharma_002
|
| 17 |
+
title: "Generic vs Brand Drug Names"
|
| 18 |
+
content: >
|
| 19 |
+
Every approved drug has a generic name (the active ingredient, non-proprietary) and
|
| 20 |
+
one or more brand names assigned by manufacturers.
|
| 21 |
+
Generic names are used in clinical guidelines and regulatory filings; brand names
|
| 22 |
+
appear in marketing materials and some payer formularies.
|
| 23 |
+
Substituting a brand drug with a generic equivalent requires explicit prescriber
|
| 24 |
+
or payer authorization in some jurisdictions.
|
| 25 |
+
tags: [drug-name, generic, brand]
|
| 26 |
+
|
| 27 |
+
- id: pharma_003
|
| 28 |
+
title: "Adverse Events and Safety Signals"
|
| 29 |
+
content: >
|
| 30 |
+
An adverse event is any undesirable medical occurrence in a patient administered
|
| 31 |
+
a medicinal product, regardless of causal relationship.
|
| 32 |
+
A safety signal (pharmacovigilance alert) is a pattern of adverse events that
|
| 33 |
+
suggests a previously unknown or incompletely documented causal relationship
|
| 34 |
+
between a drug and an outcome.
|
| 35 |
+
Adverse events must be reported to the regulatory authority within 15 days for
|
| 36 |
+
serious unexpected events and 90 days for expected events.
|
| 37 |
+
tags: [adverse-event, safety, pharmacovigilance]
|
| 38 |
+
|
| 39 |
+
- id: pharma_004
|
| 40 |
+
title: "Drug-Drug Interactions and Contraindications"
|
| 41 |
+
content: >
|
| 42 |
+
A drug-drug interaction (contraindication) occurs when one drug affects the activity
|
| 43 |
+
of another when both are administered together.
|
| 44 |
+
Interactions range from minor (monitoring recommended) to contraindicated (combination
|
| 45 |
+
must not be used). Severity classifications follow the clinical pharmacology guidelines
|
| 46 |
+
maintained in the formulary interaction database.
|
| 47 |
+
Prescribers are alerted at point-of-care when a contraindicated combination is entered.
|
| 48 |
+
tags: [drug-interaction, contraindication, safety]
|
| 49 |
+
|
| 50 |
+
- id: pharma_005
|
| 51 |
+
title: "Clinical Guidelines and Prescribing Pathways"
|
| 52 |
+
content: >
|
| 53 |
+
A treatment protocol (clinical guideline / prescribing pathway) is an evidence-based
|
| 54 |
+
document specifying the recommended sequence of therapies for a given condition.
|
| 55 |
+
Pathways are reviewed annually and updated when new efficacy or safety data emerges.
|
| 56 |
+
Payers use pathway adherence as a criterion in coverage decisions; deviation requires
|
| 57 |
+
documented clinical rationale.
|
| 58 |
+
tags: [protocol, guideline, treatment]
|
| 59 |
+
|
| 60 |
+
- id: pharma_006
|
| 61 |
+
title: "Formulary Coverage Tiers"
|
| 62 |
+
content: >
|
| 63 |
+
A formulary is a list of drugs covered by a payer, organized into tiers (benefit tiers)
|
| 64 |
+
that determine patient cost-sharing. Tier 1 is typically lowest cost (generics);
|
| 65 |
+
higher tiers carry higher copays. Moving a drug to a higher tier requires
|
| 66 |
+
a formulary committee review and a minimum 60-day notice to prescribers.
|
| 67 |
+
tags: [formulary, coverage, tier]
|
| 68 |
+
|
| 69 |
+
- id: pharma_007
|
| 70 |
+
title: "Dosage Adjustment and Titration"
|
| 71 |
+
content: >
|
| 72 |
+
A dosage adjustment (dose modification / titration step) is a change to a patient's
|
| 73 |
+
prescribed dose based on clinical response, tolerability, renal or hepatic function,
|
| 74 |
+
or drug interaction. Titration schedules specify the starting dose, increment size,
|
| 75 |
+
and minimum interval between increases. Adjustments outside the approved titration
|
| 76 |
+
schedule require prescriber documentation and may trigger a prior authorization review.
|
| 77 |
+
tags: [dosage, titration, dose]
|
| 78 |
+
|
| 79 |
+
- id: pharma_008
|
| 80 |
+
title: "Patient Consent and Enrollment Authorization"
|
| 81 |
+
content: >
|
| 82 |
+
Informed consent (enrollment authorization) is the process by which a patient
|
| 83 |
+
receives and acknowledges sufficient information about a treatment or study
|
| 84 |
+
to make a voluntary decision. For clinical trials, consent must be obtained
|
| 85 |
+
before any study procedure. Consent forms are version-controlled; if the
|
| 86 |
+
protocol changes materially, re-consent is required.
|
| 87 |
+
tags: [consent, enrollment, patient]
|
| 88 |
+
|
| 89 |
+
- id: pharma_009
|
| 90 |
+
title: "Clinical Trials and Investigational Programs"
|
| 91 |
+
content: >
|
| 92 |
+
A clinical trial (investigational program) is a structured study that evaluates
|
| 93 |
+
the safety or efficacy of a drug, device, or intervention in human subjects.
|
| 94 |
+
Trials are classified by phase: Phase I tests safety in a small cohort,
|
| 95 |
+
Phase II assesses efficacy and side effects, Phase III compares against
|
| 96 |
+
standard treatment at scale. Enrollment eligibility is defined by inclusion
|
| 97 |
+
and exclusion criteria in the protocol.
|
| 98 |
+
tags: [clinical-trial, study, investigational]
|
knowledge/pharma/term-catalog.yaml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pharma domain — client-specific terminology map
|
| 2 |
+
# canonical term -> what each client calls it
|
| 3 |
+
# This is what chain_terminology metric validates against
|
| 4 |
+
|
| 5 |
+
clients:
|
| 6 |
+
clinixone:
|
| 7 |
+
display_name: "ClinixOne"
|
| 8 |
+
terms:
|
| 9 |
+
DRUG_APPROVAL: "prior authorization"
|
| 10 |
+
DRUG_NAME: "generic name"
|
| 11 |
+
ADVERSE_EVENT: "adverse event"
|
| 12 |
+
DRUG_INTERACTION: "contraindication"
|
| 13 |
+
TREATMENT_PROTOCOL: "clinical guideline"
|
| 14 |
+
FORMULARY_STATUS: "coverage tier"
|
| 15 |
+
DOSAGE_ADJUSTMENT: "dose modification"
|
| 16 |
+
SAFETY_SIGNAL: "safety signal"
|
| 17 |
+
PATIENT_CONSENT: "informed consent"
|
| 18 |
+
CLINICAL_TRIAL: "clinical trial"
|
| 19 |
+
|
| 20 |
+
pharmalink:
|
| 21 |
+
display_name: "PharmaLink"
|
| 22 |
+
terms:
|
| 23 |
+
DRUG_APPROVAL: "formulary pre-approval"
|
| 24 |
+
DRUG_NAME: "brand name"
|
| 25 |
+
ADVERSE_EVENT: "safety signal"
|
| 26 |
+
DRUG_INTERACTION: "drug-drug interaction"
|
| 27 |
+
TREATMENT_PROTOCOL: "prescribing pathway"
|
| 28 |
+
FORMULARY_STATUS: "benefit tier"
|
| 29 |
+
DOSAGE_ADJUSTMENT: "titration step"
|
| 30 |
+
SAFETY_SIGNAL: "pharmacovigilance alert"
|
| 31 |
+
PATIENT_CONSENT: "enrollment authorization"
|
| 32 |
+
CLINICAL_TRIAL: "investigational program"
|
knowledge/retail/features.yaml
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Retail domain — knowledge base documents
|
| 2 |
+
# Retrieved by the RAG pipeline and used as the grounding context in the faithfulness check
|
| 3 |
+
# Each entry: id, title, content (2-4 sentences, retrieval-friendly), tags
|
| 4 |
+
|
| 5 |
+
documents:
|
| 6 |
+
- id: retail_001
|
| 7 |
+
title: "Stock Check Process"
|
| 8 |
+
content: >
|
| 9 |
+
A stock check queries real-time inventory levels for a given product and location.
|
| 10 |
+
Results include current quantity on hand, reorder threshold, and last updated timestamp.
|
| 11 |
+
If quantity falls below threshold, an out-of-stock alert is automatically triggered.
|
| 12 |
+
Stock checks can be initiated manually or scheduled on a recurring basis.
|
| 13 |
+
tags: [inventory, stock, availability]
|
| 14 |
+
|
| 15 |
+
- id: retail_002
|
| 16 |
+
title: "Supplier Setup and Onboarding"
|
| 17 |
+
content: >
|
| 18 |
+
Supplier setup is the process of registering a new vendor in the system before
|
| 19 |
+
products can be sourced or orders placed. Required fields include legal entity name,
|
| 20 |
+
tax ID, payment terms, and primary contact. Incomplete supplier records block
|
| 21 |
+
purchase order creation until all mandatory fields are validated.
|
| 22 |
+
tags: [supplier, vendor, onboarding]
|
| 23 |
+
|
| 24 |
+
- id: retail_003
|
| 25 |
+
title: "Compliance Reporting"
|
| 26 |
+
content: >
|
| 27 |
+
Compliance reports capture a timestamped record of system actions, user decisions,
|
| 28 |
+
and policy rule evaluations for regulatory and internal audit purposes.
|
| 29 |
+
Reports are immutable once generated and stored for a minimum of seven years.
|
| 30 |
+
Access is restricted to users with the Audit role or higher.
|
| 31 |
+
tags: [compliance, audit, reporting]
|
| 32 |
+
|
| 33 |
+
- id: retail_004
|
| 34 |
+
title: "Feature Flags and Capability Switches"
|
| 35 |
+
content: >
|
| 36 |
+
Feature flags (also called capability switches) enable or disable product functionality
|
| 37 |
+
per client, region, or user segment without a code deployment.
|
| 38 |
+
Each flag has an owner, an activation scope, and an expiry date to prevent flag debt.
|
| 39 |
+
Enabling a flag in production requires sign-off from both the product and engineering lead.
|
| 40 |
+
tags: [feature-flags, configuration, rollout]
|
| 41 |
+
|
| 42 |
+
- id: retail_005
|
| 43 |
+
title: "Product Catalog Management"
|
| 44 |
+
content: >
|
| 45 |
+
The product catalog (item registry) is the authoritative source of product attributes
|
| 46 |
+
including SKU, description, category hierarchy, dimensions, and active status.
|
| 47 |
+
Catalog updates sync to all downstream systems within 15 minutes via event stream.
|
| 48 |
+
Deactivated products remain in the catalog as archived records and cannot be reactivated
|
| 49 |
+
without a manual review.
|
| 50 |
+
tags: [catalog, products, SKU]
|
| 51 |
+
|
| 52 |
+
- id: retail_006
|
| 53 |
+
title: "Price Update Workflow"
|
| 54 |
+
content: >
|
| 55 |
+
Price updates (pricing syncs) must be submitted through the pricing portal and require
|
| 56 |
+
approval for changes greater than 15% of the current price.
|
| 57 |
+
Approved changes go live at the next scheduled sync window, which runs every four hours.
|
| 58 |
+
Emergency price corrections outside the sync window require escalation to the pricing team.
|
| 59 |
+
tags: [pricing, price-update, workflow]
|
| 60 |
+
|
| 61 |
+
- id: retail_007
|
| 62 |
+
title: "Store Configuration"
|
| 63 |
+
content: >
|
| 64 |
+
Each store location has a configuration profile (location profile) that defines
|
| 65 |
+
operating hours, supported payment methods, fulfillment capabilities, and
|
| 66 |
+
regional compliance rules. Configuration changes take effect at store open
|
| 67 |
+
on the following business day. Misconfigured stores are flagged in the daily
|
| 68 |
+
operations health report.
|
| 69 |
+
tags: [store, configuration, location]
|
| 70 |
+
|
| 71 |
+
- id: retail_008
|
| 72 |
+
title: "Low Inventory Alerts"
|
| 73 |
+
content: >
|
| 74 |
+
A low inventory signal (out-of-stock alert) fires when a product's on-hand quantity
|
| 75 |
+
drops below its configured reorder point. Alerts are routed to the responsible
|
| 76 |
+
category manager and the supplying vendor simultaneously. Unacknowledged alerts
|
| 77 |
+
escalate to the regional operations lead after 24 hours.
|
| 78 |
+
tags: [inventory, alerts, stock]
|
knowledge/retail/term-catalog.yaml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Retail domain — client-specific terminology map
|
| 2 |
+
# canonical term -> what each client calls it
|
| 3 |
+
# This is what chain_terminology metric validates against
|
| 4 |
+
|
| 5 |
+
clients:
|
| 6 |
+
novamart:
|
| 7 |
+
display_name: "NovaMart"
|
| 8 |
+
terms:
|
| 9 |
+
STOCK_CHECK: "availability scan"
|
| 10 |
+
SUPPLIER_SETUP: "merchant onboarding"
|
| 11 |
+
COMPLIANCE_REPORT: "audit trail"
|
| 12 |
+
FEATURE_FLAG: "capability switch"
|
| 13 |
+
STOCK_ALERT: "low inventory signal"
|
| 14 |
+
PRODUCT_CATALOG: "item registry"
|
| 15 |
+
PRICE_UPDATE: "pricing sync"
|
| 16 |
+
STORE_CONFIG: "location profile"
|
| 17 |
+
|
| 18 |
+
shelfwise:
|
| 19 |
+
display_name: "ShelfWise"
|
| 20 |
+
terms:
|
| 21 |
+
STOCK_CHECK: "stock check"
|
| 22 |
+
SUPPLIER_SETUP: "supplier setup"
|
| 23 |
+
COMPLIANCE_REPORT: "compliance report"
|
| 24 |
+
FEATURE_FLAG: "feature toggle"
|
| 25 |
+
STOCK_ALERT: "out-of-stock alert"
|
| 26 |
+
PRODUCT_CATALOG: "product catalog"
|
| 27 |
+
PRICE_UPDATE: "price update"
|
| 28 |
+
STORE_CONFIG: "store configuration"
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
anthropic>=0.40.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn[standard]>=0.30.0
|
| 4 |
+
pyyaml>=6.0
|
| 5 |
+
sentence-transformers>=3.0.0
|
| 6 |
+
scikit-learn>=1.5.0
|
| 7 |
+
numpy>=1.26.0
|
| 8 |
+
python-multipart>=0.0.9
|
ui/app.js
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Prefix for backend requests; the empty string targets the same origin.
const API = '';

// Shared, mutable UI state read and written by the handlers in this file.
let state = {
  // Name of the currently selected domain (set by selectDomain).
  domain: null,
  // Id of the currently selected client (set by selectClient).
  client: null,
  // Domain name -> list of client entries, as returned by GET /config.
  domains: {},
  // Checked by handleSend so a second query is not sent while one is pending.
  loading: false,
};
|
| 9 |
+
|
| 10 |
+
// ── Boot ──────────────────────────────────────────────────────────────────
|
| 11 |
+
|
| 12 |
+
/**
 * Initialise the page: fetch the domain/client configuration, render the
 * domain switcher, select the first domain, and wire up the send controls.
 *
 * @returns {Promise<void>} Resolves once the event listeners are attached.
 * @throws {Error} If the /config request does not return a 2xx status.
 */
async function boot() {
  const res = await fetch(`${API}/config`);
  if (!res.ok) {
    // Report the HTTP failure directly rather than letting it surface later
    // as a TypeError on a missing `domains` field.
    throw new Error(`Failed to load config: ${res.status} ${res.statusText}`);
  }
  const data = await res.json();
  state.domains = data.domains || {};

  renderDomainSwitcher();
  // With an empty configuration there is nothing to select; skip selection
  // so the listeners below are still attached.
  const [firstDomain] = Object.keys(state.domains);
  if (firstDomain !== undefined) selectDomain(firstDomain);

  document.getElementById('send-btn').addEventListener('click', handleSend);
  document.getElementById('query-input').addEventListener('keydown', e => {
    // Shift+Enter is left to the browser's default handling. Enter pressed
    // while an IME composition is active confirms the composition and must
    // not submit the query.
    if (e.key !== 'Enter' || e.shiftKey || e.isComposing) return;
    // Suppress the default action so Enter does not also insert a newline
    // into the field that handleSend has just cleared.
    e.preventDefault();
    handleSend();
  });
}
|
| 26 |
+
|
| 27 |
+
// ── Switchers ─────────────────────────────────────────────────────────────
|
| 28 |
+
|
| 29 |
+
/**
 * Rebuild the domain switcher: one button per key of `state.domains`.
 * Each button carries its domain in `data-domain` and calls selectDomain
 * on click; the label is the domain name passed through capitalize().
 */
function renderDomainSwitcher() {
  const container = document.getElementById('domain-switcher');
  const buttons = [];
  for (const name of Object.keys(state.domains)) {
    buttons.push(
      `\n<button data-domain="${name}" onclick="selectDomain('${name}')">${capitalize(name)}</button>\n`
    );
  }
  container.innerHTML = buttons.join('');
}
|
| 35 |
+
|
| 36 |
+
/**
 * Make `domain` the active domain, highlight its button, rebuild the client
 * button group for it and select that domain's first client.
 *
 * Client buttons are built with DOM APIs instead of an HTML string so that
 * client ids and display names coming from the backend cannot inject markup
 * or break an inline handler. A domain with no clients leaves the client
 * switcher empty and clears the selected client instead of throwing.
 *
 * @param {string} domain - key into `state.domains`
 */
function selectDomain(domain) {
  state.domain = domain;
  document.querySelectorAll('#domain-switcher button').forEach(b => {
    b.classList.toggle('active', b.dataset.domain === domain);
  });

  const clients = state.domains[domain] || [];
  const el = document.getElementById('client-switcher');
  el.textContent = ''; // drop the previous domain's buttons

  clients.forEach(c => {
    const btn = document.createElement('button');
    btn.dataset.client = c.id; // read back by selectClient to toggle "active"
    btn.textContent = c.display;
    btn.addEventListener('click', () => selectClient(c.id));
    el.appendChild(btn);
  });

  if (clients.length > 0) {
    selectClient(clients[0].id);
  } else {
    // Do not keep a client that belongs to a different domain.
    state.client = null;
  }
}
|
| 50 |
+
|
| 51 |
+
/**
 * Record `clientId` as the active client and mark only its button as active.
 *
 * @param {string} clientId - id matched against each button's data-client
 */
function selectClient(clientId) {
  state.client = clientId;

  const buttons = document.querySelectorAll('#client-switcher button');
  for (const button of buttons) {
    const isSelected = button.dataset.client === clientId;
    button.classList.toggle('active', isSelected);
  }
}
|
| 57 |
+
|
| 58 |
+
// ── Send ──────────────────────────────────────────────────────────────────
|
| 59 |
+
|
| 60 |
+
/**
 * Submit the current query to the backend and render the outcome.
 *
 * Does nothing when the input is blank or a request is already in flight.
 * On success the answer and the evaluation panel are rendered; on any failure
 * an error message is appended to the chat instead. Controls are re-enabled
 * in every case.
 */
async function handleSend() {
  const input = document.getElementById('query-input');
  const query = input.value.trim();
  if (!query || state.loading) return;

  input.value = '';
  setLoading(true);

  appendMessage('user', query);
  const thinkingEl = appendThinking();

  try {
    const res = await fetch(`${API}/query`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ query, client: state.client }),
    });

    if (!res.ok) {
      // The error body may be absent, not JSON, or JSON null.
      const payload = await res.json().catch(() => null);
      const detail = payload && payload.detail ? payload.detail : res.statusText;
      // `detail` is not always a string (validation errors arrive as a list
      // of objects); serialise it so the user does not see "[object Object]".
      const message = typeof detail === 'string' ? detail : JSON.stringify(detail);
      throw new Error(message || 'Request failed');
    }

    const data = await res.json();
    thinkingEl.remove();
    appendBotMessage(data);
    renderEval(data);
  } catch (err) {
    thinkingEl.remove();
    appendMessage('bot', `Error: ${err.message}`);
  } finally {
    setLoading(false);
    // Disabling the input dropped keyboard focus; give it back so the user
    // can keep typing without clicking.
    input.focus();
  }
}
|
| 94 |
+
|
| 95 |
+
// ── Messages ──────────────────────────────────────────────────────────────
|
| 96 |
+
|
| 97 |
+
/**
 * Append a plain chat message to the message list and scroll it into view.
 *
 * @param {string} role - used as a CSS class; 'user' is labelled "You",
 *   anything else "Bot"
 * @param {string} text - message body, HTML-escaped before insertion
 * @returns {HTMLDivElement} the newly created message element
 */
function appendMessage(role, text) {
  const author = role === 'user' ? 'You' : 'Bot';
  const body = escapeHtml(text);

  const node = document.createElement('div');
  node.className = `message ${role}`;
  node.innerHTML = `
    <div class="bubble">${body}</div>
    <div class="meta">${author}</div>
  `;

  getMessages().appendChild(node);
  scrollMessages();
  return node;
}
|
| 108 |
+
|
| 109 |
+
/**
 * Append the bot's answer together with its overall pass/fail verdict.
 *
 * Both the answer and the client display name originate from the backend and
 * are HTML-escaped before being placed into innerHTML.
 *
 * @param {object} data - /query response; reads `answer`, `client_display`
 *   and `evaluation.overall_pass`
 */
function appendBotMessage(data) {
  const overall = data.evaluation.overall_pass;
  const verdictClass = overall ? 'pass' : 'fail';
  const verdictLabel = overall ? '✓ All checks passed' : '✗ Checks failed';

  const el = document.createElement('div');
  el.className = 'message bot';
  el.innerHTML = `
    <div class="bubble">${escapeHtml(data.answer)}</div>
    <div class="verdict ${verdictClass}">${verdictLabel}</div>
    <div class="meta">${escapeHtml(data.client_display)}</div>
  `;
  getMessages().appendChild(el);
  scrollMessages();
}
|
| 124 |
+
|
| 125 |
+
/**
 * Append the three-dot "thinking" placeholder to the message list.
 *
 * @returns {HTMLDivElement} the placeholder, so the caller can remove it
 */
function appendThinking() {
  const placeholder = document.createElement('div');
  placeholder.className = 'message bot';
  placeholder.innerHTML = `
    <div class="thinking">
      <span></span><span></span><span></span>
    </div>
  `;

  const list = getMessages();
  list.appendChild(placeholder);
  scrollMessages();
  return placeholder;
}
|
| 137 |
+
|
| 138 |
+
// ── Eval panel ────────────────────────────────────────────────────────────
|
| 139 |
+
|
| 140 |
+
// Human-readable card titles, keyed by the metric id used in the /query
// response. renderEval falls back to the raw id for metrics not listed here.
const METRIC_LABELS = {
  'pii_leakage': 'PII Leakage',
  'token_budget': 'Token Budget',
  'answer_relevancy': 'Answer Relevancy',
  'faithfulness': 'Faithfulness',
  'chain_terminology': 'Chain Terminology',
};
|
| 147 |
+
|
| 148 |
+
// One-line explanation shown under each metric title, keyed by metric id.
// renderEval shows an empty description for metrics not listed here.
const METRIC_DESC = {
  'pii_leakage': 'Regex scan — no PII in response',
  'token_budget': 'Response within token ceiling',
  'answer_relevancy': 'Cosine similarity: query ↔ response',
  'faithfulness': 'Claude judge: grounded in retrieved context?',
  'chain_terminology': 'Deterministic: client-specific terms used',
};
|
| 155 |
+
|
| 156 |
+
/**
 * Render the evaluation side panel: one card per metric plus the list of
 * retrieved sources.
 *
 * Every backend-supplied string is HTML-escaped before it reaches innerHTML,
 * including the raw metric key used when no label is known. The score bar
 * width is clamped to 0–100% so an out-of-range or non-numeric score cannot
 * produce an invalid width. A missing `detail` renders as empty text and a
 * missing `sources` list renders as "No sources retrieved".
 *
 * @param {object} data - /query response; reads `evaluation.metrics`
 *   (key -> { score, detail }) and `sources` ([{ title, score }])
 */
function renderEval(data) {
  const metrics = data.evaluation.metrics;
  const sources = data.sources || [];

  const metricCards = Object.entries(metrics).map(([key, m]) => {
    const cls = scoreClass(m.score, key);
    const pct = Math.round(m.score * 100);
    // NaN survives Math.min/Math.max, hence the trailing `|| 0`.
    const barWidth = Math.min(100, Math.max(0, pct)) || 0;
    const label = escapeHtml(METRIC_LABELS[key] || key);
    const detail = m.detail == null ? '' : m.detail;
    return `
      <div class="metric-card ${cls}">
        <div class="metric-header">
          <span class="metric-name">${label}</span>
          <span class="score-badge ${cls}">${pct}%</span>
        </div>
        <div class="metric-detail">${escapeHtml(METRIC_DESC[key] || '')}</div>
        <div class="metric-detail" style="margin-top:4px;color:#6a8aaa">${escapeHtml(detail)}</div>
        <div class="score-bar-wrap">
          <div class="score-bar-bg">
            <div class="score-bar-fill ${cls}" style="width:${barWidth}%"></div>
          </div>
        </div>
      </div>
    `;
  }).join('');

  const sourceItems = sources.map(s => `
    <div class="source-item">
      <span class="source-title">${escapeHtml(s.title)}</span>
      <span class="source-score">${(s.score * 100).toFixed(0)}%</span>
    </div>
  `).join('');

  document.getElementById('eval-body').innerHTML = `
    <div class="eval-content">
      ${metricCards}
      <div class="sources-section">
        <div class="sources-label">Retrieved Sources</div>
        ${sourceItems || '<div style="font-size:11px;color:#8aabcc">No sources retrieved</div>'}
      </div>
    </div>
  `;
}
|
| 197 |
+
|
| 198 |
+
/**
 * Map a metric score to the CSS class used for its card, badge and bar.
 *
 * @param {number} score - metric score
 * @param {string} metric - metric id
 * @returns {'pass'|'warn'|'fail'}
 */
function scoreClass(score, metric) {
  // pii_leakage is binary: only an exact 1.0 passes, there is no "warn".
  if (metric === 'pii_leakage') {
    if (score === 1.0) {
      return 'pass';
    }
    return 'fail';
  }

  return score >= 0.75 ? 'pass'
    : score >= 0.45 ? 'warn'
    : 'fail';
}
|
| 205 |
+
|
| 206 |
+
// ── Helpers ───────────────────────────────────────────────────────────────
|
| 207 |
+
|
| 208 |
+
/**
 * Record the in-flight flag and disable or enable the send button and the
 * query input to match.
 *
 * @param {boolean} val - true while a request is running
 */
function setLoading(val) {
  state.loading = val;
  for (const id of ['send-btn', 'query-input']) {
    document.getElementById(id).disabled = val;
  }
}
|
| 213 |
+
|
| 214 |
+
/**
 * Look up the chat message container.
 *
 * @returns {HTMLElement|null} the element with id "messages"
 */
function getMessages() {
  const container = document.getElementById('messages');
  return container;
}
|
| 217 |
+
|
| 218 |
+
/**
 * Scroll the chat message container to its bottom so the newest entry is
 * visible.
 */
function scrollMessages() {
  const list = getMessages();
  const bottom = list.scrollHeight;
  list.scrollTop = bottom;
}
|
| 222 |
+
|
| 223 |
+
/**
 * Upper-case the first character of a string and leave the rest untouched.
 *
 * @param {string} s
 * @returns {string}
 */
function capitalize(s) {
  const head = s.slice(0, 1);
  const tail = s.slice(1);
  return `${head.toUpperCase()}${tail}`;
}
|
| 226 |
+
|
| 227 |
+
/**
 * Escape a value for safe insertion into HTML text or a quoted attribute.
 *
 * The value is coerced with String() and the five HTML-significant
 * characters are replaced by their entities in a single pass, so an
 * ampersand produced by one replacement is never escaped again.
 *
 * @param {*} str - value to escape
 * @returns {string} the escaped text
 */
function escapeHtml(str) {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(str).replace(/[&<>"']/g, ch => entities[ch]);
}
|
| 234 |
+
|
| 235 |
+
boot();
|
ui/index.html
ADDED
|
@@ -0,0 +1,410 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>AI Response Validator</title>
|
| 7 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 8 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800;900&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
|
| 9 |
+
<style>
|
| 10 |
+
* { margin: 0; padding: 0; box-sizing: border-box; }
|
| 11 |
+
|
| 12 |
+
body {
|
| 13 |
+
font-family: 'Inter', sans-serif;
|
| 14 |
+
background: #eef4fc;
|
| 15 |
+
color: #1a1a1a;
|
| 16 |
+
height: 100vh;
|
| 17 |
+
display: grid;
|
| 18 |
+
grid-template-rows: auto 1fr;
|
| 19 |
+
overflow: hidden;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
/* ── Header ── */
|
| 23 |
+
header {
|
| 24 |
+
background: #fff;
|
| 25 |
+
border-bottom: 2px solid #1e3a5f;
|
| 26 |
+
padding: 14px 28px;
|
| 27 |
+
display: flex;
|
| 28 |
+
align-items: center;
|
| 29 |
+
justify-content: space-between;
|
| 30 |
+
gap: 24px;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
.header-left h1 {
|
| 34 |
+
font-size: 22px;
|
| 35 |
+
font-weight: 900;
|
| 36 |
+
color: #1a1a1a;
|
| 37 |
+
letter-spacing: -0.5px;
|
| 38 |
+
}
|
| 39 |
+
.header-left h1 span { color: #3a6ea8; }
|
| 40 |
+
.header-left .tagline {
|
| 41 |
+
font-size: 11px;
|
| 42 |
+
color: #8aabcc;
|
| 43 |
+
margin-top: 2px;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
/* ── Domain / Client switcher ── */
|
| 47 |
+
.switcher {
|
| 48 |
+
display: flex;
|
| 49 |
+
align-items: center;
|
| 50 |
+
gap: 10px;
|
| 51 |
+
flex-wrap: wrap;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
.switcher label {
|
| 55 |
+
font-size: 10px;
|
| 56 |
+
font-weight: 700;
|
| 57 |
+
text-transform: uppercase;
|
| 58 |
+
letter-spacing: 1.5px;
|
| 59 |
+
color: #8aabcc;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.btn-group {
|
| 63 |
+
display: flex;
|
| 64 |
+
border: 1px solid #c8dff5;
|
| 65 |
+
border-radius: 5px;
|
| 66 |
+
overflow: hidden;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
.btn-group button {
|
| 70 |
+
background: #fff;
|
| 71 |
+
border: none;
|
| 72 |
+
border-right: 1px solid #c8dff5;
|
| 73 |
+
padding: 6px 14px;
|
| 74 |
+
font-size: 12px;
|
| 75 |
+
font-weight: 600;
|
| 76 |
+
color: #4a6a8a;
|
| 77 |
+
cursor: pointer;
|
| 78 |
+
transition: background 0.15s, color 0.15s;
|
| 79 |
+
}
|
| 80 |
+
.btn-group button:last-child { border-right: none; }
|
| 81 |
+
.btn-group button.active {
|
| 82 |
+
background: #1e3a5f;
|
| 83 |
+
color: #fff;
|
| 84 |
+
}
|
| 85 |
+
.btn-group button:hover:not(.active) { background: #eef4fc; }
|
| 86 |
+
|
| 87 |
+
.divider-v {
|
| 88 |
+
width: 1px;
|
| 89 |
+
height: 28px;
|
| 90 |
+
background: #c8dff5;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
/* ── Main layout ── */
|
| 94 |
+
main {
|
| 95 |
+
display: grid;
|
| 96 |
+
grid-template-columns: 1fr 360px;
|
| 97 |
+
overflow: hidden;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
/* ── Chat panel ── */
|
| 101 |
+
.chat-panel {
|
| 102 |
+
display: flex;
|
| 103 |
+
flex-direction: column;
|
| 104 |
+
border-right: 1px solid #c8dff5;
|
| 105 |
+
overflow: hidden;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
.messages {
|
| 109 |
+
flex: 1;
|
| 110 |
+
overflow-y: auto;
|
| 111 |
+
padding: 24px 28px;
|
| 112 |
+
display: flex;
|
| 113 |
+
flex-direction: column;
|
| 114 |
+
gap: 16px;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.message {
|
| 118 |
+
display: flex;
|
| 119 |
+
flex-direction: column;
|
| 120 |
+
gap: 4px;
|
| 121 |
+
max-width: 80%;
|
| 122 |
+
}
|
| 123 |
+
.message.user { align-self: flex-end; }
|
| 124 |
+
.message.bot { align-self: flex-start; }
|
| 125 |
+
|
| 126 |
+
.message .bubble {
|
| 127 |
+
padding: 12px 16px;
|
| 128 |
+
border-radius: 8px;
|
| 129 |
+
font-size: 13.5px;
|
| 130 |
+
line-height: 1.6;
|
| 131 |
+
}
|
| 132 |
+
.message.user .bubble {
|
| 133 |
+
background: #1e3a5f;
|
| 134 |
+
color: #fff;
|
| 135 |
+
border-radius: 8px 8px 2px 8px;
|
| 136 |
+
}
|
| 137 |
+
.message.bot .bubble {
|
| 138 |
+
background: #fff;
|
| 139 |
+
color: #1a1a1a;
|
| 140 |
+
border: 1px solid #c8dff5;
|
| 141 |
+
border-radius: 8px 8px 8px 2px;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
.message .meta {
|
| 145 |
+
font-size: 10px;
|
| 146 |
+
color: #8aabcc;
|
| 147 |
+
padding: 0 4px;
|
| 148 |
+
}
|
| 149 |
+
.message.user .meta { text-align: right; }
|
| 150 |
+
|
| 151 |
+
/* overall pass/fail badge on bot message */
|
| 152 |
+
.verdict {
|
| 153 |
+
display: inline-flex;
|
| 154 |
+
align-items: center;
|
| 155 |
+
gap: 5px;
|
| 156 |
+
font-size: 10px;
|
| 157 |
+
font-weight: 700;
|
| 158 |
+
padding: 2px 8px;
|
| 159 |
+
border-radius: 3px;
|
| 160 |
+
margin-top: 4px;
|
| 161 |
+
align-self: flex-start;
|
| 162 |
+
}
|
| 163 |
+
.verdict.pass { background: #f1f8f1; color: #2e7d32; border: 1px solid #c8e6c9; }
|
| 164 |
+
.verdict.fail { background: #fdf1f1; color: #c62828; border: 1px solid #ffcdd2; }
|
| 165 |
+
.verdict.warn { background: #fffbf0; color: #a06000; border: 1px solid #ffe082; }
|
| 166 |
+
|
| 167 |
+
/* ── Input bar ── */
|
| 168 |
+
.input-bar {
|
| 169 |
+
padding: 16px 28px;
|
| 170 |
+
background: #fff;
|
| 171 |
+
border-top: 1px solid #c8dff5;
|
| 172 |
+
display: flex;
|
| 173 |
+
gap: 10px;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
.input-bar input {
|
| 177 |
+
flex: 1;
|
| 178 |
+
padding: 10px 14px;
|
| 179 |
+
border: 1px solid #c8dff5;
|
| 180 |
+
border-radius: 6px;
|
| 181 |
+
font-size: 13.5px;
|
| 182 |
+
font-family: 'Inter', sans-serif;
|
| 183 |
+
outline: none;
|
| 184 |
+
transition: border-color 0.15s;
|
| 185 |
+
}
|
| 186 |
+
.input-bar input:focus { border-color: #3a6ea8; }
|
| 187 |
+
.input-bar input:disabled { background: #f5f9ff; color: #8aabcc; }
|
| 188 |
+
|
| 189 |
+
.input-bar button {
|
| 190 |
+
padding: 10px 20px;
|
| 191 |
+
background: #1e3a5f;
|
| 192 |
+
color: #fff;
|
| 193 |
+
border: none;
|
| 194 |
+
border-radius: 6px;
|
| 195 |
+
font-size: 13px;
|
| 196 |
+
font-weight: 700;
|
| 197 |
+
cursor: pointer;
|
| 198 |
+
transition: background 0.15s;
|
| 199 |
+
white-space: nowrap;
|
| 200 |
+
}
|
| 201 |
+
.input-bar button:hover:not(:disabled) { background: #3a6ea8; }
|
| 202 |
+
.input-bar button:disabled { background: #93b8d8; cursor: not-allowed; }
|
| 203 |
+
|
| 204 |
+
/* ── Eval panel ── */
|
| 205 |
+
.eval-panel {
|
| 206 |
+
background: #fff;
|
| 207 |
+
overflow-y: auto;
|
| 208 |
+
display: flex;
|
| 209 |
+
flex-direction: column;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
.eval-panel .panel-header {
|
| 213 |
+
padding: 16px 20px 12px;
|
| 214 |
+
border-bottom: 1px solid #e8f2ff;
|
| 215 |
+
font-size: 10px;
|
| 216 |
+
font-weight: 800;
|
| 217 |
+
text-transform: uppercase;
|
| 218 |
+
letter-spacing: 2px;
|
| 219 |
+
color: #8aabcc;
|
| 220 |
+
position: sticky;
|
| 221 |
+
top: 0;
|
| 222 |
+
background: #fff;
|
| 223 |
+
z-index: 1;
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
.eval-empty {
|
| 227 |
+
flex: 1;
|
| 228 |
+
display: flex;
|
| 229 |
+
flex-direction: column;
|
| 230 |
+
align-items: center;
|
| 231 |
+
justify-content: center;
|
| 232 |
+
gap: 10px;
|
| 233 |
+
color: #b0cce8;
|
| 234 |
+
padding: 40px 20px;
|
| 235 |
+
text-align: center;
|
| 236 |
+
}
|
| 237 |
+
.eval-empty .icon { font-size: 36px; }
|
| 238 |
+
.eval-empty p { font-size: 12px; line-height: 1.6; }
|
| 239 |
+
|
| 240 |
+
.eval-content { padding: 16px 20px; display: flex; flex-direction: column; gap: 20px; }
|
| 241 |
+
|
| 242 |
+
/* Metric card */
|
| 243 |
+
.metric-card {
|
| 244 |
+
border: 1px solid #e0eef8;
|
| 245 |
+
border-left: 3px solid #1e3a5f;
|
| 246 |
+
border-radius: 0 6px 6px 0;
|
| 247 |
+
padding: 12px 14px;
|
| 248 |
+
background: #f5f9ff;
|
| 249 |
+
}
|
| 250 |
+
.metric-card.pass { border-left-color: #4caf50; background: #f0faf3; }
|
| 251 |
+
.metric-card.fail { border-left-color: #c62828; background: #fdf5f5; }
|
| 252 |
+
.metric-card.warn { border-left-color: #f9a825; background: #fffdf0; }
|
| 253 |
+
|
| 254 |
+
.metric-card .metric-header {
|
| 255 |
+
display: flex;
|
| 256 |
+
justify-content: space-between;
|
| 257 |
+
align-items: center;
|
| 258 |
+
margin-bottom: 6px;
|
| 259 |
+
}
|
| 260 |
+
.metric-card .metric-name {
|
| 261 |
+
font-size: 12px;
|
| 262 |
+
font-weight: 800;
|
| 263 |
+
color: #1e3a5f;
|
| 264 |
+
font-family: 'JetBrains Mono', monospace;
|
| 265 |
+
}
|
| 266 |
+
.metric-card.pass .metric-name { color: #2e7d32; }
|
| 267 |
+
.metric-card.fail .metric-name { color: #c62828; }
|
| 268 |
+
|
| 269 |
+
.score-badge {
|
| 270 |
+
font-family: 'JetBrains Mono', monospace;
|
| 271 |
+
font-size: 11px;
|
| 272 |
+
font-weight: 700;
|
| 273 |
+
padding: 2px 8px;
|
| 274 |
+
border-radius: 3px;
|
| 275 |
+
border: 1px solid;
|
| 276 |
+
}
|
| 277 |
+
.score-badge.pass { background: #f1f8f1; color: #2e7d32; border-color: #c8e6c9; }
|
| 278 |
+
.score-badge.fail { background: #fdf1f1; color: #c62828; border-color: #ffcdd2; }
|
| 279 |
+
.score-badge.warn { background: #fffbf0; color: #a06000; border-color: #ffe082; }
|
| 280 |
+
|
| 281 |
+
.metric-card .metric-detail {
|
| 282 |
+
font-size: 11px;
|
| 283 |
+
color: #4a6080;
|
| 284 |
+
line-height: 1.5;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
/* Score bar */
|
| 288 |
+
.score-bar-wrap { margin-top: 8px; }
|
| 289 |
+
.score-bar-bg {
|
| 290 |
+
height: 4px;
|
| 291 |
+
background: #e0eef8;
|
| 292 |
+
border-radius: 2px;
|
| 293 |
+
overflow: hidden;
|
| 294 |
+
}
|
| 295 |
+
.score-bar-fill {
|
| 296 |
+
height: 100%;
|
| 297 |
+
border-radius: 2px;
|
| 298 |
+
transition: width 0.4s ease;
|
| 299 |
+
}
|
| 300 |
+
.score-bar-fill.pass { background: #4caf50; }
|
| 301 |
+
.score-bar-fill.fail { background: #c62828; }
|
| 302 |
+
.score-bar-fill.warn { background: #f9a825; }
|
| 303 |
+
|
| 304 |
+
/* Sources */
|
| 305 |
+
.sources-section .sources-label {
|
| 306 |
+
font-size: 10px;
|
| 307 |
+
font-weight: 700;
|
| 308 |
+
text-transform: uppercase;
|
| 309 |
+
letter-spacing: 1.5px;
|
| 310 |
+
color: #8aabcc;
|
| 311 |
+
margin-bottom: 8px;
|
| 312 |
+
}
|
| 313 |
+
.source-item {
|
| 314 |
+
display: flex;
|
| 315 |
+
justify-content: space-between;
|
| 316 |
+
align-items: center;
|
| 317 |
+
padding: 7px 10px;
|
| 318 |
+
background: #f5f9ff;
|
| 319 |
+
border: 1px solid #e0eef8;
|
| 320 |
+
border-radius: 5px;
|
| 321 |
+
margin-bottom: 5px;
|
| 322 |
+
font-size: 11.5px;
|
| 323 |
+
}
|
| 324 |
+
.source-item .source-title { color: #2a4a6a; font-weight: 500; }
|
| 325 |
+
.source-item .source-score {
|
| 326 |
+
font-family: 'JetBrains Mono', monospace;
|
| 327 |
+
font-size: 10px;
|
| 328 |
+
color: #8aabcc;
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
/* Thinking indicator */
|
| 332 |
+
.thinking {
|
| 333 |
+
display: flex;
|
| 334 |
+
gap: 5px;
|
| 335 |
+
align-items: center;
|
| 336 |
+
padding: 12px 16px;
|
| 337 |
+
background: #fff;
|
| 338 |
+
border: 1px solid #c8dff5;
|
| 339 |
+
border-radius: 8px 8px 8px 2px;
|
| 340 |
+
width: fit-content;
|
| 341 |
+
}
|
| 342 |
+
.thinking span {
|
| 343 |
+
width: 7px; height: 7px;
|
| 344 |
+
background: #3a6ea8;
|
| 345 |
+
border-radius: 50%;
|
| 346 |
+
animation: bounce 1.2s infinite ease-in-out;
|
| 347 |
+
}
|
| 348 |
+
.thinking span:nth-child(2) { animation-delay: 0.2s; }
|
| 349 |
+
.thinking span:nth-child(3) { animation-delay: 0.4s; }
|
| 350 |
+
|
| 351 |
+
@keyframes bounce {
|
| 352 |
+
0%, 80%, 100% { transform: scale(0.6); opacity: 0.4; }
|
| 353 |
+
40% { transform: scale(1); opacity: 1; }
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
/* Scrollbar */
|
| 357 |
+
::-webkit-scrollbar { width: 5px; }
|
| 358 |
+
::-webkit-scrollbar-track { background: transparent; }
|
| 359 |
+
::-webkit-scrollbar-thumb { background: #c8dff5; border-radius: 3px; }
|
| 360 |
+
</style>
|
| 361 |
+
</head>
|
| 362 |
+
<body>
|
| 363 |
+
|
| 364 |
+
<header>
|
| 365 |
+
<div class="header-left">
|
| 366 |
+
<h1>AI Response <span>Validator</span></h1>
|
| 367 |
+
<div class="tagline">Domain-agnostic RAG evaluation · real-time L1 metrics · RosettaStone terminology</div>
|
| 368 |
+
</div>
|
| 369 |
+
|
| 370 |
+
<div class="switcher">
|
| 371 |
+
<label>Domain</label>
|
| 372 |
+
<div class="btn-group" id="domain-switcher"></div>
|
| 373 |
+
|
| 374 |
+
<div class="divider-v"></div>
|
| 375 |
+
|
| 376 |
+
<label>Client</label>
|
| 377 |
+
<div class="btn-group" id="client-switcher"></div>
|
| 378 |
+
</div>
|
| 379 |
+
</header>
|
| 380 |
+
|
| 381 |
+
<main>
|
| 382 |
+
<div class="chat-panel">
|
| 383 |
+
<div class="messages" id="messages">
|
| 384 |
+
<!-- populated by app.js -->
|
| 385 |
+
</div>
|
| 386 |
+
<div class="input-bar">
|
| 387 |
+
<input
|
| 388 |
+
type="text"
|
| 389 |
+
id="query-input"
|
| 390 |
+
placeholder="Ask something…"
|
| 391 |
+
autocomplete="off"
|
| 392 |
+
/>
|
| 393 |
+
<button id="send-btn">Send</button>
|
| 394 |
+
</div>
|
| 395 |
+
</div>
|
| 396 |
+
|
| 397 |
+
<div class="eval-panel">
|
| 398 |
+
<div class="panel-header">Evaluation</div>
|
| 399 |
+
<div id="eval-body">
|
| 400 |
+
<div class="eval-empty">
|
| 401 |
+
<div class="icon">◎</div>
|
| 402 |
+
<p>Send a message to see<br>real-time metric evaluation.</p>
|
| 403 |
+
</div>
|
| 404 |
+
</div>
|
| 405 |
+
</div>
|
| 406 |
+
</main>
|
| 407 |
+
|
| 408 |
+
<script src="/static/app.js"></script>
|
| 409 |
+
</body>
|
| 410 |
+
</html>
|