mbochniak01 committed on
Commit
ebe934f
·
1 Parent(s): b917936

Add full RAG evaluation pipeline with L1 metrics and UI

Browse files

- FastAPI backend: retrieve → generate → grade pipeline
- In-memory semantic retrieval (sentence-transformers, all-MiniLM-L6-v2)
- L1 graders: pii_leakage, token_budget, answer_relevancy, faithfulness, chain_terminology
- RosettaStone: deterministic client-specific terminology validation
- Two domains (retail, pharma) with two fictional clients each
- Plain HTML/JS frontend with real-time metric panel
- 20-pair golden dataset for L2 batch evaluation
- Docker config for HF Spaces (port 7860)

ARCHITECTURE.md ADDED
File without changes
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# System deps for sentence-transformers (tokenizers uses Rust bindings)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Dependencies are installed before the application sources are copied.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download the embedding model so first request isn't slow on HF Spaces
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

COPY knowledge/ ./knowledge/
COPY backend/ ./backend/
COPY ui/ ./ui/

# Run from backend/ so that `app:app` in CMD resolves to backend/app.py.
WORKDIR /app/backend

# HF Spaces requires port 7860
# NOTE: CMD below hard-codes 7860 and does not read $PORT.
ENV PORT=7860
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,23 @@
1
  ---
2
- title: Ai Response Validator
3
- emoji: 📉
4
  colorFrom: blue
5
- colorTo: green
6
  sdk: docker
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: AI Response Validator
3
+ emoji: 🔍
4
  colorFrom: blue
5
+ colorTo: blue
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  ---
10
 
11
+ # AI Response Validator
12
+
13
+ Domain-agnostic RAG evaluation system. Validates AI responses for correctness,
14
+ faithfulness, and client-specific terminology across retail and pharma domains.
15
+
16
+ **Live demo:** select a domain and client, then ask a question in natural language.
17
+ Each response is evaluated in real time across 5 metrics:
18
+
19
+ - **PII Leakage** — regex scan, no personal data in responses
20
+ - **Token Budget** — response within ceiling
21
+ - **Answer Relevancy** — cosine similarity between query and response
22
+ - **Faithfulness** — Claude judge: is the answer grounded in retrieved context?
23
+ - **Chain Terminology** — deterministic check that the bot uses client-specific terms
backend/app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from contextlib import asynccontextmanager
4
+ from pathlib import Path
5
+
6
+ import anthropic
7
+ from fastapi import FastAPI, HTTPException
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ from fastapi.responses import FileResponse
10
+ from fastapi.staticfiles import StaticFiles
11
+ from pydantic import BaseModel
12
+
13
+ from config import DOMAIN_CLIENTS, CLIENT_DOMAIN, DISPLAY_NAMES
14
+ from pipeline import run
15
+
16
+ log = logging.getLogger(__name__)
17
+ logging.basicConfig(level=logging.INFO)
18
+
19
+ UI_DIR = Path(__file__).parent.parent / "ui"
20
+
21
+
22
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared Anthropic client at startup and store it on ``app.state``.

    Raises:
        RuntimeError: if ``ANTHROPIC_API_KEY`` is missing or empty.
    """
    if not (key := os.environ.get("ANTHROPIC_API_KEY")):
        raise RuntimeError("ANTHROPIC_API_KEY not set")

    client = anthropic.Anthropic(api_key=key)
    app.state.anthropic = client
    log.info("Anthropic client ready")
    yield
30
+
31
+
32
+ app = FastAPI(title="AI Response Validator", lifespan=lifespan)
33
+
34
+ app.add_middleware(
35
+ CORSMiddleware,
36
+ allow_origins=["*"],
37
+ allow_methods=["GET", "POST"],
38
+ allow_headers=["*"],
39
+ )
40
+
41
+
42
class QueryRequest(BaseModel):
    # Request body accepted by POST /query.
    query: str  # user question; the handler rejects blank or whitespace-only text
    client: str  # client id; the handler rejects ids that are not keys of CLIENT_DOMAIN
45
+
46
+
47
class QueryResponse(BaseModel):
    # Response body of POST /query (used as the route's response_model).
    query: str  # the query that was processed
    client: str  # client id the query was run for
    client_display: str  # human-readable client name
    answer: str  # generated answer text
    sources: list[dict]  # one entry per retrieved document
    evaluation: dict  # grading summary for the answer
54
+
55
+
56
@app.get("/health")
def health():
    """Liveness probe; always answers with a static ``ok`` status."""
    payload = {"status": "ok"}
    return payload
59
+
60
+
61
@app.get("/config")
def get_config():
    """Describe every domain and its clients so the UI can build its switcher.

    Returns:
        ``{"domains": {domain: [{"id": client_id, "display": name}, ...]}}``
    """
    domains = {}
    for domain, clients in DOMAIN_CLIENTS.items():
        entries = []
        for client_id in clients:
            entries.append({"id": client_id, "display": DISPLAY_NAMES[client_id]})
        domains[domain] = entries
    return {"domains": domains}
70
+
71
+
72
@app.post("/query", response_model=QueryResponse)
def handle_query(req: QueryRequest):
    """Validate the request, run the RAG pipeline, and return its payload.

    Raises:
        HTTPException: 400 for an unknown client id or an empty query.
    """
    # Check order matters: an unknown client is reported before an empty query.
    if req.client not in CLIENT_DOMAIN:
        raise HTTPException(status_code=400, detail=f"Unknown client: {req.client!r}")

    cleaned = req.query.strip()
    if not cleaned:
        raise HTTPException(status_code=400, detail="Query cannot be empty")

    outcome = run(
        query=cleaned,
        client=req.client,
        anthropic_client=app.state.anthropic,
    )
    return outcome.response_payload
85
+
86
+
87
+ app.mount("/static", StaticFiles(directory=UI_DIR), name="static")
88
+
89
+
90
@app.get("/")
def root():
    """Serve the single-page UI's ``index.html`` from the UI directory."""
    index_page = UI_DIR / "index.html"
    return FileResponse(index_page)
backend/config.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ KNOWLEDGE_ROOT = Path(__file__).parent.parent / "knowledge"
4
+ EMBEDDER_MODEL = "all-MiniLM-L6-v2"
5
+
6
+ DOMAIN_CLIENTS: dict[str, list[str]] = {
7
+ "retail": ["novamart", "shelfwise"],
8
+ "pharma": ["clinixone", "pharmalink"],
9
+ }
10
+
11
+ CLIENT_DOMAIN: dict[str, str] = {
12
+ client: domain
13
+ for domain, clients in DOMAIN_CLIENTS.items()
14
+ for client in clients
15
+ }
16
+
17
+ DISPLAY_NAMES: dict[str, str] = {
18
+ "novamart": "NovaMart",
19
+ "shelfwise": "ShelfWise",
20
+ "clinixone": "ClinixOne",
21
+ "pharmalink": "PharmaLink",
22
+ }
23
+
24
+
25
def term_catalog_path(domain: str) -> Path:
    """Return the path of the terminology catalog YAML for ``domain``."""
    return KNOWLEDGE_ROOT.joinpath(domain, "term-catalog.yaml")
27
+
28
+
29
def features_path(domain: str) -> Path:
    """Return the path of the knowledge-base features YAML for ``domain``."""
    return KNOWLEDGE_ROOT.joinpath(domain, "features.yaml")
31
+
32
+
33
def domain_for(client: str) -> str:
    """Return the domain that ``client`` belongs to.

    Raises:
        ValueError: if ``client`` is not a known client id.
    """
    if client in CLIENT_DOMAIN:
        return CLIENT_DOMAIN[client]
    raise ValueError(f"Unknown client: {client!r}. Valid: {list(CLIENT_DOMAIN)}")
backend/grader.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ L1 graders — run live on every query.
3
+
4
+ Metrics:
5
+ pii_leakage — regex scan for PII patterns in response
6
+ token_budget — response within allowed token ceiling
7
+ answer_relevancy — cosine similarity between query and response embeddings
8
+ faithfulness — Claude judge: is response grounded in retrieved context?
9
+ chain_terminology — deterministic: client-specific terms used (via RosettaStone)
10
+ """
11
+
12
+ import re
13
+ import json
14
+ import logging
15
+ from dataclasses import dataclass, field
16
+
17
+ import anthropic
18
+ from sentence_transformers import SentenceTransformer
19
+ from sklearn.metrics.pairwise import cosine_similarity
20
+
21
+ from config import EMBEDDER_MODEL
22
+ from rosetta import check_terminology
23
+
24
+ log = logging.getLogger(__name__)
25
+
26
+ _embedder: SentenceTransformer | None = None
27
+
28
+
29
def get_embedder() -> SentenceTransformer:
    """Return the module-wide sentence-transformer, creating it on first use."""
    global _embedder
    if _embedder is not None:
        return _embedder
    _embedder = SentenceTransformer(EMBEDDER_MODEL)
    return _embedder
35
+
36
+
37
@dataclass(slots=True)
class GradeResult:
    # Outcome of one grader for one response.
    metric: str  # grader identifier, e.g. "pii_leakage"
    passed: bool  # whether the grader's pass condition held
    score: float  # numeric score reported by the grader
    detail: str = ""  # human-readable explanation of the outcome
    metadata: dict = field(default_factory=dict)  # optional grader-specific extras
44
+
45
+
46
@dataclass(slots=True)
class GradeReport:
    """All grader results collected for one query/client pair."""

    client: str
    query: str
    results: list[GradeResult] = field(default_factory=list)

    @property
    def overall(self) -> bool:
        """True when no result failed (an empty report therefore passes)."""
        for result in self.results:
            if not result.passed:
                return False
        return True

    @property
    def summary(self) -> dict:
        """Return the overall verdict plus per-metric passed/score/detail."""
        per_metric = {}
        for result in self.results:
            per_metric[result.metric] = {
                "passed": result.passed,
                "score": round(result.score, 3),
                "detail": result.detail,
            }
        return {"overall_pass": self.overall, "metrics": per_metric}
65
+
66
+
67
# (regex, label) pairs scanned by grade_pii_leakage; any match fails the metric.
_PII_PATTERNS = [
    (r"\b\d{3}-\d{2}-\d{4}\b", "SSN"),
    # 16 digits, either contiguous or in groups of four separated by a single
    # space or hyphen (the usual way card numbers are written out).
    (r"\b\d{4}[ \-]?\d{4}[ \-]?\d{4}[ \-]?\d{4}\b", "credit card"),
    (r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b", "email"),
    (r"\b\d{3}[\s.\-]?\d{3}[\s.\-]?\d{4}\b", "phone"),
]
73
+
74
+ TOKEN_BUDGET = 512
75
+ RELEVANCY_THRESHOLD = 0.45
76
+ FAITHFULNESS_THRESHOLD = 0.7
77
+
78
+ FAITHFULNESS_PROMPT = """\
79
+ You are a faithfulness evaluator. Your task is to assess whether an AI response
80
+ is fully grounded in the provided context and contains no unsupported claims.
81
+
82
+ Context:
83
+ <context>
84
+ {context}
85
+ </context>
86
+
87
+ Response to evaluate:
88
+ <response>
89
+ {response}
90
+ </response>
91
+
92
+ Rules:
93
+ - A claim is faithful if it can be directly inferred from the context.
94
+ - A claim is unfaithful if it introduces facts not present in the context.
95
+ - Ignore stylistic differences; focus only on factual grounding.
96
+
97
+ Respond with JSON only, no explanation outside the JSON:
98
+ {{
99
+ "faithful": true | false,
100
+ "score": 0.0-1.0,
101
+ "unsupported_claims": ["claim1", "claim2"]
102
+ }}"""
103
+
104
+
105
def grade_pii_leakage(response: str) -> GradeResult:
    """Search the response for each known PII pattern; any hit fails the metric."""
    hits = []
    for regex, label in _PII_PATTERNS:
        if re.search(regex, response):
            hits.append(label)

    if hits:
        return GradeResult(
            metric="pii_leakage",
            passed=False,
            score=0.0,
            detail=f"Detected: {', '.join(hits)}",
        )
    return GradeResult(
        metric="pii_leakage",
        passed=True,
        score=1.0,
        detail="Clean",
    )
114
+
115
+
116
def grade_token_budget(response: str, budget: int = TOKEN_BUDGET) -> GradeResult:
    """Fail if the estimated token count exceeds ``budget``.

    Tokens are approximated as ``len(response) // 4``.

    Args:
        response: Text to measure.
        budget: Maximum allowed approximate token count.

    Returns:
        A ``token_budget`` result. The score is 1.0 when within budget;
        otherwise it is ``budget / approx_tokens`` (clamped at 0.0), so it
        shrinks towards 0 the further the response overshoots.
    """
    approx_tokens = len(response) // 4
    passed = approx_tokens <= budget
    if passed:
        score = 1.0
    elif approx_tokens > 0:
        # The previous formula, 1 - approx_tokens / budget, is always negative
        # on this branch (approx_tokens > budget), so it clamped to 0.0 every
        # time and divided by zero for budget == 0.
        score = max(0.0, budget / approx_tokens)
    else:
        # Only reachable with a negative budget.
        score = 0.0
    return GradeResult(
        metric="token_budget",
        passed=passed,
        score=score,
        detail=f"~{approx_tokens} tokens (budget: {budget})",
        metadata={"approx_tokens": approx_tokens, "budget": budget},
    )
127
+
128
+
129
def grade_answer_relevancy(query: str, response: str) -> GradeResult:
    """Score the response by cosine similarity between query and response embeddings.

    Passes when the similarity is at least ``RELEVANCY_THRESHOLD``.
    """
    model = get_embedder()
    query_embedding = model.encode([query])
    response_embedding = model.encode([response])
    similarity = cosine_similarity(query_embedding, response_embedding)
    score = float(similarity[0][0])
    meets_threshold = score >= RELEVANCY_THRESHOLD
    return GradeResult(
        metric="answer_relevancy",
        passed=meets_threshold,
        score=score,
        detail=f"Cosine {score:.3f} (threshold: {RELEVANCY_THRESHOLD})",
    )
141
+
142
+
143
def grade_faithfulness(
    response: str,
    context: str,
    anthropic_client: anthropic.Anthropic,
) -> GradeResult:
    """Ask Claude to judge whether the response is grounded in retrieved context.

    Args:
        response: The generated answer to evaluate.
        context: The retrieved context the answer should be grounded in.
        anthropic_client: Client used to call the judge model.

    Returns:
        A ``faithfulness`` result. It passes only when the judge reports
        ``faithful`` and a score of at least ``FAITHFULNESS_THRESHOLD``. Any
        API failure or unusable judge reply yields a failed result with score
        0.0 rather than an exception.
    """
    prompt = FAITHFULNESS_PROMPT.format(context=context, response=response)
    try:
        message = anthropic_client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = message.content[0].text.strip()
        # The judge may wrap its JSON in a Markdown code fence despite the
        # "JSON only" instruction; unwrap it before parsing.
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", raw, re.DOTALL)
        if fenced:
            raw = fenced.group(1)
        parsed = json.loads(raw)
        score = float(parsed.get("score", 0.0))
        unsupported = parsed.get("unsupported_claims", [])
        # bool() keeps `passed` a real boolean even if the judge sends e.g. 1 or "true".
        passed = bool(parsed.get("faithful", False)) and score >= FAITHFULNESS_THRESHOLD
        detail = f"Score {score:.2f}" + (f" — unsupported: {unsupported}" if unsupported else "")
        return GradeResult(
            metric="faithfulness",
            passed=passed,
            score=score,
            detail=detail,
            metadata={"unsupported_claims": unsupported},
        )
    except (
        json.JSONDecodeError,
        anthropic.APIError,
        AttributeError,  # reply was not a JSON object, or content block has no text
        IndexError,  # empty content list
        TypeError,  # score was null or a container
        ValueError,  # score was a non-numeric string
    ) as exc:
        log.warning("Faithfulness grader failed: %s", exc)
        return GradeResult(
            metric="faithfulness",
            passed=False,
            score=0.0,
            detail=f"Grader error: {exc}",
        )
176
+
177
+
178
def grade_chain_terminology(response: str, client: str) -> GradeResult:
    """Check that the response uses client-specific terms, not rival terminology.

    Args:
        response: The generated answer to check.
        client: Client id whose terminology is expected.

    Returns:
        A ``chain_terminology`` result. The score is the fraction of the
        client's canonical terms that were not violated (1.0 when the client
        has no terms to check).
    """
    result = check_terminology(response, client)
    violations = result["violations"]
    checked = result["checked"]
    # check_terminology reports one violation per rival client per canonical
    # key, so count distinct keys; otherwise the score could drop below 0 when
    # a domain has more than two clients.
    violated_keys = {v["canonical"] for v in violations}
    score = 1.0 - (len(violated_keys) / checked) if checked else 1.0
    detail = (
        f"{len(violations)} violation(s): " +
        ", ".join(f"{v['found']!r} → should be {v['expected']!r}" for v in violations)
        if violations else f"All {checked} terms correct"
    )
    return GradeResult(
        metric="chain_terminology",
        passed=result["pass"],
        score=score,
        detail=detail,
        metadata={"violations": violations},
    )
196
+
197
+
198
def grade(
    query: str,
    response: str,
    context: str,
    client: str,
    anthropic_client: anthropic.Anthropic,
    token_budget: int = TOKEN_BUDGET,
) -> GradeReport:
    """Run every L1 grader on the response and bundle the results into a report."""
    pii = grade_pii_leakage(response)
    budget = grade_token_budget(response, token_budget)
    relevancy = grade_answer_relevancy(query, response)
    faithfulness = grade_faithfulness(response, context, anthropic_client)
    terminology = grade_chain_terminology(response, client)
    return GradeReport(
        client=client,
        query=query,
        results=[pii, budget, relevancy, faithfulness, terminology],
    )
backend/pipeline.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG pipeline: retrieve → generate → grade.
3
+
4
+ Retrieval: in-memory semantic search (sentence-transformers, encoded at first use per domain).
5
+ Generation: Claude with retrieved context injected as grounding.
6
+ Grading: L1 metrics via grader.py.
7
+ """
8
+
9
+ import logging
10
+ from dataclasses import dataclass, field
11
+
12
+ import anthropic
13
+ import numpy as np
14
+ import yaml
15
+ from sklearn.metrics.pairwise import cosine_similarity
16
+ from sentence_transformers import SentenceTransformer
17
+
18
+ from config import features_path, domain_for, DISPLAY_NAMES
19
+ from grader import grade, GradeReport, get_embedder
20
+
21
+ log = logging.getLogger(__name__)
22
+
23
+ TOP_K = 3
24
+ MIN_RETRIEVAL_SCORE = 0.1
25
+
26
+ SYSTEM_PROMPT = """\
27
+ You are a helpful assistant for {client_display} ({domain} domain).
28
+ Answer the user's question using only the information in the provided context.
29
+ Be concise. Use the terminology natural to {client_display} — do not use internal
30
+ or competitor terminology. If the context does not contain enough information to
31
+ answer, say so clearly rather than speculating."""
32
+
33
+
34
@dataclass(slots=True)
class RetrievedDoc:
    # One knowledge-base document selected by retrieval.
    id: str  # document id taken from the KB entry
    title: str  # document title taken from the KB entry
    content: str  # document body used when building the context
    score: float  # cosine similarity between the query and this document
40
+
41
+
42
@dataclass(slots=True)
class PipelineResult:
    """Everything produced by one pipeline run: answer, sources, grades, context."""

    query: str
    client: str
    answer: str
    retrieved_docs: list[RetrievedDoc]
    grade_report: GradeReport
    context_used: str

    @property
    def response_payload(self) -> dict:
        """Return the JSON-serialisable dict sent back to the API caller."""
        sources = []
        for doc in self.retrieved_docs:
            sources.append({"id": doc.id, "title": doc.title, "score": round(doc.score, 3)})

        display_name = DISPLAY_NAMES.get(self.client, self.client)
        return {
            "query": self.query,
            "client": self.client,
            "client_display": display_name,
            "answer": self.answer,
            "sources": sources,
            "evaluation": self.grade_report.summary,
        }
64
+
65
+
66
@dataclass(slots=True)
class KBIndex:
    # In-memory search index for one domain's knowledge base.
    docs: list[dict]  # raw document entries loaded from the domain's features YAML
    embeddings: np.ndarray  # embeddings of "<title>. <content>", one per entry in docs
70
+
71
+
72
+ _index_cache: dict[str, KBIndex] = {}
73
+
74
+
75
def _build_index(domain: str, embedder: SentenceTransformer) -> KBIndex:
    """Return the KB index for ``domain``, building and caching it on first use.

    Args:
        domain: Domain whose ``features.yaml`` should be indexed.
        embedder: Model used to embed each document's "<title>. <content>" text.

    Returns:
        The cached ``KBIndex`` for the domain.
    """
    cached = _index_cache.get(domain)
    if cached is not None:
        return cached

    # Read as UTF-8 explicitly: the default encoding is locale-dependent and
    # can mis-decode non-ASCII text in the knowledge base.
    raw = features_path(domain).read_text(encoding="utf-8")
    data = yaml.safe_load(raw)
    docs = data["documents"]
    texts = [f"{d['title']}. {d['content']}" for d in docs]
    embeddings = embedder.encode(texts, show_progress_bar=False)
    index = KBIndex(docs=docs, embeddings=np.array(embeddings))
    _index_cache[domain] = index
    log.info("Built KB index for domain=%s (%d docs)", domain, len(docs))
    return index
84
+
85
+
86
def _build_context(docs: list[RetrievedDoc]) -> str:
    """Render documents as ``[title]`` + content sections separated by blank lines."""
    sections = []
    for doc in docs:
        body = doc.content.strip()
        sections.append(f"[{doc.title}]\n{body}")
    return "\n\n".join(sections)
88
+
89
+
90
def _generate(
    query: str,
    context: str,
    client: str,
    domain: str,
    anthropic_client: anthropic.Anthropic,
) -> str:
    """Ask Claude to answer ``query`` from ``context`` and return the stripped text."""
    display_name = DISPLAY_NAMES.get(client, client)
    system_prompt = SYSTEM_PROMPT.format(client_display=display_name, domain=domain)
    user_message = {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
    reply = anthropic_client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=512,
        system=system_prompt,
        messages=[user_message],
    )
    first_block = reply.content[0]
    return first_block.text.strip()
108
+
109
+
110
def run(
    query: str,
    client: str,
    anthropic_client: anthropic.Anthropic,
    top_k: int = TOP_K,
) -> PipelineResult:
    """Run the full pipeline for one query: retrieve, generate, then grade.

    Only the ``top_k`` most similar documents are considered, and of those
    only the ones scoring above ``MIN_RETRIEVAL_SCORE`` are kept.
    """
    domain = domain_for(client)
    embedder = get_embedder()
    index = _build_index(domain, embedder)

    query_vector = embedder.encode([query])
    similarities = cosine_similarity(query_vector, index.embeddings)[0]
    # Highest similarity first, then truncate to top_k.
    ranked = np.argsort(similarities)[::-1][:top_k]

    retrieved = []
    for position in ranked:
        if similarities[position] > MIN_RETRIEVAL_SCORE:
            entry = index.docs[position]
            retrieved.append(
                RetrievedDoc(
                    id=entry["id"],
                    title=entry["title"],
                    content=entry["content"],
                    score=float(similarities[position]),
                )
            )

    context = _build_context(retrieved)
    answer = _generate(query, context, client, domain, anthropic_client)
    report = grade(
        query=query,
        response=answer,
        context=context,
        client=client,
        anthropic_client=anthropic_client,
    )
    return PipelineResult(
        query=query,
        client=client,
        answer=answer,
        retrieved_docs=retrieved,
        grade_report=report,
        context_used=context,
    )
backend/rosetta.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RosettaStone: canonical term -> client-specific term translation."""
2
+
3
+ import yaml
4
+ from functools import lru_cache
5
+
6
+ from config import term_catalog_path, domain_for
7
+
8
+
9
@lru_cache(maxsize=8)
def _load_catalog(domain: str) -> dict[str, dict[str, str]]:
    """Load and cache the term catalog for ``domain``.

    Returns:
        ``{client_id: {CANONICAL_KEY: "client term"}}``. The result is cached
        and shared between callers, so it must not be mutated.
    """
    # Read as UTF-8 explicitly: the default encoding is locale-dependent and
    # can mis-decode non-ASCII terms.
    raw = term_catalog_path(domain).read_text(encoding="utf-8")
    data = yaml.safe_load(raw)
    return {
        client_id: client_data["terms"]
        for client_id, client_data in data["clients"].items()
    }
17
+
18
+
19
def translate(canonical_key: str, client: str) -> str | None:
    """Look up the client's own term for ``canonical_key``; None when unmapped."""
    domain = domain_for(client)
    terms = _load_catalog(domain).get(client, {})
    return terms.get(canonical_key)
23
+
24
+
25
def client_terms(client: str) -> dict[str, str]:
    """Return a fresh copy of the client's ``{CANONICAL_KEY: client_term}`` mapping."""
    domain = domain_for(client)
    terms = _load_catalog(domain).get(client, {})
    return {**terms}
29
+
30
+
31
def check_terminology(response_text: str, client: str) -> dict:
    """
    Deterministic chain_terminology check.

    For every canonical key of ``client``, a violation is recorded for each
    other client in the same domain whose term for that key occurs in the
    response (case-insensitive substring match) while the client's own term
    does not.

    Returns:
        {"pass": bool, "violations": [...], "checked": int}
    """
    catalog = _load_catalog(domain_for(client))
    own_terms = catalog.get(client, {})
    rival_catalogs = [terms for name, terms in catalog.items() if name != client]
    haystack = response_text.lower()

    violations = []
    for canonical_key, own_term in own_terms.items():
        if own_term.lower() in haystack:
            # The correct term is present, so rival terms are tolerated for this key.
            continue
        for rival_terms in rival_catalogs:
            rival_term = rival_terms.get(canonical_key, "")
            if not rival_term:
                continue
            if rival_term.lower() in haystack:
                violations.append({
                    "canonical": canonical_key,
                    "expected": own_term,
                    "found": rival_term,
                })

    return {
        "pass": not violations,
        "violations": violations,
        "checked": len(own_terms),
    }
eval/golden-dataset.yaml ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Golden dataset — 20 Q&A pairs for L2 batch evaluation
2
+ # 10 retail (5 NovaMart / 5 ShelfWise) + 10 pharma (5 ClinixOne / 5 PharmaLink)
3
+ #
4
+ # Fields:
5
+ # id — stable identifier
6
+ # domain — retail | pharma
7
+ # client — novamart | shelfwise | clinixone | pharmalink
8
+ # question — natural-language query as a recruiter or end-user would type it
9
+ # expected_contains — keyphrases the correct answer must include (used by L2 metrics)
10
+ # expected_answer — full reference answer for answer_correctness / answer_similarity
11
+ # notes — what this pair is testing (for eval engineers)
12
+
13
+ pairs:
14
+
15
+ # ── RETAIL · NovaMart ──────────────────────────────────────────────────
16
+
17
+ - id: retail-nm-001
18
+ domain: retail
19
+ client: novamart
20
+ question: "What happens when a product runs out of stock?"
21
+ expected_contains:
22
+ - availability scan
23
+ - low inventory signal
24
+ - reorder
25
+ expected_answer: >
26
+ When a product runs out of stock, an availability scan detects the shortfall
27
+ against the configured reorder threshold and triggers a low inventory signal.
28
+ The signal is routed to the responsible category manager and the supplying vendor.
29
+ If unacknowledged, it escalates to the regional operations lead after 24 hours.
30
+ notes: "Tests chain_terminology: must say 'availability scan' and 'low inventory signal', not 'stock check' or 'out-of-stock alert'."
31
+
32
+ - id: retail-nm-002
33
+ domain: retail
34
+ client: novamart
35
+ question: "How do I add a new supplier to the system?"
36
+ expected_contains:
37
+ - merchant onboarding
38
+ - legal entity name
39
+ - tax ID
40
+ - purchase order
41
+ expected_answer: >
42
+ To add a new supplier, complete the merchant onboarding process by registering
43
+ the vendor with their legal entity name, tax ID, payment terms, and primary contact.
44
+ Incomplete records will block purchase order creation until all mandatory fields
45
+ are validated.
46
+ notes: "Tests chain_terminology: 'merchant onboarding' not 'supplier setup'."
47
+
48
+ - id: retail-nm-003
49
+ domain: retail
50
+ client: novamart
51
+ question: "Can I turn on a new feature for just one region without deploying code?"
52
+ expected_contains:
53
+ - capability switch
54
+ - activation scope
55
+ - expiry date
56
+ - sign-off
57
+ expected_answer: >
58
+ Yes. A capability switch lets you enable or disable functionality per client,
59
+ region, or user segment without a code deployment. Each switch has an activation
60
+ scope and an expiry date to prevent flag debt. Enabling a switch in production
61
+ requires sign-off from both the product and engineering lead.
62
+ notes: "Tests chain_terminology: 'capability switch' not 'feature toggle' or 'feature flag'."
63
+
64
+ - id: retail-nm-004
65
+ domain: retail
66
+ client: novamart
67
+ question: "Where is the authoritative source for product information like SKU and category?"
68
+ expected_contains:
69
+ - item registry
70
+ - SKU
71
+ - archived
72
+ - 15 minutes
73
+ expected_answer: >
74
+ The item registry is the authoritative source for product attributes including
75
+ SKU, description, category hierarchy, dimensions, and active status.
76
+ Updates sync to downstream systems within 15 minutes. Deactivated products
77
+ remain as archived records and cannot be reactivated without manual review.
78
+ notes: "Tests chain_terminology: 'item registry' not 'product catalog'."
79
+
80
+ - id: retail-nm-005
81
+ domain: retail
82
+ client: novamart
83
+ question: "How are price changes handled and what needs approval?"
84
+ expected_contains:
85
+ - pricing sync
86
+ - 15%
87
+ - four hours
88
+ - escalation
89
+ expected_answer: >
90
+ Price changes are submitted as a pricing sync through the pricing portal.
91
+ Changes greater than 15% of the current price require approval. Approved
92
+ changes go live at the next sync window, which runs every four hours.
93
+ Emergency corrections outside the window require escalation to the pricing team.
94
+ notes: "Tests chain_terminology: 'pricing sync' not 'price update'."
95
+
96
+ # ── RETAIL · ShelfWise ───────────────────────────────────────────────
97
+
98
+ - id: retail-sw-001
99
+ domain: retail
100
+ client: shelfwise
101
+ question: "What triggers an out-of-stock alert?"
102
+ expected_contains:
103
+ - out-of-stock alert
104
+ - reorder point
105
+ - category manager
106
+ - 24 hours
107
+ expected_answer: >
108
+ An out-of-stock alert fires when a product's on-hand quantity drops below
109
+ its configured reorder point. It is routed simultaneously to the responsible
110
+ category manager and the supplying vendor. Unacknowledged alerts escalate
111
+ to the regional operations lead after 24 hours.
112
+ notes: "Tests chain_terminology: 'out-of-stock alert' not 'low inventory signal'."
113
+
114
+ - id: retail-sw-002
115
+ domain: retail
116
+ client: shelfwise
117
+ question: "How do we enable a feature for a subset of users?"
118
+ expected_contains:
119
+ - feature toggle
120
+ - activation scope
121
+ - expiry date
122
+ - engineering lead
123
+ expected_answer: >
124
+ Use a feature toggle to enable or disable functionality per client, region,
125
+ or user segment without a code deployment. Each toggle has an owner, an
126
+ activation scope, and an expiry date. Enabling in production requires
127
+ sign-off from the product and engineering lead.
128
+ notes: "Tests chain_terminology: 'feature toggle' not 'capability switch'."
129
+
130
+ - id: retail-sw-003
131
+ domain: retail
132
+ client: shelfwise
133
+ question: "What information is required to onboard a new supplier?"
134
+ expected_contains:
135
+ - supplier setup
136
+ - tax ID
137
+ - payment terms
138
+ - purchase order
139
+ expected_answer: >
140
+ Supplier setup requires the vendor's legal entity name, tax ID, payment terms,
141
+ and primary contact. Incomplete records block purchase order creation until
142
+ all mandatory fields are validated.
143
+ notes: "Tests chain_terminology: 'supplier setup' not 'merchant onboarding'."
144
+
145
+ - id: retail-sw-004
146
+ domain: retail
147
+ client: shelfwise
148
+ question: "How do compliance reports work and who can access them?"
149
+ expected_contains:
150
+ - compliance report
151
+ - immutable
152
+ - seven years
153
+ - Audit role
154
+ expected_answer: >
155
+ Compliance reports capture a timestamped record of system actions, user
156
+ decisions, and policy rule evaluations. They are immutable once generated
157
+ and stored for a minimum of seven years. Access is restricted to users
158
+ with the Audit role or higher.
159
+ notes: "Tests chain_terminology: 'compliance report' not 'audit trail'."
160
+
161
+ - id: retail-sw-005
162
+ domain: retail
163
+ client: shelfwise
164
+ question: "How quickly do product catalog updates reach downstream systems?"
165
+ expected_contains:
166
+ - product catalog
167
+ - 15 minutes
168
+ - event stream
169
+ - archived
170
+ expected_answer: >
171
+ Product catalog updates sync to all downstream systems within 15 minutes
172
+ via event stream. Deactivated products remain in the catalog as archived
173
+ records and cannot be reactivated without a manual review.
174
+ notes: "Tests chain_terminology: 'product catalog' not 'item registry'."
175
+
176
+ # ── PHARMA · ClinixOne ─────────────────────────────────────────────────
177
+
178
+ - id: pharma-cx-001
179
+ domain: pharma
180
+ client: clinixone
181
+ question: "What is prior authorization and how long does it take?"
182
+ expected_contains:
183
+ - prior authorization
184
+ - clinical justification
185
+ - 72 hours
186
+ - appeal
187
+ expected_answer: >
188
+ Prior authorization is a requirement by a payer that a prescriber obtain
189
+ approval before a specific drug is dispensed and covered. The prescriber
190
+ submits clinical justification and the payer responds within 72 hours for
191
+ standard requests or 24 hours for urgent cases. Denied requests can be
192
+ appealed once with additional clinical documentation.
193
+ notes: "Tests chain_terminology: 'prior authorization' not 'formulary pre-approval'."
194
+
195
+ - id: pharma-cx-002
196
+ domain: pharma
197
+ client: clinixone
198
+ question: "What is the difference between a generic name and a brand name?"
199
+ expected_contains:
200
+ - generic name
201
+ - brand name
202
+ - clinical guidelines
203
+ - authorization
204
+ expected_answer: >
205
+ The generic name is the active ingredient name — non-proprietary and used
206
+ in clinical guidelines and regulatory filings. Brand names are assigned by
207
+ manufacturers and appear in marketing materials and some payer formularies.
208
+ Substituting a brand drug with a generic requires explicit prescriber or
209
+ payer authorization in some jurisdictions.
210
+ notes: "Tests chain_terminology: ClinixOne uses 'generic name' as primary."
211
+
212
+ - id: pharma-cx-003
213
+ domain: pharma
214
+ client: clinixone
215
+ question: "When must an adverse event be reported to regulators?"
216
+ expected_contains:
217
+ - adverse event
218
+ - 15 days
219
+ - 90 days
220
+ - serious unexpected
221
+ expected_answer: >
222
+ Adverse events must be reported to the regulatory authority within 15 days
223
+ for serious unexpected events and within 90 days for expected events.
224
+ An adverse event is any undesirable medical occurrence in a patient
225
+ administered a medicinal product, regardless of causal relationship.
226
+ notes: "Tests chain_terminology: 'adverse event' not 'safety signal'. Key faithfulness test — specific numbers must be grounded in KB."
227
+
228
+ - id: pharma-cx-004
229
+ domain: pharma
230
+ client: clinixone
231
+ question: "What are the phases of a clinical trial?"
232
+ expected_contains:
233
+ - clinical trial
234
+ - Phase I
235
+ - Phase II
236
+ - Phase III
237
+ - inclusion
238
+ expected_answer: >
239
+ Clinical trials are classified by phase: Phase I tests safety in a small
240
+ cohort, Phase II assesses efficacy and side effects, and Phase III compares
241
+ against standard treatment at scale. Enrollment eligibility is defined by
242
+ inclusion and exclusion criteria in the protocol.
243
+ notes: "Tests chain_terminology: 'clinical trial' not 'investigational program'."
244
+
245
+ - id: pharma-cx-005
246
+ domain: pharma
247
+ client: clinixone
248
+ question: "What happens if a prescriber adjusts the dose outside the approved schedule?"
249
+ expected_contains:
250
+ - dose modification
251
+ - titration
252
+ - prior authorization
253
+ - documentation
254
+ expected_answer: >
255
+ A dose modification outside the approved titration schedule requires prescriber
256
+ documentation and may trigger a prior authorization review. Titration schedules
257
+ specify the starting dose, increment size, and minimum interval between increases.
258
+ notes: "Tests chain_terminology: 'dose modification' and 'prior authorization' for ClinixOne."
259
+
260
+ # ── PHARMA · PharmaLink ───────────────────────────────────────────────
261
+
262
+ - id: pharma-pl-001
263
+ domain: pharma
264
+ client: pharmalink
265
+ question: "How do I get a drug approved before dispensing?"
266
+ expected_contains:
267
+ - formulary pre-approval
268
+ - clinical justification
269
+ - 72 hours
270
+ - appeal
271
+ expected_answer: >
272
+ Submit a formulary pre-approval request with clinical justification. The payer
273
+ reviews against formulary criteria and responds within 72 hours for standard
274
+ requests or 24 hours for urgent cases. Denied requests can be appealed once
275
+ with additional clinical documentation.
276
+ notes: "Tests chain_terminology: 'formulary pre-approval' not 'prior authorization'."
277
+
278
+ - id: pharma-pl-002
279
+ domain: pharma
280
+ client: pharmalink
281
+ question: "What is a pharmacovigilance alert and when is it raised?"
282
+ expected_contains:
283
+ - pharmacovigilance alert
284
+ - pattern
285
+ - causal relationship
286
+ - regulatory authority
287
+ expected_answer: >
288
+ A pharmacovigilance alert is raised when a pattern of adverse events suggests
289
+ a previously unknown or incompletely documented causal relationship between a
290
+ drug and an outcome. Serious unexpected events must be reported to the
291
+ regulatory authority within 15 days.
292
+ notes: "Tests chain_terminology: 'pharmacovigilance alert' not 'safety signal' or 'adverse event'. Key cross-client terminology stress test."
293
+
294
+ - id: pharma-pl-003
295
+ domain: pharma
296
+ client: pharmalink
297
+ question: "What are the coverage tiers in the formulary?"
298
+ expected_contains:
299
+ - benefit tier
300
+ - Tier 1
301
+ - generics
302
+ - 60-day notice
303
+ expected_answer: >
304
+ The formulary organizes drugs into benefit tiers that determine patient
305
+ cost-sharing. Tier 1 is typically lowest cost and covers generics; higher
306
+ tiers carry higher copays. Moving a drug to a higher tier requires a formulary
307
+ committee review and a minimum 60-day notice to prescribers.
308
+ notes: "Tests chain_terminology: 'benefit tier' not 'coverage tier'."
309
+
310
+ - id: pharma-pl-004
311
+ domain: pharma
312
+ client: pharmalink
313
+ question: "What is a prescribing pathway and how often is it reviewed?"
314
+ expected_contains:
315
+ - prescribing pathway
316
+ - annually
317
+ - coverage decisions
318
+ - clinical rationale
319
+ expected_answer: >
320
+ A prescribing pathway is an evidence-based document specifying the recommended
321
+ sequence of therapies for a given condition. Pathways are reviewed annually and
322
+ updated when new efficacy or safety data emerges. Payers use pathway adherence
323
+ as a criterion in coverage decisions; deviation requires documented clinical
324
+ rationale.
325
+ notes: "Tests chain_terminology: 'prescribing pathway' not 'clinical guideline' or 'treatment protocol'."
326
+
327
+ - id: pharma-pl-005
328
+ domain: pharma
329
+ client: pharmalink
330
+ question: "What does enrollment authorization involve for a clinical study?"
331
+ expected_contains:
332
+ - enrollment authorization
333
+ - investigational program
334
+ - re-consent
335
+ - inclusion
336
+ expected_answer: >
337
+ Enrollment authorization is the process by which a patient receives and
338
+ acknowledges sufficient information about an investigational program to make
339
+ a voluntary decision. Consent must be obtained before any study procedure.
340
+ If the protocol changes materially, re-consent is required. Enrollment eligibility is defined by inclusion and exclusion criteria in the protocol.
341
+ notes: "Tests chain_terminology: 'enrollment authorization' (not 'informed consent') and 'investigational program' (not 'clinical trial')."
eval/metrics.py ADDED
File without changes
knowledge/pharma/features.yaml ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pharma domain — knowledge base documents
2
+ # Retrieved by the RAG pipeline; the faithfulness check grounds answers against these documents
3
+ # Each entry: id, title, content (2-4 sentences, retrieval-friendly), tags
4
+
5
+ documents:
6
+ - id: pharma_001
7
+ title: "Prior Authorization and Formulary Pre-Approval"
8
+ content: >
9
+ Prior authorization (formulary pre-approval) is a requirement by a payer that a
10
+ prescriber obtain approval before a specific drug is dispensed and covered.
11
+ The prescriber submits clinical justification; the payer reviews against formulary
12
+ criteria and responds within 72 hours for standard requests or 24 hours for urgent cases.
13
+ Denied requests can be appealed once with additional clinical documentation.
14
+ tags: [prior-auth, formulary, coverage]
15
+
16
+ - id: pharma_002
17
+ title: "Generic vs Brand Drug Names"
18
+ content: >
19
+ Every approved drug has a generic name (the active ingredient, non-proprietary) and
20
+ one or more brand names assigned by manufacturers.
21
+ Generic names are used in clinical guidelines and regulatory filings; brand names
22
+ appear in marketing materials and some payer formularies.
23
+ Substituting a brand drug with a generic equivalent requires explicit prescriber
24
+ or payer authorization in some jurisdictions.
25
+ tags: [drug-name, generic, brand]
26
+
27
+ - id: pharma_003
28
+ title: "Adverse Events and Safety Signals"
29
+ content: >
30
+ An adverse event is any undesirable medical occurrence in a patient administered
31
+ a medicinal product, regardless of causal relationship.
32
+ A safety signal (pharmacovigilance alert) is a pattern of adverse events that
33
+ suggests a previously unknown or incompletely documented causal relationship
34
+ between a drug and an outcome.
35
+ Adverse events must be reported to the regulatory authority within 15 days for
36
+ serious unexpected events and 90 days for expected events.
37
+ tags: [adverse-event, safety, pharmacovigilance]
38
+
39
+ - id: pharma_004
40
+ title: "Drug-Drug Interactions and Contraindications"
41
+ content: >
42
+ A drug-drug interaction (contraindication) occurs when one drug affects the activity
43
+ of another when both are administered together.
44
+ Interactions range from minor (monitoring recommended) to contraindicated (combination
45
+ must not be used). Severity classifications follow the clinical pharmacology guidelines
46
+ maintained in the formulary interaction database.
47
+ Prescribers are alerted at point-of-care when a contraindicated combination is entered.
48
+ tags: [drug-interaction, contraindication, safety]
49
+
50
+ - id: pharma_005
51
+ title: "Clinical Guidelines and Prescribing Pathways"
52
+ content: >
53
+ A treatment protocol (clinical guideline / prescribing pathway) is an evidence-based
54
+ document specifying the recommended sequence of therapies for a given condition.
55
+ Pathways are reviewed annually and updated when new efficacy or safety data emerges.
56
+ Payers use pathway adherence as a criterion in coverage decisions; deviation requires
57
+ documented clinical rationale.
58
+ tags: [protocol, guideline, treatment]
59
+
60
+ - id: pharma_006
61
+ title: "Formulary Coverage Tiers"
62
+ content: >
63
+ A formulary is a list of drugs covered by a payer, organized into tiers (benefit tiers)
64
+ that determine patient cost-sharing. Tier 1 is typically lowest cost (generics);
65
+ higher tiers carry higher copays. Moving a drug to a higher tier requires
66
+ a formulary committee review and a minimum 60-day notice to prescribers.
67
+ tags: [formulary, coverage, tier]
68
+
69
+ - id: pharma_007
70
+ title: "Dosage Adjustment and Titration"
71
+ content: >
72
+ A dosage adjustment (dose modification / titration step) is a change to a patient's
73
+ prescribed dose based on clinical response, tolerability, renal or hepatic function,
74
+ or drug interaction. Titration schedules specify the starting dose, increment size,
75
+ and minimum interval between increases. Adjustments outside the approved titration
76
+ schedule require prescriber documentation and may trigger a prior authorization review.
77
+ tags: [dosage, titration, dose]
78
+
79
+ - id: pharma_008
80
+ title: "Patient Consent and Enrollment Authorization"
81
+ content: >
82
+ Informed consent (enrollment authorization) is the process by which a patient
83
+ receives and acknowledges sufficient information about a treatment or study
84
+ to make a voluntary decision. For clinical trials, consent must be obtained
85
+ before any study procedure. Consent forms are version-controlled; if the
86
+ protocol changes materially, re-consent is required.
87
+ tags: [consent, enrollment, patient]
88
+
89
+ - id: pharma_009
90
+ title: "Clinical Trials and Investigational Programs"
91
+ content: >
92
+ A clinical trial (investigational program) is a structured study that evaluates
93
+ the safety or efficacy of a drug, device, or intervention in human subjects.
94
+ Trials are classified by phase: Phase I tests safety in a small cohort,
95
+ Phase II assesses efficacy and side effects, Phase III compares against
96
+ standard treatment at scale. Enrollment eligibility is defined by inclusion
97
+ and exclusion criteria in the protocol.
98
+ tags: [clinical-trial, study, investigational]
knowledge/pharma/term-catalog.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pharma domain — client-specific terminology map
2
+ # canonical term -> what each client calls it
3
+ # This is what the chain_terminology metric validates against
4
+
5
+ clients:
6
+ clinixone:
7
+ display_name: "ClinixOne"
8
+ terms:
9
+ DRUG_APPROVAL: "prior authorization"
10
+ DRUG_NAME: "generic name"
11
+ ADVERSE_EVENT: "adverse event"
12
+ DRUG_INTERACTION: "contraindication"
13
+ TREATMENT_PROTOCOL: "clinical guideline"
14
+ FORMULARY_STATUS: "coverage tier"
15
+ DOSAGE_ADJUSTMENT: "dose modification"
16
+ SAFETY_SIGNAL: "safety signal"
17
+ PATIENT_CONSENT: "informed consent"
18
+ CLINICAL_TRIAL: "clinical trial"
19
+
20
+ pharmalink:
21
+ display_name: "PharmaLink"
22
+ terms:
23
+ DRUG_APPROVAL: "formulary pre-approval"
24
+ DRUG_NAME: "brand name"
25
+ ADVERSE_EVENT: "safety signal"
26
+ DRUG_INTERACTION: "drug-drug interaction"
27
+ TREATMENT_PROTOCOL: "prescribing pathway"
28
+ FORMULARY_STATUS: "benefit tier"
29
+ DOSAGE_ADJUSTMENT: "titration step"
30
+ SAFETY_SIGNAL: "pharmacovigilance alert"
31
+ PATIENT_CONSENT: "enrollment authorization"
32
+ CLINICAL_TRIAL: "investigational program"
knowledge/retail/features.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Retail domain — knowledge base documents
2
+ # Retrieved by the RAG pipeline; the faithfulness check grounds answers against these documents
3
+ # Each entry: id, title, content (2-4 sentences, retrieval-friendly), tags
4
+
5
+ documents:
6
+ - id: retail_001
7
+ title: "Stock Check Process"
8
+ content: >
9
+ A stock check queries real-time inventory levels for a given product and location.
10
+ Results include current quantity on hand, reorder threshold, and last updated timestamp.
11
+ If quantity falls below threshold, an out-of-stock alert is automatically triggered.
12
+ Stock checks can be initiated manually or scheduled on a recurring basis.
13
+ tags: [inventory, stock, availability]
14
+
15
+ - id: retail_002
16
+ title: "Supplier Setup and Onboarding"
17
+ content: >
18
+ Supplier setup is the process of registering a new vendor in the system before
19
+ products can be sourced or orders placed. Required fields include legal entity name,
20
+ tax ID, payment terms, and primary contact. Incomplete supplier records block
21
+ purchase order creation until all mandatory fields are validated.
22
+ tags: [supplier, vendor, onboarding]
23
+
24
+ - id: retail_003
25
+ title: "Compliance Reporting"
26
+ content: >
27
+ Compliance reports capture a timestamped record of system actions, user decisions,
28
+ and policy rule evaluations for regulatory and internal audit purposes.
29
+ Reports are immutable once generated and stored for a minimum of seven years.
30
+ Access is restricted to users with the Audit role or higher.
31
+ tags: [compliance, audit, reporting]
32
+
33
+ - id: retail_004
34
+ title: "Feature Flags and Capability Switches"
35
+ content: >
36
+ Feature flags (also called capability switches) enable or disable product functionality
37
+ per client, region, or user segment without a code deployment.
38
+ Each flag has an owner, an activation scope, and an expiry date to prevent flag debt.
39
+ Enabling a flag in production requires sign-off from both the product and engineering lead.
40
+ tags: [feature-flags, configuration, rollout]
41
+
42
+ - id: retail_005
43
+ title: "Product Catalog Management"
44
+ content: >
45
+ The product catalog (item registry) is the authoritative source of product attributes
46
+ including SKU, description, category hierarchy, dimensions, and active status.
47
+ Catalog updates sync to all downstream systems within 15 minutes via event stream.
48
+ Deactivated products remain in the catalog as archived records and cannot be reactivated
49
+ without a manual review.
50
+ tags: [catalog, products, SKU]
51
+
52
+ - id: retail_006
53
+ title: "Price Update Workflow"
54
+ content: >
55
+ Price updates (pricing syncs) must be submitted through the pricing portal and require
56
+ approval for changes greater than 15% of the current price.
57
+ Approved changes go live at the next scheduled sync window, which runs every four hours.
58
+ Emergency price corrections outside the sync window require escalation to the pricing team.
59
+ tags: [pricing, price-update, workflow]
60
+
61
+ - id: retail_007
62
+ title: "Store Configuration"
63
+ content: >
64
+ Each store location has a configuration profile (location profile) that defines
65
+ operating hours, supported payment methods, fulfillment capabilities, and
66
+ regional compliance rules. Configuration changes take effect at store open
67
+ on the following business day. Misconfigured stores are flagged in the daily
68
+ operations health report.
69
+ tags: [store, configuration, location]
70
+
71
+ - id: retail_008
72
+ title: "Low Inventory Alerts"
73
+ content: >
74
+ A low inventory signal (out-of-stock alert) fires when a product's on-hand quantity
75
+ drops below its configured reorder point. Alerts are routed to the responsible
76
+ category manager and the supplying vendor simultaneously. Unacknowledged alerts
77
+ escalate to the regional operations lead after 24 hours.
78
+ tags: [inventory, alerts, stock]
knowledge/retail/term-catalog.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Retail domain — client-specific terminology map
2
+ # canonical term -> what each client calls it
3
+ # This is what the chain_terminology metric validates against
4
+
5
+ clients:
6
+ novamart:
7
+ display_name: "NovaMart"
8
+ terms:
9
+ STOCK_CHECK: "availability scan"
10
+ SUPPLIER_SETUP: "merchant onboarding"
11
+ COMPLIANCE_REPORT: "audit trail"
12
+ FEATURE_FLAG: "capability switch"
13
+ STOCK_ALERT: "low inventory signal"
14
+ PRODUCT_CATALOG: "item registry"
15
+ PRICE_UPDATE: "pricing sync"
16
+ STORE_CONFIG: "location profile"
17
+
18
+ shelfwise:
19
+ display_name: "ShelfWise"
20
+ terms:
21
+ STOCK_CHECK: "stock check"
22
+ SUPPLIER_SETUP: "supplier setup"
23
+ COMPLIANCE_REPORT: "compliance report"
24
+ FEATURE_FLAG: "feature toggle"
25
+ STOCK_ALERT: "out-of-stock alert"
26
+ PRODUCT_CATALOG: "product catalog"
27
+ PRICE_UPDATE: "price update"
28
+ STORE_CONFIG: "store configuration"
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ anthropic>=0.40.0
2
+ fastapi>=0.115.0
3
+ uvicorn[standard]>=0.30.0
4
+ pyyaml>=6.0
5
+ sentence-transformers>=3.0.0
6
+ scikit-learn>=1.5.0
7
+ numpy>=1.26.0
8
+ python-multipart>=0.0.9
ui/app.js ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Base URL prefixed to every backend request; the empty string keeps all
// calls on the same origin as this page.
const API = '';

// Mutable UI state shared by every handler in this file.
let state = {
  domain: null,    // key of the currently selected domain
  client: null,    // id of the currently selected client
  domains: {},     // domain key -> list of clients, filled from GET /config
  loading: false,  // true while a /query request is in flight
};
9
+
10
+ // ── Boot ──────────────────────────────────────────────────────────────────
11
+
12
/**
 * Initialise the page: load the domain/client configuration from
 * GET /config, render the domain switcher, select the first domain, and
 * wire up the send button and the Enter key on the query input.
 *
 * If the configuration cannot be loaded (network error, non-2xx status,
 * invalid JSON, or no domains in the payload) an error is shown in the chat
 * panel and the input controls are disabled, instead of leaving a blank UI
 * and an unhandled promise rejection.
 */
async function boot() {
  try {
    const res = await fetch(`${API}/config`);
    if (!res.ok) {
      throw new Error(`GET /config failed (${res.status} ${res.statusText})`);
    }
    const data = await res.json();
    state.domains = (data && data.domains) || {};
  } catch (err) {
    appendMessage('bot', `Error: ${err.message}`);
    setLoading(true); // nothing can be queried without a configuration
    return;
  }

  const domainNames = Object.keys(state.domains);
  if (domainNames.length === 0) {
    appendMessage('bot', 'Error: no domains configured');
    setLoading(true);
    return;
  }

  renderDomainSwitcher();
  selectDomain(domainNames[0]);

  document.getElementById('send-btn').addEventListener('click', handleSend);
  document.getElementById('query-input').addEventListener('keydown', e => {
    // Enter sends; Shift+Enter does not. Enter pressed while an IME
    // composition is active only confirms the composition and must not send.
    if (e.key === 'Enter' && !e.shiftKey && !e.isComposing) handleSend();
  });
}
26
+
27
+ // ── Switchers ─────────────────────────────────────────────────────────────
28
+
29
/**
 * Rebuild the domain switcher: one button per key of `state.domains`.
 *
 * Buttons are created with DOM APIs (textContent / dataset / addEventListener)
 * rather than an HTML string with inline `onclick`, so a domain key that
 * contains quotes or markup cannot break out of the attribute or inject HTML.
 */
function renderDomainSwitcher() {
  const el = document.getElementById('domain-switcher');
  el.innerHTML = '';

  Object.keys(state.domains).forEach(d => {
    const btn = document.createElement('button');
    btn.dataset.domain = d; // read back by selectDomain to mark the active button
    btn.textContent = capitalize(d);
    btn.addEventListener('click', () => selectDomain(d));
    el.appendChild(btn);
  });
}
35
+
36
/**
 * Make `domain` the active domain, rebuild the client switcher for it, and
 * select that domain's first client.
 *
 * Each client entry is read as `{ id, display }`. Buttons are created with
 * DOM APIs so ids and display names are treated as plain text, never as
 * HTML or as part of an inline script. An unknown domain or an empty client
 * list yields an empty switcher and a null client instead of a TypeError.
 *
 * @param {string} domain - key into `state.domains`
 */
function selectDomain(domain) {
  state.domain = domain;
  document.querySelectorAll('#domain-switcher button').forEach(b => {
    b.classList.toggle('active', b.dataset.domain === domain);
  });

  const clients = state.domains[domain] || [];
  const el = document.getElementById('client-switcher');
  el.innerHTML = '';

  clients.forEach(c => {
    const btn = document.createElement('button');
    btn.dataset.client = c.id; // read back by selectClient to mark the active button
    btn.textContent = c.display;
    btn.addEventListener('click', () => selectClient(c.id));
    el.appendChild(btn);
  });

  selectClient(clients.length > 0 ? clients[0].id : null);
}
50
+
51
/**
 * Record `clientId` as the active client and highlight its button in the
 * client switcher (every other button loses the `active` class).
 *
 * @param {string} clientId - id compared against each button's data-client
 */
function selectClient(clientId) {
  state.client = clientId;

  const buttons = document.querySelectorAll('#client-switcher button');
  for (const btn of buttons) {
    const isSelected = btn.dataset.client === clientId;
    btn.classList.toggle('active', isSelected);
  }
}
57
+
58
+ // ── Send ──────────────────────────────────────────────────────────────────
59
+
60
/**
 * Send the current query to POST /query for the selected client and render
 * the answer plus its evaluation.
 *
 * Does nothing when the input is blank or a request is already in flight.
 * The input is cleared and the controls are disabled for the duration of
 * the request; a "thinking" placeholder is shown until the response (or an
 * error) arrives. Failures are reported as a bot message in the chat.
 */
async function handleSend() {
  const input = document.getElementById('query-input');
  const query = input.value.trim();
  if (!query || state.loading) return;

  input.value = '';
  setLoading(true);

  appendMessage('user', query);
  const thinkingEl = appendThinking();

  try {
    const res = await fetch(`${API}/query`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ query, client: state.client }),
    });

    if (!res.ok) {
      const err = await res.json().catch(() => ({ detail: res.statusText }));
      const detail = err && err.detail;

      // `detail` is a plain string for HTTPException responses, but request
      // validation errors deliver an array of objects with a `msg` field;
      // passing that straight to Error() would render "[object Object]".
      let message;
      if (typeof detail === 'string') {
        message = detail;
      } else if (Array.isArray(detail)) {
        message = detail
          .map(item => (item && item.msg) || JSON.stringify(item))
          .join('; ');
      } else if (detail) {
        message = JSON.stringify(detail);
      }
      throw new Error(message || 'Request failed');
    }

    const data = await res.json();
    thinkingEl.remove();
    appendBotMessage(data);
    renderEval(data);
  } catch (err) {
    thinkingEl.remove();
    appendMessage('bot', `Error: ${err.message}`);
  } finally {
    setLoading(false);
    // Disabling the input dropped its focus; restore it so the user can
    // type the next question immediately.
    input.focus();
  }
}
94
+
95
+ // ── Messages ──────────────────────────────────────────────────────────────
96
+
97
/**
 * Append a plain-text chat message to the message list and scroll to it.
 *
 * @param {string} role - 'user' or 'bot'; used as a CSS class and to pick
 *   the author caption ('You' for the user, 'Bot' otherwise)
 * @param {string} text - message body; HTML-escaped before insertion
 * @returns {HTMLDivElement} the newly appended message element
 */
function appendMessage(role, text) {
  const author = role === 'user' ? 'You' : 'Bot';

  const wrapper = document.createElement('div');
  wrapper.className = `message ${role}`;
  wrapper.innerHTML = `
    <div class="bubble">${escapeHtml(text)}</div>
    <div class="meta">${author}</div>
  `;

  getMessages().appendChild(wrapper);
  scrollMessages();
  return wrapper;
}
108
+
109
/**
 * Append the bot's answer to the chat, with an overall pass/fail verdict
 * badge and the client's display name as the caption.
 *
 * Reads `data.answer`, `data.client_display` and
 * `data.evaluation.overall_pass`. Both text fields are HTML-escaped before
 * being placed in the markup.
 *
 * @param {object} data - response payload from POST /query
 */
function appendBotMessage(data) {
  const overall = data.evaluation.overall_pass;
  const verdictClass = overall ? 'pass' : 'fail';
  const verdictLabel = overall ? '✓ All checks passed' : '✗ Checks failed';

  const el = document.createElement('div');
  el.className = 'message bot';
  // client_display comes from the server response, so it is escaped like
  // the answer text rather than trusted as markup.
  el.innerHTML = `
    <div class="bubble">${escapeHtml(data.answer)}</div>
    <div class="verdict ${verdictClass}">${verdictLabel}</div>
    <div class="meta">${escapeHtml(data.client_display)}</div>
  `;
  getMessages().appendChild(el);
  scrollMessages();
}
124
+
125
/**
 * Append a "thinking" placeholder (three empty spans inside a
 * `.thinking` container) to the chat and scroll it into view.
 *
 * @returns {HTMLDivElement} the placeholder, so the caller can remove it
 */
function appendThinking() {
  const placeholder = document.createElement('div');
  placeholder.className = 'message bot';
  placeholder.innerHTML = `
    <div class="thinking">
      <span></span><span></span><span></span>
    </div>
  `;

  const list = getMessages();
  list.appendChild(placeholder);
  scrollMessages();
  return placeholder;
}
137
+
138
+ // ── Eval panel ────────────────────────────────────────────────────────────
139
+
140
// Human-readable title for each metric key; renderEval falls back to the
// raw key when a metric is not listed here.
const METRIC_LABELS = {
  pii_leakage:       'PII Leakage',
  token_budget:      'Token Budget',
  answer_relevancy:  'Answer Relevancy',
  faithfulness:      'Faithfulness',
  chain_terminology: 'Chain Terminology',
};

// One-line explanation shown under each metric title; metrics without an
// entry get an empty description.
const METRIC_DESC = {
  pii_leakage:       'Regex scan — no PII in response',
  token_budget:      'Response within token ceiling',
  answer_relevancy:  'Cosine similarity: query ↔ response',
  faithfulness:      'Claude judge: grounded in retrieved context?',
  chain_terminology: 'Deterministic: client-specific terms used',
};
155
+
156
/**
 * Render the evaluation panel for one response: a card per metric (title,
 * percentage badge, description, grader detail, score bar) followed by the
 * list of retrieved sources.
 *
 * Reads `data.evaluation.metrics` (metric key -> `{ score, detail }`) and
 * `data.sources` (items with `title` and `score`). A missing `sources`
 * list is treated as empty and a missing `detail` as an empty string.
 *
 * @param {object} data - response payload from POST /query
 */
function renderEval(data) {
  const metrics = data.evaluation.metrics;
  const sources = Array.isArray(data.sources) ? data.sources : [];

  const metricCards = Object.entries(metrics).map(([key, m]) => {
    const cls = scoreClass(m.score, key);
    const pct = Math.round(m.score * 100);
    // The badge shows the raw percentage, but the bar width must be a valid
    // CSS length: clamp to 0–100 and fall back to 0 for non-numeric scores.
    const barPct = Number.isFinite(pct) ? Math.min(100, Math.max(0, pct)) : 0;
    // Unknown metric keys are shown verbatim, so escape them like any other
    // server-provided text.
    const label = escapeHtml(METRIC_LABELS[key] || key);
    const detail = m.detail == null ? '' : m.detail;
    return `
      <div class="metric-card ${cls}">
        <div class="metric-header">
          <span class="metric-name">${label}</span>
          <span class="score-badge ${cls}">${pct}%</span>
        </div>
        <div class="metric-detail">${escapeHtml(METRIC_DESC[key] || '')}</div>
        <div class="metric-detail" style="margin-top:4px;color:#6a8aaa">${escapeHtml(detail)}</div>
        <div class="score-bar-wrap">
          <div class="score-bar-bg">
            <div class="score-bar-fill ${cls}" style="width:${barPct}%"></div>
          </div>
        </div>
      </div>
    `;
  }).join('');

  const sourceItems = sources.map(s => `
    <div class="source-item">
      <span class="source-title">${escapeHtml(s.title)}</span>
      <span class="source-score">${(s.score * 100).toFixed(0)}%</span>
    </div>
  `).join('');

  document.getElementById('eval-body').innerHTML = `
    <div class="eval-content">
      ${metricCards}
      <div class="sources-section">
        <div class="sources-label">Retrieved Sources</div>
        ${sourceItems || '<div style="font-size:11px;color:#8aabcc">No sources retrieved</div>'}
      </div>
    </div>
  `;
}
197
+
198
/**
 * Map a metric score to the CSS class used for its card, badge and bar.
 *
 * `pii_leakage` is binary: exactly 1.0 passes, anything else fails.
 * Every other metric is banded: >= 0.75 'pass', >= 0.45 'warn', else 'fail'.
 *
 * @param {number} score - metric score
 * @param {string} metric - metric key
 * @returns {'pass'|'warn'|'fail'}
 */
function scoreClass(score, metric) {
  const passThreshold = 0.75;
  const warnThreshold = 0.45;

  if (metric === 'pii_leakage') {
    return score === 1.0 ? 'pass' : 'fail';
  }

  // Written with >= so a NaN score falls through to 'fail'.
  return score >= passThreshold
    ? 'pass'
    : score >= warnThreshold
      ? 'warn'
      : 'fail';
}
205
+
206
+ // ── Helpers ───────────────────────────────────────────────────────────────
207
+
208
/**
 * Record whether a request is in flight and enable/disable the send button
 * and query input to match.
 *
 * @param {boolean} val - true to lock the controls, false to unlock them
 */
function setLoading(val) {
  state.loading = val;
  for (const id of ['send-btn', 'query-input']) {
    document.getElementById(id).disabled = val;
  }
}
213
+
214
/**
 * Look up the chat message list container (the element with id "messages").
 *
 * @returns {HTMLElement|null}
 */
function getMessages() {
  const container = document.getElementById('messages');
  return container;
}
217
+
218
/** Scroll the chat message list to its bottom so the newest message shows. */
function scrollMessages() {
  const pane = getMessages();
  pane.scrollTop = pane.scrollHeight;
}
222
+
223
/**
 * Return `s` with its first character upper-cased; the rest is unchanged.
 *
 * @param {string} s
 * @returns {string}
 */
function capitalize(s) {
  const head = s.charAt(0).toUpperCase();
  const tail = s.slice(1);
  return `${head}${tail}`;
}
226
+
227
/**
 * Escape a value for safe insertion into HTML text or a quoted attribute.
 *
 * The value is first converted with String(), so non-string input (numbers,
 * null, undefined) is accepted. `&` is replaced first so the entities
 * produced by the later replacements are not double-escaped. Single quotes
 * are escaped as well, so the result is also safe inside single-quoted
 * attribute values.
 *
 * @param {*} str - value to escape
 * @returns {string} the escaped string
 */
function escapeHtml(str) {
  return String(str)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
234
+
235
// Entry point: start the UI as soon as this script is evaluated.
boot();
ui/index.html ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>AI Response Validator</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800;900&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
9
+ <style>
10
+ * { margin: 0; padding: 0; box-sizing: border-box; }
11
+
12
+ body {
13
+ font-family: 'Inter', sans-serif;
14
+ background: #eef4fc;
15
+ color: #1a1a1a;
16
+ height: 100vh;
17
+ display: grid;
18
+ grid-template-rows: auto 1fr;
19
+ overflow: hidden;
20
+ }
21
+
22
+ /* ── Header ── */
23
+ header {
24
+ background: #fff;
25
+ border-bottom: 2px solid #1e3a5f;
26
+ padding: 14px 28px;
27
+ display: flex;
28
+ align-items: center;
29
+ justify-content: space-between;
30
+ gap: 24px;
31
+ }
32
+
33
+ .header-left h1 {
34
+ font-size: 22px;
35
+ font-weight: 900;
36
+ color: #1a1a1a;
37
+ letter-spacing: -0.5px;
38
+ }
39
+ .header-left h1 span { color: #3a6ea8; }
40
+ .header-left .tagline {
41
+ font-size: 11px;
42
+ color: #8aabcc;
43
+ margin-top: 2px;
44
+ }
45
+
46
+ /* ── Domain / Client switcher ── */
47
+ .switcher {
48
+ display: flex;
49
+ align-items: center;
50
+ gap: 10px;
51
+ flex-wrap: wrap;
52
+ }
53
+
54
+ .switcher label {
55
+ font-size: 10px;
56
+ font-weight: 700;
57
+ text-transform: uppercase;
58
+ letter-spacing: 1.5px;
59
+ color: #8aabcc;
60
+ }
61
+
62
+ .btn-group {
63
+ display: flex;
64
+ border: 1px solid #c8dff5;
65
+ border-radius: 5px;
66
+ overflow: hidden;
67
+ }
68
+
69
+ .btn-group button {
70
+ background: #fff;
71
+ border: none;
72
+ border-right: 1px solid #c8dff5;
73
+ padding: 6px 14px;
74
+ font-size: 12px;
75
+ font-weight: 600;
76
+ color: #4a6a8a;
77
+ cursor: pointer;
78
+ transition: background 0.15s, color 0.15s;
79
+ }
80
+ .btn-group button:last-child { border-right: none; }
81
+ .btn-group button.active {
82
+ background: #1e3a5f;
83
+ color: #fff;
84
+ }
85
+ .btn-group button:hover:not(.active) { background: #eef4fc; }
86
+
87
+ .divider-v {
88
+ width: 1px;
89
+ height: 28px;
90
+ background: #c8dff5;
91
+ }
92
+
93
+ /* ── Main layout ── */
94
+ main {
95
+ display: grid;
96
+ grid-template-columns: 1fr 360px;
97
+ overflow: hidden;
98
+ }
99
+
100
+ /* ── Chat panel ── */
101
+ .chat-panel {
102
+ display: flex;
103
+ flex-direction: column;
104
+ border-right: 1px solid #c8dff5;
105
+ overflow: hidden;
106
+ }
107
+
108
+ .messages {
109
+ flex: 1;
110
+ overflow-y: auto;
111
+ padding: 24px 28px;
112
+ display: flex;
113
+ flex-direction: column;
114
+ gap: 16px;
115
+ }
116
+
117
+ .message {
118
+ display: flex;
119
+ flex-direction: column;
120
+ gap: 4px;
121
+ max-width: 80%;
122
+ }
123
+ .message.user { align-self: flex-end; }
124
+ .message.bot { align-self: flex-start; }
125
+
126
+ .message .bubble {
127
+ padding: 12px 16px;
128
+ border-radius: 8px;
129
+ font-size: 13.5px;
130
+ line-height: 1.6;
131
+ }
132
+ .message.user .bubble {
133
+ background: #1e3a5f;
134
+ color: #fff;
135
+ border-radius: 8px 8px 2px 8px;
136
+ }
137
+ .message.bot .bubble {
138
+ background: #fff;
139
+ color: #1a1a1a;
140
+ border: 1px solid #c8dff5;
141
+ border-radius: 8px 8px 8px 2px;
142
+ }
143
+
144
+ .message .meta {
145
+ font-size: 10px;
146
+ color: #8aabcc;
147
+ padding: 0 4px;
148
+ }
149
+ .message.user .meta { text-align: right; }
150
+
151
+ /* overall pass/fail badge on bot message */
152
+ .verdict {
153
+ display: inline-flex;
154
+ align-items: center;
155
+ gap: 5px;
156
+ font-size: 10px;
157
+ font-weight: 700;
158
+ padding: 2px 8px;
159
+ border-radius: 3px;
160
+ margin-top: 4px;
161
+ align-self: flex-start;
162
+ }
163
+ .verdict.pass { background: #f1f8f1; color: #2e7d32; border: 1px solid #c8e6c9; }
164
+ .verdict.fail { background: #fdf1f1; color: #c62828; border: 1px solid #ffcdd2; }
165
+ .verdict.warn { background: #fffbf0; color: #a06000; border: 1px solid #ffe082; }
166
+
167
+ /* ── Input bar ── */
168
+ .input-bar {
169
+ padding: 16px 28px;
170
+ background: #fff;
171
+ border-top: 1px solid #c8dff5;
172
+ display: flex;
173
+ gap: 10px;
174
+ }
175
+
176
+ .input-bar input {
177
+ flex: 1;
178
+ padding: 10px 14px;
179
+ border: 1px solid #c8dff5;
180
+ border-radius: 6px;
181
+ font-size: 13.5px;
182
+ font-family: 'Inter', sans-serif;
183
+ outline: none;
184
+ transition: border-color 0.15s;
185
+ }
186
+ .input-bar input:focus { border-color: #3a6ea8; }
187
+ .input-bar input:disabled { background: #f5f9ff; color: #8aabcc; }
188
+
189
+ .input-bar button {
190
+ padding: 10px 20px;
191
+ background: #1e3a5f;
192
+ color: #fff;
193
+ border: none;
194
+ border-radius: 6px;
195
+ font-size: 13px;
196
+ font-weight: 700;
197
+ cursor: pointer;
198
+ transition: background 0.15s;
199
+ white-space: nowrap;
200
+ }
201
+ .input-bar button:hover:not(:disabled) { background: #3a6ea8; }
202
+ .input-bar button:disabled { background: #93b8d8; cursor: not-allowed; }
203
+
204
+ /* ── Eval panel ── */
205
+ .eval-panel {
206
+ background: #fff;
207
+ overflow-y: auto;
208
+ display: flex;
209
+ flex-direction: column;
210
+ }
211
+
212
+ .eval-panel .panel-header {
213
+ padding: 16px 20px 12px;
214
+ border-bottom: 1px solid #e8f2ff;
215
+ font-size: 10px;
216
+ font-weight: 800;
217
+ text-transform: uppercase;
218
+ letter-spacing: 2px;
219
+ color: #8aabcc;
220
+ position: sticky;
221
+ top: 0;
222
+ background: #fff;
223
+ z-index: 1;
224
+ }
225
+
226
+ .eval-empty {
227
+ flex: 1;
228
+ display: flex;
229
+ flex-direction: column;
230
+ align-items: center;
231
+ justify-content: center;
232
+ gap: 10px;
233
+ color: #b0cce8;
234
+ padding: 40px 20px;
235
+ text-align: center;
236
+ }
237
+ .eval-empty .icon { font-size: 36px; }
238
+ .eval-empty p { font-size: 12px; line-height: 1.6; }
239
+
240
+ .eval-content { padding: 16px 20px; display: flex; flex-direction: column; gap: 20px; }
241
+
242
+ /* Metric card: base look plus .pass / .fail / .warn modifiers that recolour the left accent and background */
243
+ .metric-card {
244
+ border: 1px solid #e0eef8;
245
+ border-left: 3px solid #1e3a5f; /* accent stripe; colour overridden per state below */
246
+ border-radius: 0 6px 6px 0; /* square corners on the accent side */
247
+ padding: 12px 14px;
248
+ background: #f5f9ff;
249
+ }
250
+ .metric-card.pass { border-left-color: #4caf50; background: #f0faf3; }
251
+ .metric-card.fail { border-left-color: #c62828; background: #fdf5f5; }
252
+ .metric-card.warn { border-left-color: #f9a825; background: #fffdf0; }
253
+
254
+ .metric-card .metric-header {
255
+ display: flex;
256
+ justify-content: space-between; /* children pushed to opposite ends of the row */
257
+ align-items: center;
258
+ margin-bottom: 6px;
259
+ }
260
+ .metric-card .metric-name {
261
+ font-size: 12px;
262
+ font-weight: 800;
263
+ color: #1e3a5f;
264
+ font-family: 'JetBrains Mono', monospace;
265
+ }
266
+ .metric-card.pass .metric-name { color: #2e7d32; }
267
+ .metric-card.fail .metric-name { color: #c62828; } /* NOTE(review): no .warn counterpart is visible here, so warn cards keep the base name colour — confirm intended */
268
+
269
+ .score-badge {
270
+ font-family: 'JetBrains Mono', monospace;
271
+ font-size: 11px;
272
+ font-weight: 700;
273
+ padding: 2px 8px;
274
+ border-radius: 3px;
275
+ border: 1px solid; /* border colour supplied by the state modifiers below */
276
+ }
277
+ .score-badge.pass { background: #f1f8f1; color: #2e7d32; border-color: #c8e6c9; }
278
+ .score-badge.fail { background: #fdf1f1; color: #c62828; border-color: #ffcdd2; }
279
+ .score-badge.warn { background: #fffbf0; color: #a06000; border-color: #ffe082; }
280
+
281
+ .metric-card .metric-detail {
282
+ font-size: 11px;
283
+ color: #4a6080;
284
+ line-height: 1.5;
285
+ }
286
+
287
+ /* Score bar: 4px track (.score-bar-bg) with a state-coloured fill (.score-bar-fill) */
288
+ .score-bar-wrap { margin-top: 8px; }
289
+ .score-bar-bg {
290
+ height: 4px;
291
+ background: #e0eef8;
292
+ border-radius: 2px;
293
+ overflow: hidden; /* clips the fill to the rounded track */
294
+ }
295
+ .score-bar-fill {
296
+ height: 100%;
297
+ border-radius: 2px;
298
+ transition: width 0.4s ease; /* width is not set in this stylesheet chunk — presumably applied by app.js; TODO confirm */
299
+ }
300
+ .score-bar-fill.pass { background: #4caf50; }
301
+ .score-bar-fill.fail { background: #c62828; }
302
+ .score-bar-fill.warn { background: #f9a825; }
303
+
304
+ /* Sources: small uppercase label followed by one row per source (title on one side, score on the other) */
305
+ .sources-section .sources-label {
306
+ font-size: 10px;
307
+ font-weight: 700;
308
+ text-transform: uppercase;
309
+ letter-spacing: 1.5px;
310
+ color: #8aabcc;
311
+ margin-bottom: 8px;
312
+ }
313
+ .source-item {
314
+ display: flex;
315
+ justify-content: space-between; /* title and score sit at opposite ends of the row */
316
+ align-items: center;
317
+ padding: 7px 10px;
318
+ background: #f5f9ff;
319
+ border: 1px solid #e0eef8;
320
+ border-radius: 5px;
321
+ margin-bottom: 5px;
322
+ font-size: 11.5px;
323
+ }
324
+ .source-item .source-title { color: #2a4a6a; font-weight: 500; }
325
+ .source-item .source-score {
326
+ font-family: 'JetBrains Mono', monospace;
327
+ font-size: 10px;
328
+ color: #8aabcc;
329
+ }
330
+
331
+ /* Thinking indicator: bubble whose <span> dots pulse via the bounce keyframes (markup is not in this file chunk — presumably created by app.js) */
332
+ .thinking {
333
+ display: flex;
334
+ gap: 5px;
335
+ align-items: center;
336
+ padding: 12px 16px;
337
+ background: #fff;
338
+ border: 1px solid #c8dff5;
339
+ border-radius: 8px 8px 8px 2px; /* tighter bottom-left corner */
340
+ width: fit-content; /* bubble shrinks to its dots instead of filling the row */
341
+ }
342
+ .thinking span {
343
+ width: 7px; height: 7px;
344
+ background: #3a6ea8;
345
+ border-radius: 50%; /* round dot */
346
+ animation: bounce 1.2s infinite ease-in-out;
347
+ }
348
+ .thinking span:nth-child(2) { animation-delay: 0.2s; } /* staggered starts so the dots pulse in sequence */
349
+ .thinking span:nth-child(3) { animation-delay: 0.4s; }
350
+
351
+ @keyframes bounce {
352
+ 0%, 80%, 100% { transform: scale(0.6); opacity: 0.4; } /* resting state: small and faded */
353
+ 40% { transform: scale(1); opacity: 1; } /* peak: full size and opacity */
354
+ }
355
+
356
+ /* Scrollbar: unscoped, so it applies to every scroll container; -webkit- prefixed pseudo-elements only */
357
+ ::-webkit-scrollbar { width: 5px; }
358
+ ::-webkit-scrollbar-track { background: transparent; }
359
+ ::-webkit-scrollbar-thumb { background: #c8dff5; border-radius: 3px; }
360
+ </style>
361
+ </head>
362
+ <body>
363
+
364
+ <header>
365
+ <div class="header-left">
366
+ <h1>AI Response <span>Validator</span></h1>
367
+ <div class="tagline">Domain-agnostic RAG evaluation · real-time L1 metrics · RosettaStone terminology</div>
368
+ </div>
369
+
370
+ <div class="switcher">
371
+ <label id="domain-label">Domain</label>
372
+ <div class="btn-group" id="domain-switcher" role="group" aria-labelledby="domain-label"></div> <!-- empty in the markup; presumably filled by app.js -->
373
+
374
+ <div class="divider-v"></div>
375
+
376
+ <label id="client-label">Client</label>
377
+ <div class="btn-group" id="client-switcher" role="group" aria-labelledby="client-label"></div> <!-- empty in the markup; presumably filled by app.js -->
378
+ </div>
379
+ </header>
380
+
381
+ <main>
382
+ <div class="chat-panel">
383
+ <div class="messages" id="messages" role="log" aria-live="polite"> <!-- live region so appended messages are announced -->
384
+ <!-- populated by app.js -->
385
+ </div>
386
+ <div class="input-bar">
387
+ <input
388
+ type="text"
389
+ id="query-input"
390
+ placeholder="Ask something…"
391
+ autocomplete="off" aria-label="Your question"
392
+ />
393
+ <button id="send-btn">Send</button>
394
+ </div>
395
+ </div>
396
+
397
+ <div class="eval-panel">
398
+ <div class="panel-header">Evaluation</div>
399
+ <div id="eval-body"> <!-- initial empty state; presumably replaced by app.js after a response is graded (confirm) -->
400
+ <div class="eval-empty">
401
+ <div class="icon" aria-hidden="true">◎</div> <!-- decorative glyph, hidden from assistive tech -->
402
+ <p>Send a message to see<br>real-time metric evaluation.</p>
403
+ </div>
404
+ </div>
405
+ </div>
406
+ </main>
407
+
408
+ <script src="/static/app.js"></script>
409
+ </body>
410
+ </html>