"""
Populate telemetry with simulated traffic, then run drift detection.

Two batches:
  clean — golden-dataset expected_answers (should match reference distribution)
  dirty — same questions, hallucinated responses (should show faithfulness drift)

Bypasses the API entirely: runs graders + telemetry.record() directly.

Usage:
  cd /Users/praca/ai-response-validator && .venv/bin/python eval/simulate_traffic.py
"""

import sys
import time
from pathlib import Path

import yaml

# The backend package directory must be importable before the project imports below.
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

import telemetry  # noqa: E402
from config import CLIENT_DOMAIN  # noqa: E402,F401  (not referenced in this file)
from grader import GradeReport, grade  # noqa: E402

DATASET_PATH = Path(__file__).parent / "golden-dataset.yaml"
KNOWLEDGE_ROOT = Path(__file__).parent.parent / "knowledge"

# Hallucinated responses — plausible-sounding but contradicts KB facts
HALLUCINATED: dict[str, str] = {
    # retail — NovaMart
    "retail-nm-001": (
        "When a product runs out of stock, the system automatically places a reorder after 72 hours "
        "with no alerts sent to any manager. The supplier is notified only at month-end review."
    ),
    "retail-nm-002": (
        "To add a new supplier, send an email to the procurement team with the company name. "
        "No tax ID or payment terms are required at this stage. "
        "Purchase orders can be created immediately without waiting for validation."
    ),
    "retail-nm-003": (
        "Feature flags are permanent once enabled and cannot be disabled without a code deployment. "
        "There is no expiry date or activation scope. Any employee can enable a flag in production."
    ),
    "retail-nm-004": (
        "The authoritative source for product information is the pricing portal. "
        "SKU records are updated manually once per week by the merchandising team. "
        "Archived products can be reactivated instantly by any store manager."
    ),
    "retail-nm-005": (
        "Price changes take effect immediately upon submission with no approval required. "
        "There is no sync window; prices update in real time. "
        "Emergency corrections are handled automatically without escalation."
    ),
    # retail — ShelfWise
    "retail-sw-001": (
        "An out-of-stock alert fires only after a manual stock check is initiated by a store manager. "
        "The alert is sent exclusively to the regional director. "
        "No escalation occurs if the alert is unacknowledged."
    ),
    "retail-sw-002": (
        "Feature toggles are permanent once enabled. "
        "There is no activation scope and no expiry date requirement. "
        "Any user can enable toggles in production without sign-off."
    ),
    "retail-sw-004": (
        "Compliance reports are editable for up to 30 days after creation and are stored for 2 years. "
        "Any user can access compliance reports from the standard dashboard. "
        "Reports are generated on demand only."
    ),
    "retail-sw-005": (
        "Product catalog updates require manual approval for each SKU and can take up to 48 hours. "
        "Deactivated products are permanently deleted and cannot be recovered."
    ),
    # pharma — ClinixOne
    "pharma-cx-001": (
        "Prior authorization is optional and payers respond within 7 business days. "
        "Denied requests cannot be appealed and the prescriber must choose an alternative drug."
    ),
    "pharma-cx-003": (
        "Adverse events must be reported to regulators within 30 days for all event types. "
        "A safety signal is raised automatically by the system when 3 or more events occur. "
        "Expected events do not require regulatory reporting."
    ),
    "pharma-cx-004": (
        "Clinical trials have two phases: Phase I for safety and Phase II for market approval. "
        "Enrollment eligibility is determined by the treating physician with no formal criteria."
    ),
    # pharma — PharmaLink
    "pharma-pl-001": (
        "Formulary pre-approval is automatically granted for all branded drugs. "
        "The payer responds within 30 days and denied requests cannot be appealed."
    ),
    "pharma-pl-003": (
        "The formulary has two tiers: generic and branded. "
        "Moving a drug to a higher tier requires a 7-day notice to prescribers. "
        "Tier assignment is reviewed every 5 years."
    ),
    "pharma-pl-004": (
        "A prescribing pathway is a marketing document produced by pharmaceutical companies. "
        "Pathways are reviewed every 5 years and payers do not use them in coverage decisions. "
        "Deviation from a pathway requires no documentation."
    ),
    "pharma-pl-005": (
        "Enrollment authorization is a formality — patients sign a standard waiver. "
        "Consent is obtained after the first study procedure, not before. "
        "Protocol changes do not require re-consent from existing participants."
    ),
}


def _load_kb_context(domain: str) -> str:
    """Return the knowledge-base text for *domain* as one context string.

    Reads ``KNOWLEDGE_ROOT/<domain>/features.yaml`` and renders every entry
    under its ``documents`` key as ``[title]`` followed by the stripped
    ``content``; entries are separated by a blank line.

    Raises:
        FileNotFoundError: if the domain has no ``features.yaml``.
        KeyError: if ``documents``, ``title`` or ``content`` is missing.
    """
    path = KNOWLEDGE_ROOT / domain / "features.yaml"
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    data = yaml.safe_load(path.read_text(encoding="utf-8"))
    chunks = [f"[{doc['title']}]\n{doc['content'].strip()}" for doc in data["documents"]]
    return "\n\n".join(chunks)


def _record(pair: dict, response: str, context: str, tag: str) -> GradeReport:
    """Grade one simulated response, record it in telemetry and print a summary.

    Args:
        pair: Golden-dataset entry; the keys ``client``, ``question``,
            ``domain`` and ``id`` are read.
        response: The answer text to grade.
        context: Knowledge-base context passed to the grader.
        tag: Short batch label shown in the printed line.

    Returns:
        The report produced by ``grade``.
    """
    client = pair["client"]
    question = pair["question"]
    report = grade(
        query=question,
        response=response,
        context=context,
        client=client,
    )
    # No retrieval or generation actually runs here, so the latency and
    # retrieval figures are fixed synthetic values.
    telemetry.record(
        client=client,
        domain=pair["domain"],
        query_len=len(question.split()),
        latency_ms={"retrieve": 12.0, "generate": 180.0, "grade": 45.0},
        report=report,
        docs_retrieved=3,
        min_retrieval_score=0.72,
    )

    status = "PASS" if report.overall else "FAIL"
    # A report without a faithfulness result must not abort the run: the
    # event is already recorded, so only the printed summary degrades.
    faith = next((r for r in report.results if r.metric == "faithfulness"), None)
    if faith is None:
        print(f" [{tag}] {pair['id']:<20} {status} faith=n/a")
    else:
        print(f" [{tag}] {pair['id']:<20} {status} faith={faith.score:.3f} {faith.detail}")
    return report


def run() -> None:
    """Simulate a clean and a dirty traffic batch, then run drift detection.

    Batch 1 replays every golden-dataset pair with its expected answer.
    Batch 2 replays the pairs that have an entry in ``HALLUCINATED`` with the
    hallucinated answer instead. Each event goes through ``_record``. After
    both batches the ``drift`` module compares the current data against the
    reference and reports the result.
    """
    pairs = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))["pairs"]
    kb: dict[str, str] = {}  # domain -> KB context, loaded once per domain

    # ── Batch 1: clean traffic ──────────────────────────────────────────────
    print("\n── Batch 1: clean traffic (expected answers) ──\n")
    for pair in pairs:
        domain = pair["domain"]
        if domain not in kb:
            kb[domain] = _load_kb_context(domain)
        response = pair["expected_answer"].strip()
        _record(pair, response, kb[domain], "clean")
        # NOTE(review): presumably spaces out the recorded events — confirm.
        time.sleep(0.05)

    # ── Batch 2: dirty traffic (hallucinated responses) ─────────────────────
    print("\n── Batch 2: dirty traffic (hallucinated responses) ──\n")
    dirty_pairs = [p for p in pairs if p["id"] in HALLUCINATED]
    for pair in dirty_pairs:
        # Every dirty pair is also in `pairs`, so batch 1 has already cached
        # the context for its domain.
        domain = pair["domain"]
        response = HALLUCINATED[pair["id"]]
        _record(pair, response, kb[domain], "dirty")
        time.sleep(0.05)

    total = telemetry.live_stats()["total_queries"]
    print(f"\nTelemetry buffer: {total} events ({len(pairs)} clean + {len(dirty_pairs)} dirty)\n")

    # ── Drift detection ─────────────────────────────────────────────────────
    print("=" * 60)
    print("Running drift detection vs golden-dataset baseline...")
    print("=" * 60)

    # `drift` sits next to this script; make it importable even when this
    # module is not executed from its own directory.
    sys.path.insert(0, str(Path(__file__).parent))
    from drift import build_current, build_reference, detect_drift, report_drift

    print("\nBuilding reference distribution...")
    reference = build_reference()
    current = build_current()

    # Both objects are treated as mappings of sized collections; the size of
    # the first value is shown as the sample count. The `[]` default keeps an
    # empty mapping from raising StopIteration.
    ref_n = len(next(iter(reference.values()), []))
    cur_n = len(next(iter(current.values()), []))
    print(f"Reference: {ref_n} pairs")
    print(f"Current: {cur_n} events\n")

    results = detect_drift(current, reference)
    report_drift(results)
    print()


if __name__ == "__main__":
    run()