Spaces:

below-threshold
/

ai-response-validator

Sleeping

ai-response-validator / eval /simulate_traffic.py

mbochniak01

Replace HHEM with sentence-level NLI, add claim decomposition and drift detection

ffbf46f about 2 months ago

8.01 kB

	"""
	Populate telemetry with simulated traffic, then run drift detection.

	Two batches:
	clean — golden-dataset expected_answers (should match reference distribution)
	dirty — same questions, hallucinated responses (should show faithfulness drift)

	Bypasses the API entirely: runs graders + telemetry.record() directly.

	Usage:
	cd <path-to>/ai-response-validator && .venv/bin/python eval/simulate_traffic.py
	"""

	import sys
	import time
	from pathlib import Path

	import yaml

	sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

	import telemetry
	from config import CLIENT_DOMAIN
	from grader import GradeReport, grade

	# Golden question/answer pairs; run() reads the top-level "pairs" list from this YAML file.
	DATASET_PATH = Path(__file__).parent / "golden-dataset.yaml"
	# Root of the knowledge base; _load_kb_context() reads <domain>/features.yaml beneath it.
	KNOWLEDGE_ROOT = Path(__file__).parent.parent / "knowledge"

	# Hallucinated responses — plausible-sounding answers that contradict KB facts.
	# Keyed by golden-dataset pair id: run() replays only the pairs whose id appears
	# here as the "dirty" batch, so pairs without an entry are left out of that batch.
	HALLUCINATED: dict[str, str] = {
	# retail — NovaMart
	"retail-nm-001": (
	"When a product runs out of stock, the system automatically places a reorder after 72 hours "
	"with no alerts sent to any manager. The supplier is notified only at month-end review."
	),
	"retail-nm-002": (
	"To add a new supplier, send an email to the procurement team with the company name. "
	"No tax ID or payment terms are required at this stage. "
	"Purchase orders can be created immediately without waiting for validation."
	),
	"retail-nm-003": (
	"Feature flags are permanent once enabled and cannot be disabled without a code deployment. "
	"There is no expiry date or activation scope. Any employee can enable a flag in production."
	),
	"retail-nm-004": (
	"The authoritative source for product information is the pricing portal. "
	"SKU records are updated manually once per week by the merchandising team. "
	"Archived products can be reactivated instantly by any store manager."
	),
	"retail-nm-005": (
	"Price changes take effect immediately upon submission with no approval required. "
	"There is no sync window; prices update in real time. "
	"Emergency corrections are handled automatically without escalation."
	),
	# retail — ShelfWise
	"retail-sw-001": (
	"An out-of-stock alert fires only after a manual stock check is initiated by a store manager. "
	"The alert is sent exclusively to the regional director. "
	"No escalation occurs if the alert is unacknowledged."
	),
	"retail-sw-002": (
	"Feature toggles are permanent once enabled. "
	"There is no activation scope and no expiry date requirement. "
	"Any user can enable toggles in production without sign-off."
	),
	"retail-sw-004": (
	"Compliance reports are editable for up to 30 days after creation and are stored for 2 years. "
	"Any user can access compliance reports from the standard dashboard. "
	"Reports are generated on demand only."
	),
	"retail-sw-005": (
	"Product catalog updates require manual approval for each SKU and can take up to 48 hours. "
	"Deactivated products are permanently deleted and cannot be recovered."
	),
	# pharma — ClinixOne
	"pharma-cx-001": (
	"Prior authorization is optional and payers respond within 7 business days. "
	"Denied requests cannot be appealed and the prescriber must choose an alternative drug."
	),
	"pharma-cx-003": (
	"Adverse events must be reported to regulators within 30 days for all event types. "
	"A safety signal is raised automatically by the system when 3 or more events occur. "
	"Expected events do not require regulatory reporting."
	),
	"pharma-cx-004": (
	"Clinical trials have two phases: Phase I for safety and Phase II for market approval. "
	"Enrollment eligibility is determined by the treating physician with no formal criteria."
	),
	# pharma — PharmaLink
	"pharma-pl-001": (
	"Formulary pre-approval is automatically granted for all branded drugs. "
	"The payer responds within 30 days and denied requests cannot be appealed."
	),
	"pharma-pl-003": (
	"The formulary has two tiers: generic and branded. "
	"Moving a drug to a higher tier requires a 7-day notice to prescribers. "
	"Tier assignment is reviewed every 5 years."
	),
	"pharma-pl-004": (
	"A prescribing pathway is a marketing document produced by pharmaceutical companies. "
	"Pathways are reviewed every 5 years and payers do not use them in coverage decisions. "
	"Deviation from a pathway requires no documentation."
	),
	"pharma-pl-005": (
	"Enrollment authorization is a formality — patients sign a standard waiver. "
	"Consent is obtained after the first study procedure, not before. "
	"Protocol changes do not require re-consent from existing participants."
	),
	}


	def _load_kb_context(domain: str) -> str:
	    """Build the grading context for *domain* from its knowledge-base file.

	    Reads ``KNOWLEDGE_ROOT/<domain>/features.yaml`` and renders every entry of
	    its ``documents`` list as a ``[title]`` header line followed by the
	    stripped ``content``; the rendered entries are joined by a blank line.

	    Args:
	        domain: Name of the domain directory under ``KNOWLEDGE_ROOT``.

	    Returns:
	        All documents of the domain concatenated into one string.

	    Raises:
	        OSError: If the features file cannot be read.
	        KeyError: If the file has no ``documents`` key or a document lacks
	            ``title`` or ``content``.
	    """
	    path = KNOWLEDGE_ROOT / domain / "features.yaml"
	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's locale encoding (read_text() falls back to it otherwise).
	    data = yaml.safe_load(path.read_text(encoding="utf-8"))
	    return "\n\n".join(
	        f"[{doc['title']}]\n{doc['content'].strip()}" for doc in data["documents"]
	    )


	def _record(pair: dict, response: str, context: str, tag: str) -> GradeReport:
	    """Grade one response, log it to telemetry, and print a one-line summary.

	    Args:
	        pair: Dataset entry; the keys ``client``, ``question``, ``domain`` and
	            ``id`` are read.
	        response: Answer text to grade for the pair's question.
	        context: Knowledge-base text handed to the grader.
	        tag: Batch label shown in the printed line (e.g. ``clean``/``dirty``).

	    Returns:
	        The report produced by ``grade`` for this response.

	    Raises:
	        StopIteration: If the report contains no ``faithfulness`` result.
	    """
	    client = pair["client"]
	    question = pair["question"]
	    report = grade(query=question, response=response, context=context, client=client)

	    # Latency and retrieval figures are fixed placeholders: this script calls
	    # the grader directly, so there is no real pipeline to measure.
	    simulated_latency = {"retrieve": 12.0, "generate": 180.0, "grade": 45.0}
	    telemetry.record(
	        client=client,
	        domain=pair["domain"],
	        query_len=len(question.split()),
	        latency_ms=simulated_latency,
	        report=report,
	        docs_retrieved=3,
	        min_retrieval_score=0.72,
	    )

	    if report.overall:
	        verdict = "PASS"
	    else:
	        verdict = "FAIL"
	    # Surface the faithfulness metric next to the overall verdict.
	    faithfulness = next(
	        result for result in report.results if result.metric == "faithfulness"
	    )
	    print(f" [{tag}] {pair['id']:<20} {verdict} faith={faithfulness.score:.3f} {faithfulness.detail}")
	    return report


	def run() -> None:
	    """Simulate clean and dirty traffic, then report drift against the baseline.

	    Steps:
	      1. Replay every golden-dataset pair with its ``expected_answer`` ("clean").
	      2. Replay the pairs that have a ``HALLUCINATED`` entry, using that
	         fabricated response instead ("dirty").
	      3. Print the telemetry event count and run drift detection through the
	         ``drift`` module located next to this script.

	    All output goes to stdout; nothing is returned.
	    """
	    # Explicit UTF-8 so decoding does not depend on the platform locale.
	    pairs = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))["pairs"]
	    kb: dict[str, str] = {}  # per-domain KB context, loaded once on first use

	    # ── Batch 1: clean traffic ──────────────────────────────────────────────
	    print("\n── Batch 1: clean traffic (expected answers) ──\n")
	    for pair in pairs:
	        domain = pair["domain"]
	        if domain not in kb:
	            kb[domain] = _load_kb_context(domain)
	        response = pair["expected_answer"].strip()
	        _record(pair, response, kb[domain], "clean")
	        # Short pause between events — presumably to space out telemetry
	        # timestamps; TODO confirm.
	        time.sleep(0.05)

	    # ── Batch 2: dirty traffic (hallucinated responses) ─────────────────────
	    print("\n── Batch 2: dirty traffic (hallucinated responses) ──\n")
	    dirty_pairs = [p for p in pairs if p["id"] in HALLUCINATED]
	    for pair in dirty_pairs:
	        # dirty_pairs is a subset of pairs, so batch 1 already cached every
	        # domain looked up here.
	        domain = pair["domain"]
	        response = HALLUCINATED[pair["id"]]
	        _record(pair, response, kb[domain], "dirty")
	        time.sleep(0.05)

	    total = telemetry.live_stats()["total_queries"]
	    print(f"\nTelemetry buffer: {total} events ({len(pairs)} clean + {len(dirty_pairs)} dirty)\n")

	    # ── Drift detection ─────────────────────────────────────────────────────
	    print("=" * 60)
	    print("Running drift detection vs golden-dataset baseline...")
	    print("=" * 60)

	    # Imported here, after making this script's directory importable.
	    sys.path.insert(0, str(Path(__file__).parent))
	    from drift import build_current, build_reference, detect_drift, report_drift

	    print("\nBuilding reference distribution...")
	    reference = build_reference()

	    current = build_current()
	    # Both sizes are taken from the first series of each mapping; the []
	    # default keeps an empty mapping from raising a bare StopIteration.
	    cur_n = len(next(iter(current.values()), []))
	    ref_n = len(next(iter(reference.values()), []))
	    print(f"Reference: {ref_n} pairs")
	    print(f"Current: {cur_n} events\n")

	    results = detect_drift(current, reference)
	    report_drift(results)
	    print()


	# Script entry point: run the simulation only when executed directly, not on import.
	if __name__ == "__main__":
	run()