Spaces:

rohitsar567
/

InsuranceBot

Sleeping

App Files Files Community

InsuranceBot / tests /live_verify.py

rohitsar567

refactor: KI-268 — Path B cleanup: -5,219 LOC legacy + nim_fallback + chain collapse + modal wiring

55fb938 about 2 months ago

Raw

History Blame Contribute Delete

6.63 kB

	"""End-to-end live-site verification via the deployed API.

	Drives the LIVE deployed bot (HF Spaces / Vercel / local) with a 20-question
	subset of the gold Q&A, asserts every response has:
	- HTTP 200
	- non-empty reply_text
	- at least one citation (when not a refusal)
	- client-measured latency within the p95 budget set by LATENCY_BUDGET_P95_MS
	  (currently 12 000 ms — looser than the Doc 01 C1 target of p95 ≤ 7000ms)

	Writes tests/live_results_<ts>.md with a pass/fail table + Doc 01 latency budget audit.

	This is the cron-able production drift detector. Schedule it nightly to catch:
	- Sarvam silently updating models
	- HF Space build regressions
	- API key expiry
	- Corpus changes
	- Latency budget breaches

	Run:
	# Default → live HF Space URL
	python tests/live_verify.py

	# Or point at any other deploy
	TARGET_URL=https://other.example.com python tests/live_verify.py
	"""

	from __future__ import annotations

	import asyncio
	import json
	import os
	import random
	import sys
	import time
	from pathlib import Path

	import httpx

	# Repository root: this file's directory is two levels below it.
	ROOT = Path(__file__).resolve().parent.parent
	# Gold question/answer set that the sample is drawn from.
	GOLD_FILE = ROOT.joinpath("eval", "gold_qa.json")
	# Directory the Markdown result reports are written to.
	RESULTS_DIR = ROOT.joinpath("tests")
	# Deployment probed when the TARGET_URL environment variable is not set.
	DEFAULT_URL = "https://rohitsar567-insurancebot.hf.space"

	# Trailing slashes are stripped so endpoint paths can be appended with "/api/...".
	TARGET_URL = os.getenv("TARGET_URL", DEFAULT_URL).rstrip("/")
	# Upper bound on the number of gold questions asked per run.
	SAMPLE_SIZE = 20
	# p95 latency ceiling in milliseconds.
	LATENCY_BUDGET_P95_MS = 12_000  # generous given DeepSeek brain latency
	# Timeout, in seconds, applied to each chat request.
	PER_QUERY_TIMEOUT = 90.0


	async def health_check(client: httpx.AsyncClient) -> dict:
	    """Fetch the health payload of the target deployment.

	    Sends a GET to ``{TARGET_URL}/api/health`` with a 15-second timeout,
	    calls ``raise_for_status()`` on the response and returns the decoded
	    JSON body.
	    """
	    health_url = f"{TARGET_URL}/api/health"
	    response = await client.get(health_url, timeout=15)
	    response.raise_for_status()
	    payload = response.json()
	    return payload


	async def ask(client: httpx.AsyncClient, question: str) -> dict:
	    """Send one question to the chat endpoint and return the JSON reply.

	    POSTs ``question`` as ``user_text`` (with ``return_audio`` disabled) to
	    ``{TARGET_URL}/api/chat`` using ``PER_QUERY_TIMEOUT``, calls
	    ``raise_for_status()`` on the response and returns the decoded JSON body.
	    """
	    chat_url = f"{TARGET_URL}/api/chat"
	    body = {"user_text": question, "return_audio": False}
	    response = await client.post(chat_url, json=body, timeout=PER_QUERY_TIMEOUT)
	    response.raise_for_status()
	    payload = response.json()
	    return payload


	# Substrings (matched against the lower-cased reply) that count as a refusal.
	_REFUSAL_MARKERS = ("don't see", "don't have", "rather not")


	def _md_cell(value: object) -> str:
	    """Render *value* so it fits inside a single Markdown table cell."""
	    # A raw "|" would open a new column and a line break would end the row.
	    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


	def _grade_response(resp: dict, expected_refusal: bool) -> dict:
	    """Apply the pass criteria to one chat response.

	    Returns a partial result row with the keys ``ok``, ``reason``, ``brain``
	    and ``citation_count``.
	    """
	    # ``or`` fallbacks make an explicit JSON null behave like a missing field
	    # instead of raising on ``.lower()`` / ``len()`` / ``.split()``.
	    reply = resp.get("reply_text") or ""
	    citations = resp.get("citations") or []
	    blocked = bool(resp.get("blocked", False))
	    brain = str(resp.get("brain_used") or "?")

	    if expected_refusal:
	        # The bot should refuse: either flagged as blocked or a refusal phrase.
	        lowered = reply.lower()
	        ok = blocked or any(marker in lowered for marker in _REFUSAL_MARKERS)
	        reason = "correctly refused" if ok else "did NOT refuse"
	    elif blocked:
	        ok, reason = False, "blocked unexpectedly"
	    elif not reply.strip():
	        ok, reason = False, "empty reply"
	    elif not citations:
	        ok, reason = False, "no citations"
	    else:
	        ok, reason = True, "answered with citation"

	    return {
	        "ok": ok,
	        "reason": reason,
	        # Only the part before the first "::" is shown in the report.
	        "brain": brain.split("::")[0],
	        "citation_count": len(citations),
	    }


	def _latency_percentiles(latencies: list) -> tuple:
	    """Return ``(p50, p95)`` of *latencies* in ms, or ``(-1, -1)`` when empty."""
	    if not latencies:
	        return -1, -1
	    ordered = sorted(latencies)
	    count = len(ordered)
	    p50 = ordered[count // 2]
	    p95 = ordered[min(count - 1, int(count * 0.95))]
	    return p50, p95


	def _render_report(ts, health, rows, passes, pass_rate, p50, p95, budget_pass) -> str:
	    """Build the Markdown report text (headline table plus per-question table)."""
	    md = [
	        f"# Live-site verification — {ts}\n",
	        f"Target: `{TARGET_URL}`",
	        f"Health check: {health.get('status')} (providers: {health.get('providers_ok')})",
	        "",
	        "## Headline",
	        "",
	        "| Metric | Value |",
	        "| --- | --- |",
	        f"| Pass rate | **{passes}/{len(rows)} ({pass_rate * 100:.1f}%)** |",
	        f"| Latency p50 | {p50} ms |",
	        f"| Latency p95 | {p95} ms |",
	        f"| Latency budget (≤{LATENCY_BUDGET_P95_MS}ms p95) | {'✅ PASS' if budget_pass else '❌ FAIL'} |",
	        "",
	        "## Per-question results",
	        "",
	        "| # | OK | Question | Reason | Brain | Latency | Citations |",
	        "| --- | --- | --- | --- | --- | --- | --- |",
	    ]
	    for r in rows:
	        ok_tag = "✓" if r["ok"] else "✗"
	        md.append(
	            f"| {r['n']} | {ok_tag} | {_md_cell(r['question'])} | {_md_cell(r['reason'])} "
	            f"| {_md_cell(r['brain'])} | {r['latency_ms']} ms | {r['citation_count']} |"
	        )
	    md.extend([
	        "",
	        "---",
	        "",
	        "_Generated by `tests/live_verify.py`. Cron this nightly to catch regressions._",
	    ])
	    return "\n".join(md)


	async def main() -> int:
	    """Run the live verification and write the Markdown report.

	    Draws up to ``SAMPLE_SIZE`` questions from ``GOLD_FILE`` with a fixed
	    seed, health-checks ``TARGET_URL``, sends each question through ``ask``,
	    grades the responses and writes ``live_results_<ts>.md`` into
	    ``RESULTS_DIR``.

	    Returns:
	        The process exit code: ``0`` when at least 60% of the sampled
	        questions pass, otherwise ``1``. ``1`` is also returned when the gold
	        file is missing or the health check fails. The latency budget is
	        reported but does not influence the exit code.
	    """
	    if not GOLD_FILE.exists():
	        print("eval/gold_qa.json missing — run `python -m eval.generate_gold` first.")
	        return 1

	    gold = json.loads(GOLD_FILE.read_text(encoding="utf-8"))
	    # A private RNG with a fixed seed draws the same questions on every run
	    # without reseeding the process-wide generator.
	    sample = random.Random(42).sample(gold, k=min(SAMPLE_SIZE, len(gold)))

	    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
	    ts = time.strftime("%Y-%m-%dT%H-%M-%S")
	    md_path = RESULTS_DIR / f"live_results_{ts}.md"

	    rows = []
	    latencies = []

	    async with httpx.AsyncClient() as client:
	        try:
	            health = await health_check(client)
	        except Exception as exc:
	            # Without a healthy target there is nothing to measure; leave a
	            # short failure report behind and stop.
	            md_path.write_text(
	                f"# Live verify — FAILED\n\nHealth check failed: {exc}\n",
	                encoding="utf-8",
	            )
	            print(f"FAIL: {exc}")
	            return 1

	        total = len(sample)
	        for i, item in enumerate(sample, 1):
	            question = item["question"]
	            expected_refusal = item.get("expected_refusal", False)
	            # Monotonic clock: unaffected by wall-clock adjustments mid-request.
	            started = time.perf_counter()
	            try:
	                resp = await ask(client, question)
	                elapsed_ms = int((time.perf_counter() - started) * 1000)
	                graded = _grade_response(resp, expected_refusal)
	            except Exception as exc:
	                # Best-effort per question: one failing request is recorded as
	                # a failed row and must not abort the remaining questions.
	                graded = {
	                    "ok": False,
	                    "reason": f"exception: {exc}",
	                    "brain": "?",
	                    "latency_ms": -1,
	                    "citation_count": 0,
	                }
	                print(f"[{i}/{total}] ✗ exception: {exc}")
	            else:
	                graded["latency_ms"] = elapsed_ms
	                # Only graded requests feed the latency percentiles.
	                latencies.append(elapsed_ms)
	                print(f"[{i}/{total}] {'✓' if graded['ok'] else '✗'} {graded['reason']} ({elapsed_ms}ms)")
	            rows.append({
	                "n": i,
	                "question": question[:80],
	                "expected_refusal": expected_refusal,
	                **graded,
	            })

	    # Derive the pass count from the rows so each question is counted once.
	    passes = sum(1 for row in rows if row["ok"])
	    pass_rate = passes / max(1, len(rows))
	    p50, p95 = _latency_percentiles(latencies)
	    # With no successful request the -1 placeholder must not read as "within budget".
	    budget_pass = bool(latencies) and p95 <= LATENCY_BUDGET_P95_MS

	    report = _render_report(ts, health, rows, passes, pass_rate, p50, p95, budget_pass)
	    # Explicit UTF-8: the report contains non-ASCII symbols (✓, ✅, ❌, —).
	    md_path.write_text(report, encoding="utf-8")

	    print(f"\nWrote {md_path.relative_to(ROOT)}")
	    print(f"Pass: {passes}/{len(rows)} ({pass_rate * 100:.1f}%) | p50={p50}ms | p95={p95}ms")
	    return 0 if pass_rate >= 0.6 else 1


	# Script entry point: run the async verifier and hand its return value to
	# the shell as the process exit status.
	if __name__ == "__main__":
	    exit_code = asyncio.run(main())
	    sys.exit(exit_code)