InsuranceBot / tests /live_verify.py
rohitsar567's picture
refactor: KI-268 — Path B cleanup: -5,219 LOC legacy + nim_fallback + chain collapse + modal wiring
55fb938
Raw
History Blame Contribute Delete
6.63 kB
"""End-to-end live-site verification via the deployed API.
Drives the LIVE deployed bot (HF Spaces / Vercel / local) with a 20-question
subset of the gold Q&A, asserts every response has:
- HTTP 200
- non-empty reply_text
- at least one citation (when not a refusal)
- latency_ms within the latency budget (Doc 01 C1 targets p95 ≤ 7000ms; this script enforces LATENCY_BUDGET_P95_MS, currently 12000ms)
Writes tests/live_results_<ts>.md with a pass/fail table + Doc 01 latency budget audit.
This is the cron-able production drift detector. Schedule it nightly to catch:
- Sarvam silently updating models
- HF Space build regressions
- API key expiry
- Corpus changes
- Latency budget breaches
Run:
# Default → live HF Space URL
python tests/live_verify.py
# Or point at any other deploy
TARGET_URL=https://other.example.com python tests/live_verify.py
"""
from __future__ import annotations
import asyncio
import json
import os
import random
import sys
import time
from pathlib import Path
import httpx
# Repository root: the directory two levels above this file.
ROOT = Path(__file__).resolve().parent.parent

# Gold Q&A input and the directory that receives the Markdown reports.
GOLD_FILE = ROOT.joinpath("eval", "gold_qa.json")
RESULTS_DIR = ROOT.joinpath("tests")

# Deployment under test; the TARGET_URL environment variable overrides the
# default, and any trailing slash is dropped so paths can be appended directly.
DEFAULT_URL = "https://rohitsar567-insurancebot.hf.space"
TARGET_URL = os.getenv("TARGET_URL", DEFAULT_URL).rstrip("/")

# Number of gold questions sampled per run.
SAMPLE_SIZE = 20

# p95 latency ceiling in milliseconds; generous given DeepSeek brain latency.
LATENCY_BUDGET_P95_MS = 12_000

# Per-request timeout (seconds) for chat queries.
PER_QUERY_TIMEOUT = 90.0
async def health_check(client: httpx.AsyncClient) -> dict:
    """GET ``/api/health`` on the target deployment and return its decoded JSON.

    The request uses a 15-second timeout; ``raise_for_status`` is called on
    the response before the body is decoded, so an error status propagates
    as an exception to the caller.
    """
    health_url = f"{TARGET_URL}/api/health"
    response = await client.get(health_url, timeout=15)
    response.raise_for_status()
    payload = response.json()
    return payload
async def ask(client: httpx.AsyncClient, question: str) -> dict:
    """POST *question* to ``/api/chat`` and return the decoded JSON reply.

    Audio is explicitly disabled (``return_audio`` is sent as ``False``) and
    the request is bounded by ``PER_QUERY_TIMEOUT``. ``raise_for_status`` is
    called before decoding, so an error status propagates as an exception.
    """
    chat_url = f"{TARGET_URL}/api/chat"
    body = {"user_text": question, "return_audio": False}
    response = await client.post(chat_url, json=body, timeout=PER_QUERY_TIMEOUT)
    response.raise_for_status()
    return response.json()
# Substrings that mark a reply as a refusal (matched against the lowercased reply).
_REFUSAL_MARKERS = ("don't see", "don't have", "rather not")

# Minimum fraction of passing questions for a zero exit status.
_MIN_PASS_RATE = 0.6


def _md_cell(value: object) -> str:
    """Render *value* so it cannot break out of a single Markdown table cell."""
    flat = str(value).replace("\r", " ").replace("\n", " ")
    return flat.replace("|", "\\|")


def _judge(reply: str, citations: list, blocked: bool, expected_refusal: bool) -> tuple[bool, str]:
    """Apply the pass criteria to one response and return ``(ok, reason)``."""
    if expected_refusal:
        lowered = reply.lower()
        refused = blocked or any(marker in lowered for marker in _REFUSAL_MARKERS)
        return (True, "correctly refused") if refused else (False, "did NOT refuse")
    if blocked:
        return False, "blocked unexpectedly"
    if not reply.strip():
        return False, "empty reply"
    if not citations:
        return False, "no citations"
    return True, "answered with citation"


def _latency_percentiles(latencies: list[int]) -> tuple[int, int]:
    """Return ``(p50, p95)`` of *latencies* in ms, or ``(-1, -1)`` when empty."""
    if not latencies:
        return -1, -1
    ordered = sorted(latencies)
    count = len(ordered)
    p50 = ordered[count // 2]
    p95 = ordered[min(count - 1, int(count * 0.95))]
    return p50, p95


def _render_report(ts: str, health: dict, rows: list[dict], p50: int, p95: int, budget_pass: bool) -> str:
    """Build the Markdown report (headline metrics plus per-question table)."""
    passes = sum(1 for r in rows if r["ok"])
    pass_rate = passes / max(1, len(rows))
    md = [
        f"# Live-site verification — {ts}\n",
        f"**Target:** `{TARGET_URL}`",
        f"**Health check:** {health.get('status')} (providers: {health.get('providers_ok')})",
        "",
        "## Headline",
        "",
        "| Metric | Value |",
        "| --- | --- |",
        f"| Pass rate | **{passes}/{len(rows)} ({pass_rate*100:.1f}%)** |",
        f"| Latency p50 | {p50} ms |",
        f"| Latency p95 | {p95} ms |",
        f"| Latency budget (≤{LATENCY_BUDGET_P95_MS}ms p95) | {'✅ PASS' if budget_pass else '❌ FAIL'} |",
        "",
        "## Per-question results",
        "",
        "| # | OK | Question | Reason | Brain | Latency | Citations |",
        "| --- | --- | --- | --- | --- | --- | --- |",
    ]
    for r in rows:
        ok_tag = "✓" if r["ok"] else "✗"
        # Question and reason are free text (reason may carry an exception
        # message), so they are escaped to keep the table well-formed.
        md.append(
            f"| {r['n']} | {ok_tag} | {_md_cell(r['question'])} | {_md_cell(r['reason'])} "
            f"| {_md_cell(r['brain'])} | {r['latency_ms']} ms | {r['citation_count']} |"
        )
    md.extend([
        "",
        "---",
        "",
        "_Generated by `tests/live_verify.py`. Cron this nightly to catch regressions._",
    ])
    return "\n".join(md)


async def main() -> int:
    """Run the live verification and write a Markdown report.

    Loads the gold Q&A file, draws a deterministic sample of up to
    ``SAMPLE_SIZE`` questions, checks ``/api/health``, sends each question to
    ``/api/chat``, grades every reply, and writes
    ``RESULTS_DIR/live_results_<ts>.md``.

    Returns:
        ``0`` when at least 60% of the sampled questions pass; ``1`` when the
        gold file is missing, the health check fails, or the pass rate is
        lower.
    """
    if not GOLD_FILE.exists():
        print("eval/gold_qa.json missing — run `python -m eval.generate_gold` first.")
        return 1

    # Explicit UTF-8: the gold file and the report both contain non-ASCII text,
    # and the locale default encoding is not UTF-8 on every platform.
    gold = json.loads(GOLD_FILE.read_text(encoding="utf-8"))

    # A private seeded generator yields the same sample as seeding the global
    # RNG with 42, without altering global random state.
    sample = random.Random(42).sample(gold, k=min(SAMPLE_SIZE, len(gold)))

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = time.strftime("%Y-%m-%dT%H-%M-%S")
    md_path = RESULTS_DIR / f"live_results_{ts}.md"

    rows: list[dict] = []
    latencies: list[int] = []

    async with httpx.AsyncClient() as client:
        try:
            h = await health_check(client)
        except Exception as e:  # top-level boundary: record the failure and stop
            md_path.write_text(
                f"# Live verify — FAILED\n\nHealth check failed: {e}\n",
                encoding="utf-8",
            )
            print(f"FAIL: {e}")
            return 1

        total = len(sample)
        for i, g in enumerate(sample, 1):
            question = g["question"]
            # Monotonic clock: unaffected by wall-clock adjustments mid-request.
            started = time.perf_counter()
            try:
                resp = await ask(client, question)
                elapsed_ms = int((time.perf_counter() - started) * 1000)
                latencies.append(elapsed_ms)

                # `or` fallbacks also cover keys present with a JSON null,
                # which a plain .get(key, default) would pass through as None.
                reply = resp.get("reply_text") or ""
                citations = resp.get("citations") or []
                blocked = bool(resp.get("blocked", False))
                brain = str(resp.get("brain_used") or "?")
                expected_refusal = g.get("expected_refusal", False)

                ok, reason = _judge(reply, citations, blocked, expected_refusal)
                row = {
                    "n": i,
                    "question": question[:80],
                    "expected_refusal": expected_refusal,
                    "ok": ok,
                    "reason": reason,
                    "brain": brain.split("::")[0],
                    "latency_ms": elapsed_ms,
                    "citation_count": len(citations),
                }
                status_line = f"[{i}/{total}] {'✓' if ok else '✗'} {reason} ({elapsed_ms}ms)"
            except Exception as e:  # any per-question failure counts as a fail row
                row = {
                    "n": i,
                    "question": question[:80],
                    "expected_refusal": g.get("expected_refusal", False),
                    "ok": False,
                    "reason": f"exception: {e}",
                    "brain": "?",
                    "latency_ms": -1,
                    "citation_count": 0,
                }
                status_line = f"[{i}/{total}] ✗ exception: {e}"
            rows.append(row)
            print(status_line)

    # Tallies are derived from the rows so each question is counted exactly once,
    # even when an error occurs part-way through grading it.
    passes = sum(1 for r in rows if r["ok"])
    pass_rate = passes / max(1, len(rows))

    # Requests that raised contribute no latency sample, so the percentiles
    # describe successful round-trips only.
    p50, p95 = _latency_percentiles(latencies)
    # With no samples p95 is the -1 sentinel; that must not read as "within budget".
    budget_pass = bool(latencies) and p95 <= LATENCY_BUDGET_P95_MS

    md_path.write_text(
        _render_report(ts, h, rows, p50, p95, budget_pass),
        encoding="utf-8",
    )
    print(f"\nWrote {md_path.relative_to(ROOT)}")
    print(f"Pass: {passes}/{len(rows)} ({pass_rate*100:.1f}%) | p50={p50}ms | p95={p95}ms")

    # NOTE(review): a latency-budget breach is reported but does not affect the
    # exit status — confirm whether the nightly job should fail on it.
    return 0 if pass_rate >= _MIN_PASS_RATE else 1
# Script entry point: run the async driver and use its return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = asyncio.run(main())
    raise SystemExit(exit_status)