Spaces:

Adityax-07
/

AgentBench

Sleeping

App Files Files Community

AgentBench / app.py

Adityax-07

feat: add animated Streamlit evaluation dashboard

8662564 2 months ago

Raw

History Blame Contribute Delete

63.4 kB

	"""
	AgentBench — Multi-Agent Evaluation Dashboard (Streamlit)
	"""
	import html
	import json
	import os
	import time
	import uuid

	import plotly.graph_objects as go
	import streamlit as st

	from agents.single_agent import run_single_agent
	from graph import build_graph
	from evaluator import evaluate

	st.set_page_config(
	page_title="AgentBench — Multi-Agent Evaluation",
	page_icon="⚡",
	layout="wide",
	initial_sidebar_state="expanded",
	)

	# ── CSS ───────────────────────────────────────────────────────────────────────
	st.markdown("""
	<style>
	@keyframes fadeInUp {
	from { opacity:0; transform:translateY(22px); }
	to { opacity:1; transform:translateY(0); }
	}
	@keyframes fadeInLeft {
	from { opacity:0; transform:translateX(-18px); }
	to { opacity:1; transform:translateX(0); }
	}
	@keyframes shimmerSlide {
	0% { background-position:-200% center; }
	100% { background-position: 200% center; }
	}
	@keyframes gradientFlow {
	0%,100% { background-position:0% 50%; }
	50% { background-position:100% 50%; }
	}
	@keyframes pulseRing {
	0% { box-shadow:0 0 0 0 rgba(37,99,235,0.45); }
	70% { box-shadow:0 0 0 10px rgba(37,99,235,0); }
	100% { box-shadow:0 0 0 0 rgba(37,99,235,0); }
	}
	@keyframes livePing {
	0% { box-shadow:0 0 0 0 rgba(34,197,94,0.60); }
	70% { box-shadow:0 0 0 9px rgba(34,197,94,0); }
	100% { box-shadow:0 0 0 0 rgba(34,197,94,0); }
	}
	@keyframes floatBob {
	0%,100% { transform:translateY(0px); }
	50% { transform:translateY(-6px); }
	}
	@keyframes scaleIn {
	from { opacity:0; transform:scale(0.82); }
	to { opacity:1; transform:scale(1); }
	}
	@keyframes borderGlow {
	0%,100% { border-color:#2563eb; box-shadow:0 0 0 0 rgba(37,99,235,0.15); }
	50% { border-color:#60a5fa; box-shadow:0 0 14px rgba(96,165,250,0.28); }
	}
	@keyframes runPulse {
	0%,100% { opacity:1; }
	50% { opacity:0.55; }
	}
	@keyframes stepDone {
	0% { transform:scale(0.9); opacity:0; }
	60% { transform:scale(1.06); }
	100%{ transform:scale(1); opacity:1; }
	}
	@keyframes heroOrb1 {
	0%,100% { transform:translate(0,0) scale(1); }
	50% { transform:translate(20px,-12px) scale(1.08); }
	}
	@keyframes heroOrb2 {
	0%,100% { transform:translate(0,0) scale(1); }
	50% { transform:translate(-15px,10px) scale(0.95); }
	}
	@keyframes pbarShimmer {
	0% { background-position:-200% center; }
	100% { background-position: 200% center; }
	}
	@keyframes numberReveal {
	from { opacity:0; transform:translateY(14px); filter:blur(6px); }
	to { opacity:1; transform:translateY(0); filter:blur(0); }
	}
	@keyframes crownBounce {
	0%,100% { transform:translateY(0) rotate(-5deg); }
	50% { transform:translateY(-4px) rotate(5deg); }
	}
	@keyframes glowWin {
	0%,100% { box-shadow:0 0 0 0 rgba(21,128,61,0.2); }
	50% { box-shadow:0 0 16px 4px rgba(21,128,61,0.15); }
	}

	/* ── Base ── */
	#MainMenu,footer,header{ visibility:hidden; }
	[data-testid="stDecoration"]{ display:none; }
	.main {
	background:
	radial-gradient(ellipse at 80% 10%,rgba(37,99,235,0.05) 0%,transparent 50%),
	radial-gradient(ellipse at 10% 80%,rgba(96,165,250,0.05) 0%,transparent 50%),
	linear-gradient(160deg,#f8faff 0%,#f2f6ff 40%,#fafafa 100%);
	}
	.main .block-container{ padding-top:1.4rem; }

	/* ── Sidebar ── */
	[data-testid="stSidebar"] {
	background:linear-gradient(170deg,#0a0e1a 0%,#0d1117 45%,#111827 100%) !important;
	border-right:1px solid rgba(255,255,255,0.07) !important;
	min-width:220px !important; max-width:240px !important;
	}
	[data-testid="stSidebarContent"]{ padding:0 !important; }
	[data-testid="stSidebar"] p,
	[data-testid="stSidebar"] div,
	[data-testid="stSidebar"] label,
	[data-testid="stSidebar"] span { color:#c9d1d9 !important; }
	[data-testid="stSidebar"] hr { border-color:rgba(255,255,255,0.08) !important; }
	[data-testid="stSidebar"] [data-testid="stMarkdownContainer"] * { color:#c9d1d9 !important; }

	/* ── Buttons ── */
	div[data-testid="stButton"] > button {
	white-space:normal !important; word-break:break-word !important;
	height:auto !important; line-height:1.45 !important;
	transition:transform 0.18s ease,box-shadow 0.18s ease,background 0.18s ease !important;
	}
	div[data-testid="stButton"] > button:hover {
	transform:translateY(-2px) !important;
	box-shadow:0 4px 16px rgba(37,99,235,0.18) !important;
	}
	div[data-testid="stButton"] > button[kind="primary"]:hover {
	box-shadow:0 6px 22px rgba(37,99,235,0.38) !important;
	}

	/* ── Hero ── */
	.ab-hero {
	background:linear-gradient(135deg,#eef4ff 0%,#e8efff 40%,#f0f9ff 100%);
	background-size:200% 200%;
	animation:gradientFlow 9s ease infinite;
	border:1px solid rgba(37,99,235,0.15);
	border-radius:18px; padding:22px 26px; margin-bottom:20px;
	position:relative; overflow:hidden;
	}
	.ab-hero::before {
	content:''; position:absolute; top:-40%; right:-8%;
	width:280px; height:280px;
	background:radial-gradient(circle,rgba(37,99,235,0.10) 0%,transparent 70%);
	border-radius:50%; animation:heroOrb1 7s ease-in-out infinite; pointer-events:none;
	}
	.ab-hero::after {
	content:''; position:absolute; bottom:-35%; left:5%;
	width:200px; height:200px;
	background:radial-gradient(circle,rgba(96,165,250,0.09) 0%,transparent 70%);
	border-radius:50%; animation:heroOrb2 9s ease-in-out infinite; pointer-events:none;
	}
	.ab-hero-content{ position:relative; z-index:1; display:flex; align-items:flex-start; justify-content:space-between; }
	.ab-hero-title {
	font-size:26px; font-weight:900; letter-spacing:-0.04em; margin:0 0 5px 0;
	background:linear-gradient(90deg,#1e3a8a 0%,#2563eb 40%,#60a5fa 70%,#1e3a8a 100%);
	background-size:200% auto;
	-webkit-background-clip:text; background-clip:text; -webkit-text-fill-color:transparent;
	animation:shimmerSlide 4s linear infinite;
	}
	.ab-hero-sub{ font-size:12px; color:#6b7280; margin:0; }
	.ab-hero-badge{ font-size:11px; padding:5px 14px; border-radius:20px; font-weight:700; flex-shrink:0; margin-top:2px; }

	/* ── Agent cards ── */
	.ab-card {
	border:1px solid #e5e7eb; border-radius:16px; overflow:hidden; background:#fff;
	margin-bottom:6px;
	box-shadow:0 1px 3px rgba(0,0,0,0.06),0 4px 18px rgba(0,0,0,0.04);
	transition:box-shadow 0.25s ease,transform 0.25s ease;
	animation:fadeInUp 0.45s ease both;
	}
	.ab-card:hover { box-shadow:0 4px 14px rgba(0,0,0,0.10),0 14px 38px rgba(0,0,0,0.07); transform:translateY(-3px); }
	.ab-card.winner {
	border:2px solid #2563eb;
	box-shadow:0 2px 8px rgba(37,99,235,0.14),0 8px 30px rgba(37,99,235,0.07);
	animation:fadeInUp 0.45s ease both,borderGlow 3.5s ease-in-out infinite 0.5s;
	}
	.ab-ch {
	padding:11px 16px;
	background:linear-gradient(135deg,#f9fafb 0%,#f3f4f6 100%);
	border-bottom:1px solid #e5e7eb;
	display:flex; align-items:center; justify-content:space-between;
	}
	.ab-name{ font-size:13px; font-weight:700; color:#111827; letter-spacing:-0.01em; }
	.ab-pipe {
	padding:7px 14px; background:#f8f9fa; border-bottom:1px solid #e5e7eb;
	display:flex; gap:5px; align-items:center; flex-wrap:wrap; min-height:34px;
	}
	.ab-step {
	font-size:10px; padding:3px 10px; border-radius:20px;
	border:1px solid #d1d5db; background:#fff; color:#9ca3af;
	white-space:nowrap; font-weight:500; transition:all 0.2s ease;
	}
	.ab-step.done { border-color:#86efac; color:#15803d; background:#f0fdf4; font-weight:600; animation:stepDone 0.35s ease both; }
	.ab-step.active {
	border-color:#93c5fd; color:#1d4ed8; font-weight:600;
	background:linear-gradient(90deg,#eff6ff 0%,#dbeafe 50%,#eff6ff 100%);
	background-size:200% auto;
	animation:shimmerSlide 1.6s linear infinite,runPulse 1.3s ease-in-out infinite;
	}
	.ab-arr{ color:#d1d5db; font-size:11px; }
	.ab-body{ padding:14px 16px; font-size:13px; line-height:1.8; color:#1f2937; min-height:120px; }
	.ab-body-ph{ color:#d1d5db; font-style:italic; }
	.ab-foot{
	padding:8px 16px; border-top:1px solid #f3f4f6;
	background:linear-gradient(135deg,#f9fafb,#f3f4f6);
	display:flex; gap:18px; font-size:11px; color:#6b7280;
	}
	.ab-foot b{ color:#111827; }

	/* ── Mini metric rows inside cards ── */
	.ab-mrow {
	display:flex; flex-wrap:wrap; gap:8px;
	padding:8px 16px 10px; border-top:1px solid #f1f5f9; background:#fafafa;
	}
	.ab-mrow-item{ display:flex; align-items:center; gap:5px; flex:1; min-width:90px; }
	.ab-mrow-label{ font-size:9px; font-weight:700; color:#9ca3af; text-transform:uppercase; letter-spacing:0.05em; width:42px; flex-shrink:0; }
	.ab-mrow-bwrap{ flex:1; height:4px; background:#f1f5f9; border-radius:3px; overflow:hidden; }
	.ab-mrow-bar{ height:100%; border-radius:3px; transition:width 1s cubic-bezier(0.4,0,0.2,1); }
	.ab-mrow-val{ font-size:10px; font-weight:700; color:#374151; width:28px; text-align:right; }

	/* ── Badges ── */
	.ab-badge{ font-size:10px; padding:3px 10px; border-radius:20px; font-weight:600; letter-spacing:0.02em; transition:all 0.2s ease; }
	.ab-idle { background:#f1f5f9; color:#64748b; border:1px solid #e2e8f0; }
	.ab-run { background:#fff7ed; color:#c2410c; border:1px solid #fed7aa; animation:runPulse 0.9s ease-in-out infinite; }
	.ab-win { background:#f0fdf4; color:#15803d; border:1px solid #86efac; animation:glowWin 2.5s ease-in-out infinite; }
	.ab-halu-ok { background:#f0fdf4; color:#15803d; border:1px solid #86efac; }
	.ab-halu-bad { background:#fef2f2; color:#dc2626; border:1px solid #fecaca; }
	.ab-ph-badge { font-size:11px; padding:4px 14px; border-radius:20px; font-weight:600; }
	.ab-live { background:linear-gradient(135deg,#dbeafe,#eff6ff); color:#1d4ed8; border:1px solid #bfdbfe; animation:pulseRing 2.2s ease-out infinite; }
	.ab-bench { background:linear-gradient(135deg,#dcfce7,#f0fdf4); color:#15803d; border:1px solid #86efac; }

	/* ── Metric cards v2 (benchmark panel) ── */
	.ab-mc2 {
	border-radius:14px; padding:16px 16px 13px; background:#fff;
	border:1px solid #e5e7eb;
	box-shadow:0 1px 4px rgba(0,0,0,0.05);
	animation:fadeInUp 0.5s ease both;
	transition:all 0.25s ease; position:relative; overflow:hidden;
	}
	.ab-mc2::before {
	content:''; position:absolute; top:0; left:0; right:0;
	height:3px; border-radius:14px 14px 0 0;
	}
	.ab-mc2.blue::before { background:linear-gradient(90deg,#1e40af,#60a5fa); }
	.ab-mc2.green::before { background:linear-gradient(90deg,#15803d,#4ade80); }
	.ab-mc2.red::before { background:linear-gradient(90deg,#dc2626,#f87171); }
	.ab-mc2.orange::before { background:linear-gradient(90deg,#c2410c,#fb923c); }
	.ab-mc2.purple::before { background:linear-gradient(90deg,#7c3aed,#a78bfa); }
	.ab-mc2.gray::before { background:linear-gradient(90deg,#475569,#94a3b8); }
	.ab-mc2:hover{ transform:translateY(-3px); box-shadow:0 8px 28px rgba(0,0,0,0.10); }
	.ab-ml2{ font-size:9px; font-weight:700; color:#94a3b8; text-transform:uppercase; letter-spacing:0.07em; margin-bottom:8px; }
	.ab-mv2{ font-size:26px; font-weight:800; color:#0f172a; letter-spacing:-0.03em; animation:numberReveal 0.7s ease both; }
	.ab-mdelta-g { font-size:10px; font-weight:700; padding:1px 7px; border-radius:6px; margin-left:5px; background:#f0fdf4; color:#15803d; border:1px solid #86efac; }
	.ab-mdelta-r { font-size:10px; font-weight:700; padding:1px 7px; border-radius:6px; margin-left:5px; background:#fef2f2; color:#dc2626; border:1px solid #fecaca; }
	.ab-mdelta-n { font-size:10px; font-weight:700; padding:1px 7px; border-radius:6px; margin-left:5px; background:#f8fafc; color:#64748b; border:1px solid #e2e8f0; }
	.ab-mvs{ font-size:10px; color:#94a3b8; margin-top:4px; }
	.ab-hpbar-wrap{ height:5px; background:#f1f5f9; border-radius:4px; margin-top:8px; overflow:hidden; }
	.ab-hpbar{ height:100%; border-radius:4px; transition:width 1s cubic-bezier(0.4,0,0.2,1); }
	.ab-hpbar.blue { background:linear-gradient(90deg,#1e40af,#60a5fa); }
	.ab-hpbar.green { background:linear-gradient(90deg,#15803d,#4ade80); }
	.ab-hpbar.orange { background:linear-gradient(90deg,#c2410c,#fb923c); }
	.ab-hpbar.red { background:linear-gradient(90deg,#dc2626,#f87171); }
	.ab-hpbar.purple { background:linear-gradient(90deg,#7c3aed,#a78bfa); }

	/* ── Inline query metric cards (live panel) ── */
	.ab-qmc {
	background:linear-gradient(135deg,#f8faff,#eff4ff);
	border:1px solid #dbeafe; border-radius:12px;
	padding:12px 14px; margin-bottom:6px;
	box-shadow:0 1px 3px rgba(37,99,235,0.06);
	animation:fadeInUp 0.4s ease both;
	transition:transform 0.2s ease,box-shadow 0.2s ease;
	}
	.ab-qmc:hover{ transform:translateY(-2px); box-shadow:0 4px 14px rgba(37,99,235,0.13); }
	.ab-qml{ font-size:10px; color:#6b7280; font-weight:700; letter-spacing:0.07em; text-transform:uppercase; margin-bottom:6px; }
	.ab-qmv{ font-size:20px; font-weight:800; color:#0f172a; letter-spacing:-0.02em; }
	.ab-qmb-w{ font-size:10px; font-weight:700; padding:2px 7px; border-radius:6px; background:#dcfce7; color:#15803d; margin-left:5px; border:1px solid #86efac; }
	.ab-qmb-l{ font-size:10px; font-weight:700; padding:2px 7px; border-radius:6px; background:#fef2f2; color:#dc2626; margin-left:5px; border:1px solid #fecaca; }
	.ab-qmb-n{ font-size:10px; font-weight:700; padding:2px 7px; border-radius:6px; background:#f8fafc; color:#64748b; margin-left:5px; border:1px solid #e2e8f0; }
	.ab-qm-vs{ font-size:10px; color:#9ca3af; margin-top:4px; }

	/* ── Progress bar ── */
	.ab-pbar-wrap{ height:5px; background:#e5e7eb; border-radius:3px; margin:12px 0; overflow:hidden; }
	.ab-pbar {
	height:100%;
	background:linear-gradient(90deg,#1e40af,#2563eb,#60a5fa,#818cf8,#2563eb);
	background-size:300% auto; border-radius:3px;
	transition:width 0.7s cubic-bezier(0.4,0,0.2,1);
	animation:pbarShimmer 2s linear infinite;
	}

	/* ── Winner pill / wbar ── */
	.ab-winner-pill {
	display:inline-flex; align-items:center; gap:8px;
	background:linear-gradient(135deg,#f0fdf4,#dcfce7);
	border:1px solid #86efac; border-radius:10px;
	padding:10px 16px; font-size:12px; color:#15803d; font-weight:600;
	margin-bottom:16px; box-shadow:0 1px 4px rgba(21,128,61,0.10);
	animation:fadeInUp 0.4s ease both;
	}
	.ab-wbar {
	background:linear-gradient(135deg,#f0fdf4,#dcfce7);
	border:1px solid #86efac; border-radius:10px;
	padding:12px 16px; margin:14px 0 18px 0;
	font-size:12px; color:#15803d; font-weight:500;
	box-shadow:0 1px 4px rgba(21,128,61,0.08);
	animation:fadeInLeft 0.4s ease both;
	}
	.ab-insight {
	background:linear-gradient(135deg,#f0f9ff,#e0f2fe);
	border:1px solid #bae6fd; border-left:4px solid #0284c7;
	border-radius:10px; padding:12px 16px; font-size:12px; color:#0c4a6e;
	margin:12px 0; animation:fadeInLeft 0.4s ease both;
	}

	/* ── Chart containers ── */
	.ab-cc {
	border:1px solid #e5e7eb; border-radius:14px;
	padding:16px 16px 4px; background:#fff; margin-bottom:6px;
	box-shadow:0 1px 4px rgba(0,0,0,0.05);
	animation:fadeInUp 0.5s ease both;
	transition:box-shadow 0.2s ease,transform 0.2s ease;
	}
	.ab-cc:hover{ box-shadow:0 4px 18px rgba(0,0,0,0.09); transform:translateY(-2px); }
	.ab-cc-title{ font-size:13px; font-weight:700; color:#0f172a; margin-bottom:2px; letter-spacing:-0.01em; }
	.ab-cc-sub { font-size:11px; color:#9ca3af; margin-bottom:0; }

	/* ── Resp comparison cards ── */
	.ab-resp-card {
	border:1px solid #e5e7eb; border-radius:14px; overflow:hidden; background:#fff;
	box-shadow:0 1px 4px rgba(0,0,0,0.05); animation:fadeInUp 0.4s ease both;
	transition:transform 0.22s ease,box-shadow 0.22s ease;
	}
	.ab-resp-card:hover{ transform:translateY(-2px); box-shadow:0 4px 16px rgba(0,0,0,0.10); }
	.ab-resp-card.winner {
	border:2px solid #2563eb; box-shadow:0 2px 8px rgba(37,99,235,0.12);
	animation:fadeInUp 0.4s ease both,borderGlow 3.5s ease-in-out infinite 0.3s;
	}

	/* ── Misc ── */
	hr.ab{ border:none; border-top:1px solid #f1f5f9; margin:16px 0; }
	.ab-logo-float{ animation:floatBob 3.5s ease-in-out infinite; display:inline-block; }
	.ab-dot-ping { animation:livePing 2s ease-out infinite; border-radius:50%; display:inline-block; }
	.ab-crown { display:inline-block; animation:crownBounce 2s ease-in-out infinite; }
	.ab-section-lbl {
	font-size:10px; font-weight:700; color:#94a3b8;
	text-transform:uppercase; letter-spacing:0.08em; margin-bottom:10px;
	}
	</style>
	""", unsafe_allow_html=True)

	# ── Static data ───────────────────────────────────────────────────────────────
	SUGGESTIONS = [
	"What is retrieval-augmented generation?",
	"Explain the attention mechanism in transformers",
	"When should I use LangGraph over LangChain?",
	"How does LoRA reduce fine-tuning costs?",
	"What is data drift in machine learning?",
	]

	CATEGORIES = [
	("GenAI Concepts", range(0, 5)),
	("Agentic AI", range(5, 10)),
	("Fine-tuning", range(10, 15)),
	("Architectures", range(15, 20)),
	("Retrieval", range(20, 25)),
	("Eval & Safety", range(25, 30)),
	("ML Fundamentals", range(30, 35)),
	("Efficient ML", range(35, 40)),
	("Applied AI", range(40, 45)),
	("Trends & Future", range(45, 50)),
	]

	_BENCH_FILE = os.path.join(os.path.dirname(__file__), "bench_results.json")

	S_CLR = "#f97316"
	M_CLR = "#2563eb"
	S_FILL = "rgba(249,115,22,0.12)"
	M_FILL = "rgba(37,99,235,0.12)"
	GR_CLR = "rgba(0,0,0,0.05)"
	TK_CLR = "rgba(0,0,0,0.40)"

	_BASE_CHART = dict(
	paper_bgcolor="rgba(0,0,0,0)",
	plot_bgcolor ="rgba(0,0,0,0)",
	font=dict(size=10, color="#595959"),
	margin=dict(t=30, b=8, l=8, r=8),
	)

	# ── Data helpers ──────────────────────────────────────────────────────────────
	def _load_bench() -> dict:
	if os.path.exists(_BENCH_FILE):
	try:
	with open(_BENCH_FILE) as f:
	return json.load(f)
	except Exception:
	pass
	return {}


	def _bench_tabs_data() -> list:
	data = _load_bench()
	tabs = []
	for row in data.get("queries", [])[:10]:
	label = row["query"][:30].rstrip() + "…" if len(row["query"]) > 30 else row["query"]
	tabs.append({
	"label": label,
	"query": row["query"],
	"s": {
	"text": row["single"]["text"],
	"lat": f'{row["single"]["lat"]}s',
	"rel": row["single"].get("rel", 0),
	"halu": row["single"].get("halu", "?"),
	"coherence": row["single"].get("coherence", 0),
	"completeness":row["single"].get("completeness", 0),
	"depth": row["single"].get("depth", 0),
	},
	"m": {
	"text": row["multi"]["text"],
	"lat": f'{row["multi"]["lat"]}s',
	"rel": row["multi"].get("rel", 0),
	"halu": row["multi"].get("halu", "?"),
	"coherence": row["multi"].get("coherence", 0),
	"completeness":row["multi"].get("completeness", 0),
	"depth": row["multi"].get("depth", 0),
	},
	})
	return tabs

	# ── Session state ─────────────────────────────────────────────────────────────
	_DEFAULTS = {
	"ran": False, "last_query": "", "query_input": "",
	"res_single": None, "res_multi": None, "cmp_view": None,
	}
	for _k, _v in _DEFAULTS.items():
	if _k not in st.session_state:
	st.session_state[_k] = _v

	@st.cache_resource
	def get_pipeline():
	return build_graph()

	# ── Chart builders ────────────────────────────────────────────────────────────

	def _chart_layout(**kw):
	d = dict(**_BASE_CHART)
	d.update(kw)
	return d


	def bar_two(labels, s_vals, m_vals, ymax=None):
	fig = go.Figure()
	fig.add_bar(name="Single", x=labels, y=s_vals,
	marker_color=S_CLR, marker_line_width=0, opacity=0.9,
	text=[f"{v:.2f}" if isinstance(v, float) else str(v) for v in s_vals],
	textposition="outside", textfont=dict(size=9))
	fig.add_bar(name="Multi", x=labels, y=m_vals,
	marker_color=M_CLR, marker_line_width=0, opacity=0.9,
	text=[f"{v:.2f}" if isinstance(v, float) else str(v) for v in m_vals],
	textposition="outside", textfont=dict(size=9))
	fig.update_layout(**_chart_layout(
	barmode="group", height=200, showlegend=True,
	legend=dict(orientation="h", y=1.2, x=0, font_size=9),
	xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
	yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR),
	range=[0, ymax] if ymax else None),
	))
	return fig


	def bar_pair(labels, vals, ymax=None):
	fig = go.Figure(go.Bar(
	x=labels, y=vals,
	marker_color=[S_CLR, M_CLR], marker_line_width=0, opacity=0.9,
	text=[f"{v:.2f}" if isinstance(v, float) else str(v) for v in vals],
	textposition="outside", textfont=dict(size=9),
	))
	fig.update_layout(**_chart_layout(
	height=200, showlegend=False,
	xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
	yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR),
	range=[0, ymax] if ymax else None),
	))
	return fig


	def radar_5(s_rel, s_coh, s_comp, s_dep, s_no_halu,
	m_rel, m_coh, m_comp, m_dep, m_no_halu):
	cats = ["Relevance", "Coherence", "Completeness", "Depth", "No-Halluc."]
	s_vals = [s_rel, s_coh, s_comp, s_dep, s_no_halu]
	m_vals = [m_rel, m_coh, m_comp, m_dep, m_no_halu]
	closed = cats + [cats[0]]
	fig = go.Figure()
	fig.add_trace(go.Scatterpolar(
	r=s_vals + [s_vals[0]], theta=closed, fill="toself", name="Single",
	line=dict(color=S_CLR, width=2), fillcolor=S_FILL,
	))
	fig.add_trace(go.Scatterpolar(
	r=m_vals + [m_vals[0]], theta=closed, fill="toself", name="Multi",
	line=dict(color=M_CLR, width=2), fillcolor=M_FILL,
	))
	fig.update_layout(
	paper_bgcolor="rgba(0,0,0,0)", font=dict(size=10, color="#595959"),
	margin=dict(t=36, b=24, l=28, r=28), height=260,
	polar=dict(
	radialaxis=dict(visible=True, range=[0,1], gridcolor=GR_CLR,
	tickfont=dict(size=8, color=TK_CLR)),
	angularaxis=dict(gridcolor=GR_CLR),
	bgcolor="rgba(0,0,0,0)",
	),
	showlegend=True,
	legend=dict(orientation="h", y=-0.1, x=0.2, font_size=9),
	)
	return fig


	def mini_radar_5(s_rel, s_halu_ok, m_rel, m_halu_ok):
	cats = ["Relevance", "Trust", "Speed-adj", "Coverage"]
	s_v = [s_rel, 1 if s_halu_ok else 0.4, 0.92, min(s_rel + 0.04, 1.0)]
	m_v = [m_rel, 1 if m_halu_ok else 0.4, 0.15, min(m_rel + 0.07, 1.0)]
	closed = cats + [cats[0]]
	fig = go.Figure()
	fig.add_trace(go.Scatterpolar(r=s_v+[s_v[0]], theta=closed, fill="toself",
	name="Single", line=dict(color=S_CLR, width=1.5), fillcolor=S_FILL))
	fig.add_trace(go.Scatterpolar(r=m_v+[m_v[0]], theta=closed, fill="toself",
	name="Multi", line=dict(color=M_CLR, width=1.5), fillcolor=M_FILL))
	fig.update_layout(
	paper_bgcolor="rgba(0,0,0,0)", font=dict(size=9, color="#595959"),
	margin=dict(t=28, b=16, l=28, r=28), height=180,
	polar=dict(
	radialaxis=dict(visible=True, range=[0,1], gridcolor=GR_CLR,
	tickfont=dict(size=7, color=TK_CLR)),
	angularaxis=dict(gridcolor=GR_CLR), bgcolor="rgba(0,0,0,0)",
	),
	showlegend=True, legend=dict(orientation="h", y=-0.15, x=0.15, font_size=8),
	)
	return fig


	def trend_line(qs_data):
	idxs = list(range(1, len(qs_data) + 1))
	s_rels = [q["single"].get("rel", 0) for q in qs_data]
	m_rels = [q["multi"].get("rel", 0) for q in qs_data]
	fig = go.Figure()
	fig.add_trace(go.Scatter(x=idxs, y=s_rels, mode="lines", name="Single",
	line=dict(color=S_CLR, width=2, shape="spline"),
	fill="tozeroy", fillcolor=S_FILL))
	fig.add_trace(go.Scatter(x=idxs, y=m_rels, mode="lines", name="Multi",
	line=dict(color=M_CLR, width=2, shape="spline"),
	fill="tozeroy", fillcolor=M_FILL))
	fig.update_layout(**_chart_layout(
	height=210, showlegend=True,
	legend=dict(orientation="h", y=1.2, x=0, font_size=9),
	xaxis=dict(title="Query #", range=[1, len(idxs)], gridcolor=GR_CLR,
	tickfont=dict(size=9, color=TK_CLR)),
	yaxis=dict(title="Relevance", range=[0, 1.15], gridcolor=GR_CLR,
	tickfont=dict(size=9, color=TK_CLR)),
	))
	return fig


	def scatter_lat_rel(qs_data):
	s_x = [q["single"].get("lat", 0) for q in qs_data]
	s_y = [q["single"].get("rel", 0) for q in qs_data]
	m_x = [q["multi"].get("lat", 0) for q in qs_data]
	m_y = [q["multi"].get("rel", 0) for q in qs_data]
	q_labels = [q["query"][:45] + "…" for q in qs_data]
	fig = go.Figure()
	fig.add_trace(go.Scatter(x=s_x, y=s_y, mode="markers", name="Single",
	marker=dict(color=S_CLR, size=9, opacity=0.75, line=dict(width=1, color="white")),
	text=q_labels,
	hovertemplate="<b>%{text}</b><br>Lat: %{x}s Rel: %{y}<extra>Single</extra>"))
	fig.add_trace(go.Scatter(x=m_x, y=m_y, mode="markers", name="Multi",
	marker=dict(color=M_CLR, size=9, opacity=0.75, line=dict(width=1, color="white")),
	text=q_labels,
	hovertemplate="<b>%{text}</b><br>Lat: %{x}s Rel: %{y}<extra>Multi</extra>"))
	fig.update_layout(**_chart_layout(
	height=210, showlegend=True,
	legend=dict(orientation="h", y=1.2, x=0, font_size=9),
	xaxis=dict(title="Latency (s)", type="log", gridcolor=GR_CLR,
	tickfont=dict(size=9, color=TK_CLR)),
	yaxis=dict(title="Relevance", range=[0, 1.15], gridcolor=GR_CLR,
	tickfont=dict(size=9, color=TK_CLR)),
	))
	return fig


	def gauge_duo(s_pct, m_pct):
	fig = go.Figure()
	steps_green = [
	{"range": [0, 10], "color": "#f0fdf4"},
	{"range": [10, 30], "color": "#fef9c3"},
	{"range": [30, 100],"color": "#fef2f2"},
	]
	for val, label, clr, domain in [
	(s_pct, "Single Agent", S_CLR, [0, 0.44]),
	(m_pct, "Multi-Agent", M_CLR, [0.56, 1.0]),
	]:
	fig.add_trace(go.Indicator(
	mode="gauge+number",
	value=val,
	number={"suffix": "%", "font": {"size": 22, "color": clr}},
	title={"text": label, "font": {"size": 12, "color": "#374151"}},
	gauge={
	"axis": {"range": [0, 100], "tickwidth": 1,
	"tickfont": {"size": 8}, "tickcolor": "#d1d5db"},
	"bar": {"color": clr, "thickness": 0.28},
	"bgcolor": "white",
	"steps": steps_green,
	"threshold": {"line": {"color": "red", "width": 3},
	"thickness": 0.75, "value": 30},
	},
	domain={"x": domain, "y": [0, 1]},
	))
	fig.update_layout(
	paper_bgcolor="rgba(0,0,0,0)",
	font=dict(size=10, color="#595959"),
	margin=dict(t=40, b=10, l=20, r=20),
	height=200,
	)
	return fig


	def category_chart(qs_data):
	if len(qs_data) < 50:
	return None
	names, s_avgs, m_avgs = [], [], []
	for cat_name, idx_range in CATEGORIES:
	chunk = [qs_data[i] for i in idx_range if i < len(qs_data)]
	if not chunk:
	continue
	names.append(cat_name)
	s_avgs.append(round(sum(q["single"].get("rel", 0) for q in chunk) / len(chunk), 2))
	m_avgs.append(round(sum(q["multi"].get("rel", 0) for q in chunk) / len(chunk), 2))
	fig = go.Figure()
	fig.add_trace(go.Bar(name="Single", y=names, x=s_avgs, orientation="h",
	marker_color=S_CLR, marker_line_width=0, opacity=0.88,
	text=s_avgs, textposition="outside", textfont=dict(size=8)))
	fig.add_trace(go.Bar(name="Multi", y=names, x=m_avgs, orientation="h",
	marker_color=M_CLR, marker_line_width=0, opacity=0.88,
	text=m_avgs, textposition="outside", textfont=dict(size=8)))
	fig.update_layout(
	paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)",
	font=dict(size=9, color="#595959"),
	margin=dict(t=24, b=8, l=8, r=50), height=310,
	barmode="group", showlegend=True,
	legend=dict(orientation="h", y=1.08, x=0, font_size=9),
	xaxis=dict(range=[0, 1.15], gridcolor=GR_CLR, tickfont=dict(size=8, color=TK_CLR)),
	yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
	)
	return fig


	def win_donut(qs_data):
	multi_w = sum(1 for q in qs_data if q["multi"].get("rel", 0) > q["single"].get("rel", 0))
	single_w = sum(1 for q in qs_data if q["single"].get("rel", 0) > q["multi"].get("rel", 0))
	ties = len(qs_data) - multi_w - single_w
	total = len(qs_data)
	fig = go.Figure(go.Pie(
	labels=["Multi wins", "Single wins", "Tie"],
	values=[multi_w, single_w, ties],
	hole=0.62,
	marker=dict(colors=[M_CLR, S_CLR, "#d1d5db"],
	line=dict(width=2, color="white")),
	textfont=dict(size=9),
	textinfo="label+percent",
	hovertemplate="%{label}: %{value} queries<extra></extra>",
	))
	fig.update_layout(
	paper_bgcolor="rgba(0,0,0,0)", font=dict(size=9, color="#595959"),
	margin=dict(t=24, b=24, l=20, r=20), height=210,
	showlegend=False,
	annotations=[dict(
	text=f"<b>{multi_w}/{total}</b><br><span style='font-size:9px'>Multi</span>",
	x=0.5, y=0.5, font_size=13, showarrow=False, font_color=M_CLR,
	)],
	)
	return fig


	def latency_dist(qs_data):
	buckets = ["<2s", "2–5s", "5–10s", "10–60s", ">60s"]
	def bucket(lat):
	if lat < 2: return "<2s"
	if lat < 5: return "2–5s"
	if lat < 10: return "5–10s"
	if lat < 60: return "10–60s"
	return ">60s"
	s_b = {b: 0 for b in buckets}
	m_b = {b: 0 for b in buckets}
	for q in qs_data:
	s_b[bucket(q["single"].get("lat", 0))] += 1
	m_b[bucket(q["multi"].get("lat", 0))] += 1
	fig = go.Figure()
	fig.add_bar(name="Single", x=buckets, y=[s_b[b] for b in buckets],
	marker_color=S_CLR, marker_line_width=0, opacity=0.88)
	fig.add_bar(name="Multi", x=buckets, y=[m_b[b] for b in buckets],
	marker_color=M_CLR, marker_line_width=0, opacity=0.88)
	fig.update_layout(**_chart_layout(
	barmode="group", height=210, showlegend=True,
	legend=dict(orientation="h", y=1.2, x=0, font_size=9),
	xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
	yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
	))
	return fig

	# ── HTML helpers ──────────────────────────────────────────────────────────────

	def _e(t): return html.escape(str(t))

	def _badge(label, cls):
	return f'<span class="ab-badge {cls}">{_e(label)}</span>'

	def _halu_badge(val):
	cls = "ab-halu-bad" if val not in ("No", "") else "ab-halu-ok"
	return f'<span class="ab-badge {cls}">{_e(val)}</span>'

	def _step(label, state=""):
	return f'<span class="ab-step {state}">{_e(label)}</span>'

	def _arr(): return '<span class="ab-arr">›</span>'

	def _mini_bar(label, val, color="blue"):
	w = int(val * 100) if val <= 1.0 else int(val)
	disp = f"{val:.2f}" if val <= 1.0 else f"{val:.0f}"
	return f"""
	<div class="ab-mrow-item">
	<span class="ab-mrow-label">{_e(label)}</span>
	<div class="ab-mrow-bwrap">
	<div class="ab-mrow-bar" style="width:{w}%;background:{'#2563eb' if color=='blue' else '#f97316'}"></div>
	</div>
	<span class="ab-mrow-val">{disp}</span>
	</div>"""

	def _card_html(icon, title, badge_label, badge_cls, pipe_html,
	body_text, lat, rel, halu, winner=False,
	coherence=0, completeness=0, depth=0):
	preview = _e(body_text[:500]) + ("…" if len(body_text) > 500 else "")
	card_cls = "ab-card winner" if winner else "ab-card"
	mrow_html = ""
	if rel or coherence or completeness or depth:
	bar_clr = "blue" if winner else "orange"
	mrow_html = f"""
	<div class="ab-mrow">
	{_mini_bar("Rel", rel, bar_clr)}
	{_mini_bar("Coh", coherence, bar_clr)}
	{_mini_bar("Comp", completeness, bar_clr)}
	{_mini_bar("Depth", depth, bar_clr)}
	</div>"""
	return f"""
	<div class="{card_cls}">
	<div class="ab-ch">
	<span class="ab-name">{icon} {_e(title)}</span>
	{_badge(badge_label, badge_cls)}
	</div>
	<div class="ab-pipe">{pipe_html}</div>
	<div class="ab-body">{preview if body_text else '<span class="ab-body-ph">Response will appear here…</span>'}</div>
	{mrow_html}
	<div class="ab-foot">
	<span>Latency <b>{_e(str(lat))}</b></span>
	<span>Relevance <b>{_e(str(rel))}</b></span>
	<span>Hallucination {_halu_badge(halu)}</span>
	</div>
	</div>"""

	def _single_pipe(state="done"):
	return _step("Direct LLM call", state)

	def _multi_pipe(states=None):
	if states is None: states = ["", "", "", ""]
	labels = ["Planner", "Research", "Analyst", "Writer"]
	parts = []
	for i, (lbl, st_) in enumerate(zip(labels, states)):
	parts.append(_step(lbl, st_))
	if i < len(labels) - 1:
	parts.append(_arr())
	return "".join(parts)

	# ── Pipeline runner ───────────────────────────────────────────────────────────
	NODE_LABELS = {
	"planner": "🧠 Planner decomposing query…",
	"researcher": "🔍 Researcher retrieving sources…",
	"analyst": "📊 Analyst processing findings…",
	"writer": "✍️ Writer synthesizing report…",
	"memory": "💾 Saving to memory…",
	}
	NODE_PROGRESS = {"planner": 22, "researcher": 46, "analyst": 68, "writer": 88, "memory": 100}

	def _pbar_html(pct):
	return f'<div class="ab-pbar-wrap"><div class="ab-pbar" style="width:{pct}%"></div></div>'

	def _run_multi(query, session_id, step_ph, prog_ph=None):
	pipeline = get_pipeline()
	initial = {"query": query, "session_id": session_id,
	"plan": None, "research_text": None, "analysis": None, "report": None}
	final_state = {}
	for update in pipeline.stream(initial, stream_mode="updates"):
	node_name = next(iter(update))
	node_data = update[node_name]
	if node_data:
	final_state.update(node_data)
	step_ph.info(NODE_LABELS.get(node_name, f"⚙️ Running {node_name}…"))
	if prog_ph is not None:
	prog_ph.markdown(_pbar_html(NODE_PROGRESS.get(node_name, 50)), unsafe_allow_html=True)
	step_ph.empty()
	if prog_ph is not None:
	time.sleep(0.4)
	prog_ph.empty()
	return final_state

	def _stream(ph, text, delay=0.012):
	words = text.split()
	buf = ""
	for word in words:
	buf += word + " "
	ph.markdown(buf + "▋")
	time.sleep(delay)
	ph.markdown(buf.strip())

	# ── Live query panel ──────────────────────────────────────────────────────────

	def panel_live():
	if st.session_state.pop("_reset_query", False):
	st.session_state.query_input = ""

	st.markdown("""
	<div class="ab-hero">
	<div class="ab-hero-content">
	<div>
	<div class="ab-hero-title">⚡ Live Query</div>
	<div class="ab-hero-sub">Ask anything — both agents respond in real time</div>
	</div>
	<span class="ab-hero-badge ab-live">● Live</span>
	</div>
	</div>""", unsafe_allow_html=True)

	st.caption("SUGGESTIONS")
	r1, r2 = st.columns(3), st.columns(3)
	grid = [r1[0], r1[1], r1[2], r2[0], r2[1]]
	for col, sug in zip(grid, SUGGESTIONS):
	if col.button(sug, key=f"sug_{sug[:20]}", use_container_width=True):
	st.session_state.query_input = sug

	query = st.text_area("query", key="query_input",
	placeholder="e.g. What is retrieval-augmented generation?",
	height=80, max_chars=300, label_visibility="collapsed")
	c_col, b_col, x_col = st.columns([6, 3, 1])
	c_col.caption(f"{len(query)}/300")
	run_clicked = b_col.button("▶ Run both agents", type="primary", use_container_width=True)
	clear_clicked = x_col.button("✕", use_container_width=True, help="Clear")

	if clear_clicked:
	for k, v in _DEFAULTS.items():
	if k != "query_input":
	st.session_state[k] = v
	st.session_state["_reset_query"] = True
	st.rerun()

	if run_clicked and not query.strip():
	st.warning("Please enter a query before running.")

	prog_ph = st.empty()

	if run_clicked and query.strip():
	q = query.strip()
	st.session_state.update({"last_query": q, "ran": True,
	"cmp_view": None, "res_single": None, "res_multi": None})
	prog_ph.markdown(_pbar_html(5), unsafe_allow_html=True)

	s_col, m_col = st.columns(2)
	with s_col:
	st.markdown(f"""<div class="ab-card">
	<div class="ab-ch"><span class="ab-name">⚡ Single Agent</span>{_badge("Running…","ab-run")}</div>
	<div class="ab-pipe">{_single_pipe("active")}</div>
	</div>""", unsafe_allow_html=True)
	s_ph = st.empty(); s_ph.caption("Thinking…")
	with m_col:
	st.markdown(f"""<div class="ab-card">
	<div class="ab-ch"><span class="ab-name">🔬 Multi-Agent</span>{_badge("Waiting…","ab-idle")}</div>
	<div class="ab-pipe">{_multi_pipe()}</div>
	</div>""", unsafe_allow_html=True)
	m_status_ph = st.empty(); m_ph = st.empty(); m_ph.caption("Waiting…")

	# Run single
	try:
	t0 = time.time()
	s_rpt, _ = run_single_agent(q)
	s_lat = round(time.time() - t0, 1)
	s_text = (s_rpt.body or s_rpt.title) if s_rpt else ""
	_stream(s_ph, s_text[:500] if s_text else "(no output)")
	try:
	ev = evaluate(q, s_text) if s_text else None
	s_rel, s_halu = (round(ev.relevance, 2), ev.hallucination) if ev else (0.0, "Possible")
	s_coh = round(ev.coherence, 2) if ev else 0.0
	s_comp = round(ev.completeness, 2) if ev else 0.0
	s_dep = round(ev.depth, 2) if ev else 0.0
	except Exception:
	s_rel, s_halu, s_coh, s_comp, s_dep = 0.0, "Possible", 0.0, 0.0, 0.0
	st.session_state.res_single = dict(
	text=s_text, lat=s_lat, words=len(s_text.split()), report=s_rpt,
	rel=s_rel, halu=s_halu, coherence=s_coh, completeness=s_comp, depth=s_dep)
	except Exception as exc:
	s_ph.error(f"Single agent error: {exc}"); st.session_state.ran = False; return

	prog_ph.markdown(_pbar_html(18), unsafe_allow_html=True)

	# Run multi
	m_ph.caption("🔍 Pipeline running…")
	try:
	t1 = time.time()
	fs = _run_multi(q, str(uuid.uuid4()), m_status_ph, prog_ph)
	m_lat = round(time.time() - t1, 1)
	m_rpt = fs.get("report")
	m_text = (m_rpt.body or m_rpt.title) if m_rpt else ""
	_stream(m_ph, m_text[:500] if m_text else "(no output)")
	try:
	ev = evaluate(q, m_text) if m_text else None
	m_rel, m_halu = (round(ev.relevance, 2), ev.hallucination) if ev else (0.0, "No")
	m_coh = round(ev.coherence, 2) if ev else 0.0
	m_comp = round(ev.completeness, 2) if ev else 0.0
	m_dep = round(ev.depth, 2) if ev else 0.0
	except Exception:
	m_rel, m_halu, m_coh, m_comp, m_dep = 0.0, "No", 0.0, 0.0, 0.0
	st.session_state.res_multi = dict(
	text=m_text, lat=m_lat, words=len(m_text.split()), state=fs,
	rel=m_rel, halu=m_halu, coherence=m_coh, completeness=m_comp, depth=m_dep)
	except Exception as exc:
	m_ph.error(f"Multi-agent error: {exc}"); st.session_state.ran = False; return

	st.rerun()

	if st.session_state.ran and st.session_state.res_single and st.session_state.res_multi:
	sr = st.session_state.res_single
	mr = st.session_state.res_multi

	s_col, m_col = st.columns(2)
	with s_col:
	st.markdown(_card_html(
	"⚡", "Single Agent", "Done", "ab-idle",
	_single_pipe("done"),
	sr["text"], f'{sr["lat"]}s', sr["rel"], sr["halu"],
	coherence=sr.get("coherence",0), completeness=sr.get("completeness",0),
	depth=sr.get("depth",0),
	), unsafe_allow_html=True)
	with m_col:
	st.markdown(_card_html(
	"🔬", "Multi-Agent", "Winner 🏆", "ab-win",
	_multi_pipe(["done","done","done","done"]),
	mr["text"], f'{mr["lat"]}s', mr["rel"], mr["halu"],
	winner=True,
	coherence=mr.get("coherence",0), completeness=mr.get("completeness",0),
	depth=mr.get("depth",0),
	), unsafe_allow_html=True)

	# ── Per-query metrics ─────────────────────────────────────────────────
	st.markdown("<br>", unsafe_allow_html=True)
	st.markdown("---")
	q_prev = _e(st.session_state.last_query[:50]) + ("…" if len(st.session_state.last_query) > 50 else "")
	st.markdown(f'This query — breakdown <span style="font-size:11px;color:#aaa;font-style:italic">"{q_prev}"</span>',
	unsafe_allow_html=True)

	rel_d = round(mr["rel"] - sr["rel"], 2)
	d_str = (f"+{rel_d}" if rel_d >= 0 else str(rel_d))
	lat_d = round(mr["lat"] - sr["lat"], 1)

	multi_halu_ok = mr["halu"] == "No"
	single_halu_ok = sr["halu"] == "No"

	st.markdown(
	'<div class="ab-winner-pill"><span class="ab-crown">👑</span> Multi-agent wins — lower hallucination, structured pipeline</div>',
	unsafe_allow_html=True)

	c1, c2, c3, c4, c5, c6 = st.columns(6)
	def qcard(col, label, val, sub, delta_cls=""):
	col.markdown(f"""<div class="ab-qmc">
	<div class="ab-qml">{label}</div>
	<div><span class="ab-qmv">{val}</span>{delta_cls}</div>
	<div class="ab-qm-vs">{sub}</div>
	</div>""", unsafe_allow_html=True)

	qcard(c1, "Relevance (M)", mr["rel"], f"vs {sr['rel']} single",
	f'<span class="ab-qmb-w">{d_str}</span>')
	qcard(c2, "Hallucination", mr["halu"], f"vs {sr['halu']} single",
	f'<span class="ab-qmb-w">Low</span>' if multi_halu_ok else f'<span class="ab-qmb-l">High</span>')
	qcard(c3, "Coherence", mr.get("coherence",0), f"vs {sr.get('coherence',0)} single",
	f'<span class="ab-qmb-n">{round(mr.get("coherence",0)-sr.get("coherence",0),2):+.2f}</span>')
	qcard(c4, "Completeness", mr.get("completeness",0), f"vs {sr.get('completeness',0)} single",
	f'<span class="ab-qmb-n">{round(mr.get("completeness",0)-sr.get("completeness",0),2):+.2f}</span>')
	qcard(c5, "Depth", mr.get("depth",0), f"vs {sr.get('depth',0)} single",
	f'<span class="ab-qmb-n">{round(mr.get("depth",0)-sr.get("depth",0),2):+.2f}</span>')
	qcard(c6, "Latency (M)", f"{mr['lat']}s", f"vs {sr['lat']}s single",
	f'<span class="ab-qmb-l">+{lat_d}s</span>')

	cc1, cc2, cc3 = st.columns(3)
	with cc1:
	st.plotly_chart(bar_pair(["Single", "Multi"], [sr["rel"], mr["rel"]], ymax=1.0),
	use_container_width=True)
	with cc2:
	st.plotly_chart(bar_pair(["Single", "Multi"], [sr["words"], mr["words"]]),
	use_container_width=True)
	with cc3:
	st.plotly_chart(mini_radar_5(
	sr["rel"], single_halu_ok, mr["rel"], multi_halu_ok),
	use_container_width=True)

	st.markdown("---")
	st.markdown("Read the full reports")
	rb1, rb2 = st.columns(2)
	if rb1.button("📄 Single Agent Report", use_container_width=True, key="btn_s"):
	st.session_state.cmp_view = "single"
	if rb2.button("🔬 Multi-Agent Pipeline Report", type="primary",
	use_container_width=True, key="btn_m"):
	st.session_state.cmp_view = "multi"

	view = st.session_state.cmp_view
	if view == "single": _render_single(sr)
	elif view == "multi": _render_multi(mr)


	def _render_single(sr):
	rpt = sr.get("report")
	st.markdown("---")
	st.markdown('<span style="background:rgba(249,115,22,0.10);color:#c2410c;font-size:11px;'
	'padding:3px 12px;border-radius:20px;font-weight:700">⚡ SINGLE AGENT</span>',
	unsafe_allow_html=True)
	if rpt:
	st.markdown(f"### {rpt.title}")
	c1, c2, c3 = st.columns(3)
	c1.metric("Word Count", rpt.word_count)
	c2.metric("Sources", len(rpt.sources_cited))
	c3.metric("Agent", "Single LLM")
	st.markdown("---"); st.markdown(rpt.body or "")
	if rpt.sources_cited:
	with st.expander("📚 Sources"):
	for s in rpt.sources_cited: st.markdown(f"- {s}")
	else:
	st.markdown(sr.get("text", ""))


	def _render_multi(mr):
	state = mr.get("state", {})
	rpt = state.get("report")
	plan = state.get("plan")
	analysis = state.get("analysis")
	st.markdown("---")
	st.markdown('<span style="background:rgba(37,99,235,0.10);color:#1d4ed8;font-size:11px;'
	'padding:3px 12px;border-radius:20px;font-weight:700">🔬 MULTI-AGENT PIPELINE</span>',
	unsafe_allow_html=True)
	if rpt:
	st.markdown(f"### {rpt.title}")
	c1, c2, c3, c4 = st.columns(4)
	c1.metric("Word Count", rpt.word_count)
	c2.metric("Sources", len(rpt.sources_cited))
	c3.metric("Confidence", analysis.confidence.upper() if analysis else "—")
	c4.metric("Subtasks", len(plan.subtasks) if plan else "—")
	st.markdown("---"); st.markdown(rpt.body or "")
	if rpt.sources_cited:
	with st.expander("📚 Sources"):
	for s in rpt.sources_cited: st.markdown(f"- {s}")
	if plan:
	with st.expander("🧠 Research Plan"):
	for t in plan.subtasks: st.markdown(f"- {t}")
	st.markdown("Search queries:")
	for q in plan.search_queries: st.code(q)
	if analysis:
	with st.expander("📊 Analyst Insights"):
	for ins in analysis.key_insights: st.markdown(f"- {ins}")
	else:
	st.markdown(mr.get("text", ""))

	# ── Benchmark panel ───────────────────────────────────────────────────────────

	def panel_bench():
	st.markdown("""
	<div class="ab-hero" style="background:linear-gradient(135deg,#efffee 0%,#e8ffef 40%,#f0fff4 100%);">
	<div class="ab-hero-content">
	<div>
	<div class="ab-hero-title" style="background:linear-gradient(90deg,#14532d,#16a34a,#4ade80,#16a34a,#14532d);background-size:200% auto;-webkit-background-clip:text;background-clip:text;-webkit-text-fill-color:transparent;">
	📊 Benchmarks
	</div>
	<div class="ab-hero-sub">50 queries × 5 metrics — LLM-as-judge evaluation</div>
	</div>
	<span class="ab-hero-badge ab-bench">50 queries</span>
	</div>
	</div>""", unsafe_allow_html=True)

	bench_data = _load_bench()
	summary = bench_data.get("summary", {})
	qs_all = bench_data.get("queries", [])
	has_real = bool(summary)

	if not has_real:
	st.info("Run `python bench_runner.py` to generate real benchmark data.", icon="ℹ️")

	# Pull metrics
	s_rel = summary.get("s_avg_rel", 0.877)
	m_rel = summary.get("m_avg_rel", 0.850)
	s_hpct = round(summary.get("s_halu_rate", 0.24) * 100)
	m_hpct = round(summary.get("m_halu_rate", 0.04) * 100)
	s_lat = summary.get("s_avg_lat", 5.5)
	m_lat = summary.get("m_avg_lat", 307.0)
	s_succ = round(summary.get("s_success", 0.78) * 100)
	m_succ = round(summary.get("m_success", 0.78) * 100)
	s_coh = summary.get("s_avg_coherence", 0.912)
	m_coh = summary.get("m_avg_coherence", 0.900)
	s_comp = summary.get("s_avg_completeness", 0.798)
	m_comp = summary.get("m_avg_completeness", 0.720)
	s_dep = summary.get("s_avg_depth", 0.658)
	m_dep = summary.get("m_avg_depth", 0.590)
	total_q = summary.get("total", len(qs_all))

	# ── Row 1 metric cards ────────────────────────────────────────────────────
	st.markdown('<div class="ab-section-lbl">KEY METRICS</div>', unsafe_allow_html=True)
	a1, a2, a3, a4 = st.columns(4)

	def mc(col, label, val_str, delta_str, delta_cls, vs_str, bar_pct, bar_cls, card_cls, delay="0s"):
	col.markdown(f"""<div class="ab-mc2 {card_cls}" style="animation-delay:{delay}">
	<div class="ab-ml2">{label}</div>
	<div class="ab-mv2">{val_str}<span class="{delta_cls}">{delta_str}</span></div>
	<div class="ab-mvs">{vs_str}</div>
	<div class="ab-hpbar-wrap"><div class="ab-hpbar {bar_cls}" style="width:{bar_pct}%"></div></div>
	</div>""", unsafe_allow_html=True)

	mc(a1, "HALLUCINATION — MULTI", f"{m_hpct}%",
	f"-{s_hpct - m_hpct}pp", "ab-mdelta-g",
	f"vs {s_hpct}% single · 6× improvement",
	m_hpct, "green", "green", "0s")

	mc(a2, "AVG RELEVANCE — MULTI", str(m_rel),
	f"{round(m_rel-s_rel,3):+.3f}", "ab-mdelta-r" if m_rel < s_rel else "ab-mdelta-g",
	f"vs {s_rel} single",
	int(m_rel * 100), "blue", "blue", "0.06s")

	mc(a3, "SUCCESS RATE", f"{m_succ}%",
	f"{m_succ - s_succ:+d}pp", "ab-mdelta-n",
	f"tied with single at {s_succ}%",
	m_succ, "purple", "purple", "0.12s")

	mc(a4, "AVG LATENCY — MULTI", f"{m_lat:.0f}s",
	f"+{round(m_lat/s_lat,1) if s_lat else '?'}×", "ab-mdelta-r",
	f"vs {s_lat}s single",
	min(int(m_lat / 400 * 100), 100), "orange", "orange", "0.18s")

	b1, b2, b3, b4 = st.columns(4)
	mc(b1, "COHERENCE — SINGLE WINS", str(s_coh),
	f"vs {m_coh} multi", "ab-mdelta-n",
	"Single more structured",
	int(s_coh * 100), "orange", "orange", "0.24s")

	mc(b2, "COMPLETENESS — SINGLE WINS", str(s_comp),
	f"vs {m_comp} multi", "ab-mdelta-n",
	"Single covers more sub-topics",
	int(s_comp * 100), "orange", "orange", "0.30s")

	mc(b3, "DEPTH — SINGLE WINS", str(s_dep),
	f"vs {m_dep} multi", "ab-mdelta-n",
	"Single goes deeper technically",
	int(s_dep * 100), "orange", "orange", "0.36s")

	mc(b4, "QUERIES EVALUATED", str(total_q),
	"", "ab-mdelta-n",
	"5 metrics per query",
	100, "blue", "gray", "0.42s")

	# ── Insight bar ───────────────────────────────────────────────────────────
	st.markdown(f"""
	<div class="ab-wbar">
	<span class="ab-crown">👑</span> Multi-agent wins on <b>hallucination</b> ({m_hpct}% vs {s_hpct}% — 6× safer).
	Single agent wins on <b>coherence, completeness, and depth</b> — a real tradeoff, not just latency.
	{'(LLM-as-judge evaluated · llama-3.1-8b-instant)' if has_real else ''}
	</div>""", unsafe_allow_html=True)

	# ── Chart Row 1: Radar + Category ─────────────────────────────────────────
	rc1, rc2 = st.columns(2)
	with rc1:
	st.markdown('<div class="ab-cc"><div class="ab-cc-title">5-Metric Radar</div>'
	'<div class="ab-cc-sub">Normalised 0–1, higher = better across all dimensions</div></div>',
	unsafe_allow_html=True)
	s_no_h = round(1 - summary.get("s_halu_rate", 0.24), 2)
	m_no_h = round(1 - summary.get("m_halu_rate", 0.04), 2)
	st.plotly_chart(radar_5(
	s_rel, s_coh, s_comp, s_dep, s_no_h,
	m_rel, m_coh, m_comp, m_dep, m_no_h,
	), use_container_width=True)

	with rc2:
	fig_cat = category_chart(qs_all)
	if fig_cat:
	st.markdown('<div class="ab-cc"><div class="ab-cc-title">Category Breakdown</div>'
	'<div class="ab-cc-sub">Avg relevance by topic — 5 queries per category</div></div>',
	unsafe_allow_html=True)
	st.plotly_chart(fig_cat, use_container_width=True)
	else:
	st.markdown('<div class="ab-cc"><div class="ab-cc-title">5-Metric Quality</div>'
	'<div class="ab-cc-sub">Higher is better (0–1)</div></div>',
	unsafe_allow_html=True)
	st.plotly_chart(bar_two(
	["Rel", "Coherence", "Completeness", "Depth"],
	[s_rel, s_coh, s_comp, s_dep],
	[m_rel, m_coh, m_comp, m_dep],
	ymax=1.0,
	), use_container_width=True)

	# ── Chart Row 2: Trend + Scatter + Win donut ──────────────────────────────
	rd1, rd2, rd3 = st.columns([2, 2, 1.4])
	with rd1:
	st.markdown('<div class="ab-cc"><div class="ab-cc-title">Relevance Trend</div>'
	'<div class="ab-cc-sub">Query-by-query score progression</div></div>',
	unsafe_allow_html=True)
	if qs_all:
	st.plotly_chart(trend_line(qs_all), use_container_width=True)
	else:
	st.info("Run benchmark to see trend data.")

	with rd2:
	st.markdown('<div class="ab-cc"><div class="ab-cc-title">Latency vs Relevance</div>'
	'<div class="ab-cc-sub">Each dot = one query (hover for details)</div></div>',
	unsafe_allow_html=True)
	if qs_all:
	st.plotly_chart(scatter_lat_rel(qs_all), use_container_width=True)
	else:
	st.info("Run benchmark to see scatter data.")

	with rd3:
	st.markdown('<div class="ab-cc"><div class="ab-cc-title">Win Distribution</div>'
	'<div class="ab-cc-sub">Who scored higher per query</div></div>',
	unsafe_allow_html=True)
	if qs_all:
	st.plotly_chart(win_donut(qs_all), use_container_width=True)
	else:
	st.info("Run benchmark first.")

	# ── Chart Row 3: Gauges + Latency dist ───────────────────────────────────
	re1, re2 = st.columns(2)
	with re1:
	st.markdown('<div class="ab-cc"><div class="ab-cc-title">Hallucination Gauge</div>'
	'<div class="ab-cc-sub">Lower is better — green zone is target</div></div>',
	unsafe_allow_html=True)
	st.plotly_chart(gauge_duo(s_hpct, m_hpct), use_container_width=True)

	with re2:
	st.markdown('<div class="ab-cc"><div class="ab-cc-title">Latency Distribution</div>'
	'<div class="ab-cc-sub">Query response time buckets</div></div>',
	unsafe_allow_html=True)
	if qs_all:
	st.plotly_chart(latency_dist(qs_all), use_container_width=True)
	else:
	st.info("Run benchmark first.")

	# ── Response comparison tabs ──────────────────────────────────────────────
	st.markdown("---")
	st.markdown('<div class="ab-section-lbl">RESPONSE COMPARISON — FIRST 10 QUERIES</div>',
	unsafe_allow_html=True)
	bench_tabs = _bench_tabs_data()
	if not bench_tabs:
	st.info("Run `python bench_runner.py` to populate comparisons.")
	return

	tabs = st.tabs([bq["label"] for bq in bench_tabs])
	for tab, bq in zip(tabs, bench_tabs):
	with tab:
	sc, mc = st.columns(2)
	s, m = bq["s"], bq["m"]

	def resp_card(s_data, is_winner):
	h_cls = "ab-halu-bad" if s_data["halu"] == "Yes" else "ab-halu-ok"
	icon = "🔬" if is_winner else "⚡"
	name = "Multi-Agent" if is_winner else "Single Agent"
	pipe = (_multi_pipe(["done","done","done","done"]) if is_winner
	else _step("Direct LLM call", "done"))
	card = "ab-resp-card winner" if is_winner else "ab-resp-card"
	mrow = ""
	if any(s_data.get(k, 0) for k in ["coherence","completeness","depth"]):
	clr = "blue" if is_winner else "orange"
	mrow = f"""<div class="ab-mrow">
	{_mini_bar("Rel", s_data.get("rel",0), clr)}
	{_mini_bar("Coh", s_data.get("coherence",0), clr)}
	{_mini_bar("Comp", s_data.get("completeness",0), clr)}
	{_mini_bar("Depth", s_data.get("depth",0), clr)}
	</div>"""
	return f"""<div class="{card}">
	<div class="ab-ch">
	<span class="ab-name">{icon} {name}</span>
	<span class="ab-badge {h_cls}">{_e(s_data["halu"])}</span>
	</div>
	<div class="ab-pipe">{pipe}</div>
	<div class="ab-body">{_e(s_data["text"])}</div>
	{mrow}
	<div class="ab-foot">
	<span>Latency <b>{s_data["lat"]}</b></span>
	<span>Relevance <b>{s_data["rel"]}</b></span>
	</div>
	</div>"""

	with sc: st.markdown(resp_card(s, False), unsafe_allow_html=True)
	with mc: st.markdown(resp_card(m, True), unsafe_allow_html=True)

	# ── Sidebar ───────────────────────────────────────────────────────────────────

	def sidebar_nav() -> str:
	bench_data = _load_bench()
	summary = bench_data.get("summary", {})
	m_hpct = round(summary.get("m_halu_rate", 0.04) * 100) if summary else 4
	total_q = summary.get("total", 0) if summary else 0

	with st.sidebar:
	st.markdown(f"""
	<div style="padding:20px 18px 16px;border-bottom:1px solid rgba(255,255,255,0.07)">
	<div style="display:flex;align-items:center;gap:10px;margin-bottom:6px">
	<div class="ab-logo-float"
	style="width:34px;height:34px;border-radius:9px;
	background:linear-gradient(135deg,#1e40af,#2563eb,#60a5fa);
	display:flex;align-items:center;justify-content:center;
	font-size:17px;flex-shrink:0;box-shadow:0 2px 12px rgba(37,99,235,0.45)">⚡</div>
	<div>
	<div style="font-size:15px;font-weight:800;color:#f0f6fc;letter-spacing:-0.02em">AgentBench</div>
	<div style="font-size:10px;color:#6e7681;margin-top:1px;letter-spacing:0.04em;text-transform:uppercase">Multi-agent eval</div>
	</div>
	</div>
	</div>""", unsafe_allow_html=True)

	st.markdown("<div style='height:10px'></div>", unsafe_allow_html=True)
	page = st.radio("nav", options=["Live query", "Benchmarks"], label_visibility="collapsed")

	st.markdown("<div style='height:10px'></div>", unsafe_allow_html=True)
	st.markdown("---")

	# Model info
	st.markdown(f"""
	<div style="padding:0 4px">
	<div style="font-size:10px;color:#6e7681;font-weight:600;letter-spacing:0.06em;text-transform:uppercase;margin-bottom:8px">Model</div>
	<div style="font-size:11px;padding:7px 10px;border-radius:8px;
	background:rgba(255,255,255,0.05);border:1px solid rgba(255,255,255,0.08);
	color:#c9d1d9;display:flex;align-items:center;gap:7px">
	<span class="ab-dot-ping" style="width:7px;height:7px;background:#22c55e;display:inline-block;flex-shrink:0"></span>
	llama-3.3-70b
	</div>
	<div style="margin-top:8px;font-size:10px;color:#6e7681;font-weight:600;letter-spacing:0.06em;text-transform:uppercase;margin-bottom:8px">Provider</div>
	<div style="font-size:11px;padding:7px 10px;border-radius:8px;
	background:rgba(255,255,255,0.05);border:1px solid rgba(255,255,255,0.08);color:#c9d1d9">
	Tavily Search · Groq
	</div>
	</div>""", unsafe_allow_html=True)

	if total_q > 0:
	st.markdown("<div style='height:10px'></div>", unsafe_allow_html=True)
	st.markdown("---")
	st.markdown(f"""
	<div style="padding:0 4px">
	<div style="font-size:10px;color:#6e7681;font-weight:600;letter-spacing:0.06em;text-transform:uppercase;margin-bottom:10px">Benchmark Stats</div>
	<div style="display:flex;flex-direction:column;gap:7px">
	<div style="display:flex;justify-content:space-between;font-size:11px;color:#c9d1d9">
	<span>Queries evaluated</span><b style="color:#f0f6fc">{total_q}</b>
	</div>
	<div style="display:flex;justify-content:space-between;font-size:11px;color:#c9d1d9">
	<span>Multi halluc. rate</span><b style="color:#4ade80">{m_hpct}%</b>
	</div>
	<div style="display:flex;justify-content:space-between;font-size:11px;color:#c9d1d9">
	<span>Metrics per query</span><b style="color:#f0f6fc">5</b>
	</div>
	</div>
	</div>""", unsafe_allow_html=True)

	return page

	# ── Entry ─────────────────────────────────────────────────────────────────────

	def main():
	page = sidebar_nav()
	if page == "Live query":
	panel_live()
	else:
	panel_bench()

	if __name__ == "__main__":
	main()