"""
AgentBench — Multi-Agent Evaluation Dashboard (Streamlit)
"""
import html
import json
import os
import time
import uuid
import plotly.graph_objects as go
import streamlit as st
from agents.single_agent import run_single_agent
from graph import build_graph
from evaluator import evaluate
# Browser-tab title/icon and overall page layout for the dashboard.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="AgentBench — Multi-Agent Evaluation",
    page_icon="⚡",
)
# ── CSS ───────────────────────────────────────────────────────────────────────
# Inject the dashboard's custom styling as raw HTML.
# NOTE(review): the literal below holds nothing but a newline — the stylesheet
# markup appears to be missing from this copy of the file; confirm against the
# original before shipping.
st.markdown("""
""", unsafe_allow_html=True)
# ── Static data ───────────────────────────────────────────────────────────────
SUGGESTIONS = [
"What is retrieval-augmented generation?",
"Explain the attention mechanism in transformers",
"When should I use LangGraph over LangChain?",
"How does LoRA reduce fine-tuning costs?",
"What is data drift in machine learning?",
]
CATEGORIES = [
("GenAI Concepts", range(0, 5)),
("Agentic AI", range(5, 10)),
("Fine-tuning", range(10, 15)),
("Architectures", range(15, 20)),
("Retrieval", range(20, 25)),
("Eval & Safety", range(25, 30)),
("ML Fundamentals", range(30, 35)),
("Efficient ML", range(35, 40)),
("Applied AI", range(40, 45)),
("Trends & Future", range(45, 50)),
]
_BENCH_FILE = os.path.join(os.path.dirname(__file__), "bench_results.json")
S_CLR = "#f97316"
M_CLR = "#2563eb"
S_FILL = "rgba(249,115,22,0.12)"
M_FILL = "rgba(37,99,235,0.12)"
GR_CLR = "rgba(0,0,0,0.05)"
TK_CLR = "rgba(0,0,0,0.40)"
_BASE_CHART = dict(
paper_bgcolor="rgba(0,0,0,0)",
plot_bgcolor ="rgba(0,0,0,0)",
font=dict(size=10, color="#595959"),
margin=dict(t=30, b=8, l=8, r=8),
)
# ── Data helpers ──────────────────────────────────────────────────────────────
def _load_bench() -> dict:
if os.path.exists(_BENCH_FILE):
try:
with open(_BENCH_FILE) as f:
return json.load(f)
except Exception:
pass
return {}
def _bench_tabs_data() -> list:
    """Build the per-query entries shown in the response-comparison tabs.

    Only the first ten benchmark queries are used. Each entry carries a tab
    label (the query, cut to 30 characters plus an ellipsis when longer), the
    full query, and one dict per agent under ``"s"`` (single) and ``"m"``
    (multi).
    """

    def _side(entry):
        # "text" and "lat" are required keys; the scores fall back to defaults.
        return {
            "text": entry["text"],
            "lat": f'{entry["lat"]}s',
            "rel": entry.get("rel", 0),
            "halu": entry.get("halu", "?"),
            "coherence": entry.get("coherence", 0),
            "completeness": entry.get("completeness", 0),
            "depth": entry.get("depth", 0),
        }

    tabs = []
    for row in _load_bench().get("queries", [])[:10]:
        query = row["query"]
        if len(query) > 30:
            label = query[:30].rstrip() + "…"
        else:
            label = query
        tabs.append({
            "label": label,
            "query": query,
            "s": _side(row["single"]),
            "m": _side(row["multi"]),
        })
    return tabs
# ── Session state ─────────────────────────────────────────────────────────────
# Initial values for the session-state keys this app reads; keys already
# present in the session are left untouched.
_DEFAULTS = {
    "ran": False,
    "last_query": "",
    "query_input": "",
    "res_single": None,
    "res_multi": None,
    "cmp_view": None,
}
for _k, _v in _DEFAULTS.items():
    if _k in st.session_state:
        continue
    st.session_state[_k] = _v
@st.cache_resource
def get_pipeline():
    """Return the graph produced by ``build_graph`` (wrapped in ``st.cache_resource``)."""
    pipeline = build_graph()
    return pipeline
# ── Chart builders ────────────────────────────────────────────────────────────
def _chart_layout(**kw):
    """Return a fresh layout dict: the shared ``_BASE_CHART`` settings overridden by ``kw``."""
    return {**_BASE_CHART, **kw}
def bar_two(labels, s_vals, m_vals, ymax=None):
    """Build a grouped bar chart with one "Single" and one "Multi" bar per label.

    Args:
        labels: x-axis category labels.
        s_vals: single-agent values, aligned with ``labels``.
        m_vals: multi-agent values, aligned with ``labels``.
        ymax: optional upper bound for the y axis; when falsy the range is
            left to Plotly.

    Returns:
        The configured ``go.Figure``.
    """

    def _value_labels(values):
        # Floats are printed with two decimals; anything else is stringified.
        return [f"{v:.2f}" if isinstance(v, float) else str(v) for v in values]

    fig = go.Figure()
    for series, values, colour in (("Single", s_vals, S_CLR), ("Multi", m_vals, M_CLR)):
        fig.add_bar(
            name=series, x=labels, y=values,
            marker_color=colour, marker_line_width=0, opacity=0.9,
            text=_value_labels(values),
            textposition="outside", textfont=dict(size=9),
        )

    y_range = [0, ymax] if ymax else None
    layout = _chart_layout(
        barmode="group", height=200, showlegend=True,
        legend=dict(orientation="h", y=1.2, x=0, font_size=9),
        xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
        yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR),
                   range=y_range),
    )
    fig.update_layout(**layout)
    return fig
def bar_pair(labels, vals, ymax=None):
    """Build a single-series bar chart whose bars use the Single/Multi colours in order.

    Args:
        labels: x-axis labels (the callers in this file pass ``["Single", "Multi"]``).
        vals: one value per label.
        ymax: optional upper bound for the y axis; when falsy the range is
            left to Plotly.

    Returns:
        The configured ``go.Figure``.
    """
    value_labels = []
    for v in vals:
        # Floats are printed with two decimals; anything else is stringified.
        value_labels.append(f"{v:.2f}" if isinstance(v, float) else str(v))

    bars = go.Bar(
        x=labels, y=vals,
        marker_color=[S_CLR, M_CLR], marker_line_width=0, opacity=0.9,
        text=value_labels,
        textposition="outside", textfont=dict(size=9),
    )
    fig = go.Figure(bars)

    y_range = [0, ymax] if ymax else None
    layout = _chart_layout(
        height=200, showlegend=False,
        xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
        yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR),
                   range=y_range),
    )
    fig.update_layout(**layout)
    return fig
def radar_5(s_rel, s_coh, s_comp, s_dep, s_no_halu,
            m_rel, m_coh, m_comp, m_dep, m_no_halu):
    """Build a five-axis radar chart comparing the single and multi-agent scores.

    The ``s_*`` arguments are the single-agent values and the ``m_*``
    arguments the multi-agent values, in axis order: relevance, coherence,
    completeness, depth, no-hallucination. The radial axis is fixed to 0-1.

    Returns:
        The configured ``go.Figure``.
    """
    cats = ["Relevance", "Coherence", "Completeness", "Depth", "No-Halluc."]
    # Repeat the first point at the end so each outline closes.
    theta = cats + cats[:1]
    series = (
        ("Single", [s_rel, s_coh, s_comp, s_dep, s_no_halu], S_CLR, S_FILL),
        ("Multi", [m_rel, m_coh, m_comp, m_dep, m_no_halu], M_CLR, M_FILL),
    )

    fig = go.Figure()
    for name, values, line_clr, fill_clr in series:
        fig.add_trace(go.Scatterpolar(
            r=values + values[:1], theta=theta, fill="toself", name=name,
            line=dict(color=line_clr, width=2), fillcolor=fill_clr,
        ))

    polar = dict(
        radialaxis=dict(visible=True, range=[0, 1], gridcolor=GR_CLR,
                        tickfont=dict(size=8, color=TK_CLR)),
        angularaxis=dict(gridcolor=GR_CLR),
        bgcolor="rgba(0,0,0,0)",
    )
    fig.update_layout(
        paper_bgcolor="rgba(0,0,0,0)", font=dict(size=10, color="#595959"),
        margin=dict(t=36, b=24, l=28, r=28), height=260,
        polar=polar,
        showlegend=True,
        legend=dict(orientation="h", y=-0.1, x=0.2, font_size=9),
    )
    return fig
def mini_radar_5(s_rel, s_halu_ok, m_rel, m_halu_ok):
    """Build the compact four-axis radar shown for a single live query.

    Args:
        s_rel, m_rel: relevance scores for the single / multi agent.
        s_halu_ok, m_halu_ok: truthy when the respective answer was judged
            free of hallucination.

    Only "Relevance" is plotted directly. "Trust" is 1 or 0.4 depending on the
    hallucination flag, "Speed-adj" is a fixed constant per agent, and
    "Coverage" is relevance plus a fixed per-agent offset, capped at 1.0.

    Returns:
        The configured ``go.Figure``.
    """
    cats = ["Relevance", "Trust", "Speed-adj", "Coverage"]
    theta = cats + cats[:1]

    def _axes(rel, halu_ok, speed, bonus):
        return [rel, 1 if halu_ok else 0.4, speed, min(rel + bonus, 1.0)]

    series = (
        ("Single", _axes(s_rel, s_halu_ok, 0.92, 0.04), S_CLR, S_FILL),
        ("Multi", _axes(m_rel, m_halu_ok, 0.15, 0.07), M_CLR, M_FILL),
    )

    fig = go.Figure()
    for name, values, line_clr, fill_clr in series:
        # Repeat the first point so the outline closes.
        fig.add_trace(go.Scatterpolar(
            r=values + values[:1], theta=theta, fill="toself",
            name=name, line=dict(color=line_clr, width=1.5), fillcolor=fill_clr,
        ))

    fig.update_layout(
        paper_bgcolor="rgba(0,0,0,0)", font=dict(size=9, color="#595959"),
        margin=dict(t=28, b=16, l=28, r=28), height=180,
        polar=dict(
            radialaxis=dict(visible=True, range=[0, 1], gridcolor=GR_CLR,
                            tickfont=dict(size=7, color=TK_CLR)),
            angularaxis=dict(gridcolor=GR_CLR),
            bgcolor="rgba(0,0,0,0)",
        ),
        showlegend=True,
        legend=dict(orientation="h", y=-0.15, x=0.15, font_size=8),
    )
    return fig
def trend_line(qs_data):
    """Build a filled line chart of relevance per query for both agents.

    Args:
        qs_data: list of benchmark rows, each with ``"single"`` and
            ``"multi"`` dicts; a missing ``"rel"`` is plotted as 0.

    Returns:
        The configured ``go.Figure`` with queries numbered from 1 on the x axis.
    """
    count = len(qs_data)
    idxs = list(range(1, count + 1))
    series = (
        ("single", "Single", S_CLR, S_FILL),
        ("multi", "Multi", M_CLR, M_FILL),
    )

    fig = go.Figure()
    for side, name, line_clr, fill_clr in series:
        scores = [row[side].get("rel", 0) for row in qs_data]
        fig.add_trace(go.Scatter(
            x=idxs, y=scores, mode="lines", name=name,
            line=dict(color=line_clr, width=2, shape="spline"),
            fill="tozeroy", fillcolor=fill_clr,
        ))

    layout = _chart_layout(
        height=210, showlegend=True,
        legend=dict(orientation="h", y=1.2, x=0, font_size=9),
        xaxis=dict(title="Query #", range=[1, count], gridcolor=GR_CLR,
                   tickfont=dict(size=9, color=TK_CLR)),
        yaxis=dict(title="Relevance", range=[0, 1.15], gridcolor=GR_CLR,
                   tickfont=dict(size=9, color=TK_CLR)),
    )
    fig.update_layout(**layout)
    return fig
def scatter_lat_rel(qs_data):
    """Build a latency-vs-relevance scatter plot with one marker per query per agent.

    Args:
        qs_data: list of benchmark rows, each with a ``"query"`` string and
            ``"single"`` / ``"multi"`` dicts; missing ``"lat"`` / ``"rel"``
            values are plotted as 0.

    Returns:
        The configured ``go.Figure`` (log-scaled latency on x, relevance on y).
    """

    def _short(query, limit=45):
        # Only mark the label as truncated when something was actually cut.
        return query[:limit] + "…" if len(query) > limit else query

    q_labels = [_short(q["query"]) for q in qs_data]
    series = (
        ("single", "Single", S_CLR),
        ("multi", "Multi", M_CLR),
    )

    fig = go.Figure()
    for side, name, colour in series:
        fig.add_trace(go.Scatter(
            x=[q[side].get("lat", 0) for q in qs_data],
            y=[q[side].get("rel", 0) for q in qs_data],
            mode="markers", name=name,
            marker=dict(color=colour, size=9, opacity=0.75,
                        line=dict(width=1, color="white")),
            text=q_labels,
            # Plotly hover text breaks lines with <br>; <extra> holds the
            # secondary (trace-name) box. Plain concatenation keeps the
            # %{...} placeholders out of f-string brace handling.
            hovertemplate="%{text}<br>Lat: %{x}s Rel: %{y}<extra>" + name + "</extra>",
        ))

    # NOTE(review): the x axis is logarithmic, so rows whose latency falls
    # back to 0 cannot be placed on it — confirm whether such rows can occur.
    fig.update_layout(**_chart_layout(
        height=210, showlegend=True,
        legend=dict(orientation="h", y=1.2, x=0, font_size=9),
        xaxis=dict(title="Latency (s)", type="log", gridcolor=GR_CLR,
                   tickfont=dict(size=9, color=TK_CLR)),
        yaxis=dict(title="Relevance", range=[0, 1.15], gridcolor=GR_CLR,
                   tickfont=dict(size=9, color=TK_CLR)),
    ))
    return fig
def gauge_duo(s_pct, m_pct):
    """Build two side-by-side percentage gauges for single and multi agents.

    Args:
        s_pct: single-agent percentage shown on the left gauge.
        m_pct: multi-agent percentage shown on the right gauge.

    Each gauge spans 0-100 with three coloured bands (0-10, 10-30, 30-100)
    and a red threshold line at 30.

    Returns:
        The configured ``go.Figure``.
    """
    bands = [
        dict(range=[0, 10], color="#f0fdf4"),
        dict(range=[10, 30], color="#fef9c3"),
        dict(range=[30, 100], color="#fef2f2"),
    ]
    gauges = (
        (s_pct, "Single Agent", S_CLR, [0, 0.44]),
        (m_pct, "Multi-Agent", M_CLR, [0.56, 1.0]),
    )

    fig = go.Figure()
    for value, caption, colour, x_span in gauges:
        dial = dict(
            axis=dict(range=[0, 100], tickwidth=1,
                      tickfont=dict(size=8), tickcolor="#d1d5db"),
            bar=dict(color=colour, thickness=0.28),
            bgcolor="white",
            steps=bands,
            threshold=dict(line=dict(color="red", width=3),
                           thickness=0.75, value=30),
        )
        fig.add_trace(go.Indicator(
            mode="gauge+number",
            value=value,
            number=dict(suffix="%", font=dict(size=22, color=colour)),
            title=dict(text=caption, font=dict(size=12, color="#374151")),
            gauge=dial,
            domain=dict(x=x_span, y=[0, 1]),
        ))

    fig.update_layout(
        paper_bgcolor="rgba(0,0,0,0)",
        font=dict(size=10, color="#595959"),
        margin=dict(t=40, b=10, l=20, r=20),
        height=200,
    )
    return fig
def category_chart(qs_data):
    """Build a horizontal grouped bar chart of average relevance per category.

    Args:
        qs_data: list of benchmark rows with ``"single"`` / ``"multi"`` dicts;
            a missing ``"rel"`` counts as 0.

    Returns:
        The configured ``go.Figure``, or ``None`` when fewer than 50 rows are
        available (the categories index rows 0-49).
    """
    total = len(qs_data)
    if total < 50:
        return None

    def _mean_rel(rows, side):
        return round(sum(r[side].get("rel", 0) for r in rows) / len(rows), 2)

    names, s_avgs, m_avgs = [], [], []
    for cat_name, idx_range in CATEGORIES:
        rows = [qs_data[i] for i in idx_range if i < total]
        if rows:
            names.append(cat_name)
            s_avgs.append(_mean_rel(rows, "single"))
            m_avgs.append(_mean_rel(rows, "multi"))

    fig = go.Figure()
    for series, averages, colour in (("Single", s_avgs, S_CLR), ("Multi", m_avgs, M_CLR)):
        fig.add_trace(go.Bar(
            name=series, y=names, x=averages, orientation="h",
            marker_color=colour, marker_line_width=0, opacity=0.88,
            text=averages, textposition="outside", textfont=dict(size=8),
        ))

    fig.update_layout(
        paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)",
        font=dict(size=9, color="#595959"),
        margin=dict(t=24, b=8, l=8, r=50), height=310,
        barmode="group", showlegend=True,
        legend=dict(orientation="h", y=1.08, x=0, font_size=9),
        xaxis=dict(range=[0, 1.15], gridcolor=GR_CLR,
                   tickfont=dict(size=8, color=TK_CLR)),
        yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
    )
    return fig
def win_donut(qs_data):
    """Build a donut chart of per-query relevance wins (multi, single, tie).

    Args:
        qs_data: list of benchmark rows with ``"single"`` / ``"multi"`` dicts;
            a missing ``"rel"`` counts as 0.

    Returns:
        The configured ``go.Figure``; the centre annotation shows
        ``<multi wins>/<total>`` above the word "Multi".
    """
    total = len(qs_data)
    multi_w = single_w = 0
    for q in qs_data:
        m_rel = q["multi"].get("rel", 0)
        s_rel = q["single"].get("rel", 0)
        if m_rel > s_rel:
            multi_w += 1
        elif s_rel > m_rel:
            single_w += 1
    ties = total - multi_w - single_w

    fig = go.Figure(go.Pie(
        labels=["Multi wins", "Single wins", "Tie"],
        values=[multi_w, single_w, ties],
        hole=0.62,
        marker=dict(colors=[M_CLR, S_CLR, "#d1d5db"],
                    line=dict(width=2, color="white")),
        textfont=dict(size=9),
        textinfo="label+percent",
        hovertemplate="%{label}: %{value} queries",
    ))
    fig.update_layout(
        paper_bgcolor="rgba(0,0,0,0)", font=dict(size=9, color="#595959"),
        margin=dict(t=24, b=24, l=20, r=20), height=210,
        showlegend=False,
        annotations=[dict(
            # Plotly annotation text uses <br> for a line break.
            text=f"{multi_w}/{total}<br>Multi",
            x=0.5, y=0.5, font_size=13, showarrow=False, font_color=M_CLR,
        )],
    )
    return fig
def latency_dist(qs_data):
    """Build a grouped bar chart counting queries per latency bucket for both agents.

    Args:
        qs_data: list of benchmark rows with ``"single"`` / ``"multi"`` dicts;
            a missing ``"lat"`` counts as 0.

    Returns:
        The configured ``go.Figure`` with buckets <2s, 2–5s, 5–10s, 10–60s, >60s.
    """
    buckets = ["<2s", "2–5s", "5–10s", "10–60s", ">60s"]
    # Exclusive upper bounds for every bucket except the last, open-ended one.
    upper_bounds = (2, 5, 10, 60)

    def _bucket_of(lat):
        for bound, name in zip(upper_bounds, buckets):
            if lat < bound:
                return name
        return buckets[-1]

    tallies = {
        "single": dict.fromkeys(buckets, 0),
        "multi": dict.fromkeys(buckets, 0),
    }
    for row in qs_data:
        for side, tally in tallies.items():
            tally[_bucket_of(row[side].get("lat", 0))] += 1

    fig = go.Figure()
    for side, name, colour in (("single", "Single", S_CLR), ("multi", "Multi", M_CLR)):
        fig.add_bar(
            name=name, x=buckets, y=[tallies[side][b] for b in buckets],
            marker_color=colour, marker_line_width=0, opacity=0.88,
        )

    layout = _chart_layout(
        barmode="group", height=210, showlegend=True,
        legend=dict(orientation="h", y=1.2, x=0, font_size=9),
        xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
        yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)),
    )
    fig.update_layout(**layout)
    return fig
# ── HTML helpers ──────────────────────────────────────────────────────────────
def _e(t): return html.escape(str(t))
def _badge(label, cls):
    """Return the HTML-escaped ``label`` text for a badge.

    NOTE(review): ``cls`` is accepted but does not appear in the returned
    string as the code stands — presumably a CSS class for wrapper markup
    that is absent from this copy; confirm against the original.
    """
    return f'{_e(label)}'
def _halu_badge(val):
    """Return the HTML-escaped hallucination verdict ``val``.

    NOTE(review): ``cls`` is computed but does not appear in the returned
    string as the code stands — presumably wrapper markup is absent from this
    copy; confirm against the original.
    """
    # Any verdict other than "No" or the empty string selects the "bad" class.
    cls = "ab-halu-bad" if val not in ("No", "") else "ab-halu-ok"
    return f'{_e(val)}'
def _step(label, state=""):
    """Return the HTML-escaped ``label`` for one pipeline step.

    NOTE(review): ``state`` is accepted but does not appear in the returned
    string as the code stands — presumably it selected a CSS class in wrapper
    markup that is absent from this copy; confirm against the original.
    """
    return f'{_e(label)}'
# Separator glyph that _multi_pipe places between consecutive pipeline steps.
# NOTE(review): only the bare character is returned here — any wrapper markup
# may be missing from this copy; confirm against the original.
def _arr(): return '›'
def _mini_bar(label, val, color="blue"):
    """Return the HTML for one small labelled score bar.

    NOTE(review): as the code stands the returned string is a single newline
    and ``label``, ``color``, ``w`` and ``disp`` are never used — the bar
    markup appears to be missing from this copy; confirm against the original.
    """
    # Values up to 1.0 are scaled by 100 for the width and shown with two
    # decimals; larger values are used as-is and shown without decimals.
    w = int(val * 100) if val <= 1.0 else int(val)
    disp = f"{val:.2f}" if val <= 1.0 else f"{val:.0f}"
    return f"""
"""
def _card_html(icon, title, badge_label, badge_cls, pipe_html,
               body_text, lat, rel, halu, winner=False,
               coherence=0, completeness=0, depth=0):
    """Return the HTML for one agent's result card.

    The card shows the icon and escaped title, a badge, the pre-rendered
    pipeline HTML, an escaped preview of ``body_text`` (first 500 characters,
    with an ellipsis when longer; a placeholder sentence when empty) and,
    when any score is truthy, four mini score bars.

    NOTE(review): ``lat``, ``halu`` and ``card_cls`` do not appear in the
    returned string as the code stands — the surrounding markup appears to be
    missing from this copy; confirm against the original.
    """
    preview = _e(body_text[:500]) + ("…" if len(body_text) > 500 else "")
    card_cls = "ab-card winner" if winner else "ab-card"
    mrow_html = ""
    # The score row is skipped entirely when every score is zero/falsy.
    if rel or coherence or completeness or depth:
        bar_clr = "blue" if winner else "orange"
        mrow_html = f"""
{_mini_bar("Rel", rel, bar_clr)}
{_mini_bar("Coh", coherence, bar_clr)}
{_mini_bar("Comp", completeness, bar_clr)}
{_mini_bar("Depth", depth, bar_clr)}
"""
    return f"""
{icon} {_e(title)}
{_badge(badge_label, badge_cls)}
{pipe_html}
{preview if body_text else 'Response will appear here…'}
{mrow_html}
"""
def _single_pipe(state="done"):
    """Return the one-step pipeline HTML ("Direct LLM call") for the single agent."""
    step_html = _step("Direct LLM call", state)
    return step_html
def _multi_pipe(states=None):
    """Return the four-step multi-agent pipeline HTML with separators between steps.

    Args:
        states: one state string per step (Planner, Research, Analyst,
            Writer); defaults to four empty strings.
    """
    if states is None:
        states = ["", "", "", ""]
    labels = ["Planner", "Research", "Analyst", "Writer"]
    last = len(labels) - 1
    # Every step except the one at the final label position is followed by a separator.
    pieces = [
        _step(name, state) + (_arr() if position < last else "")
        for position, (name, state) in enumerate(zip(labels, states))
    ]
    return "".join(pieces)
# ── Pipeline runner ───────────────────────────────────────────────────────────
# Status line shown while each pipeline node is reported by the stream.
NODE_LABELS = dict(
    planner="🧠 Planner decomposing query…",
    researcher="🔍 Researcher retrieving sources…",
    analyst="📊 Analyst processing findings…",
    writer="✍️ Writer synthesizing report…",
    memory="💾 Saving to memory…",
)

# Progress-bar percentage shown once each node has reported.
NODE_PROGRESS = {
    "planner": 22,
    "researcher": 46,
    "analyst": 68,
    "writer": 88,
    "memory": 100,
}
def _pbar_html(pct):
    """Return the HTML for the progress bar at ``pct`` percent.

    NOTE(review): as the code stands the returned string is empty and ``pct``
    is never used — the bar markup appears to be missing from this copy;
    confirm against the original.
    """
    return f''
def _run_multi(query, session_id, step_ph, prog_ph=None):
    """Stream the multi-agent pipeline and return the merged final state.

    Args:
        query: the user's question.
        session_id: identifier placed in the initial pipeline state.
        step_ph: placeholder that receives a status line per reported node.
        prog_ph: optional placeholder that receives the progress bar HTML.

    Returns:
        A dict merged from the data of every streamed update.
    """
    initial = {
        "query": query,
        "session_id": session_id,
        "plan": None,
        "research_text": None,
        "analysis": None,
        "report": None,
    }
    show_progress = prog_ph is not None
    final_state = {}

    for update in get_pipeline().stream(initial, stream_mode="updates"):
        # Only the first entry of each update is inspected.
        node_name, node_data = next(iter(update.items()))
        if node_data:
            final_state.update(node_data)
        step_ph.info(NODE_LABELS.get(node_name, f"⚙️ Running {node_name}…"))
        if show_progress:
            pct = NODE_PROGRESS.get(node_name, 50)
            prog_ph.markdown(_pbar_html(pct), unsafe_allow_html=True)

    step_ph.empty()
    if show_progress:
        # Brief pause before the progress bar is cleared.
        time.sleep(0.4)
        prog_ph.empty()
    return final_state
def _stream(ph, text, delay=0.012):
words = text.split()
buf = ""
for word in words:
buf += word + " "
ph.markdown(buf + "▋")
time.sleep(delay)
ph.markdown(buf.strip())
# ── Live query panel ──────────────────────────────────────────────────────────
def panel_live():
    """Render the "Live query" page.

    Draws the suggestion buttons and the query box. When the run button is
    clicked with a non-empty query, the single agent (``run_single_agent``)
    and then the multi-agent pipeline (``_run_multi``) are executed, each
    answer is scored with ``evaluate``, both results are stored in
    ``st.session_state`` (``res_single`` / ``res_multi``) and the script is
    rerun. Once both results are present, the comparison cards, per-query
    metric cards, charts and full-report buttons are rendered.

    NOTE(review): many string literals here are rendered with
    ``unsafe_allow_html=True`` yet contain only text, and some quoted
    literals are split across physical lines — the HTML markup appears to be
    missing from this copy; confirm against the original.
    """
    # A previous "clear" click asked for the text box to be emptied; do it
    # here, before the widget keyed "query_input" is created below.
    if st.session_state.pop("_reset_query", False):
        st.session_state.query_input = ""
    st.markdown("""
⚡ Live Query
Ask anything — both agents respond in real time
● Live
""", unsafe_allow_html=True)
    st.caption("SUGGESTIONS")
    # Two rows of three columns; only the first five cells get a button.
    r1, r2 = st.columns(3), st.columns(3)
    grid = [r1[0], r1[1], r1[2], r2[0], r2[1]]
    for col, sug in zip(grid, SUGGESTIONS):
        if col.button(sug, key=f"sug_{sug[:20]}", use_container_width=True):
            st.session_state.query_input = sug
    query = st.text_area("query", key="query_input",
                         placeholder="e.g. What is retrieval-augmented generation?",
                         height=80, max_chars=300, label_visibility="collapsed")
    c_col, b_col, x_col = st.columns([6, 3, 1])
    c_col.caption(f"{len(query)}/300")
    run_clicked = b_col.button("▶ Run both agents", type="primary", use_container_width=True)
    clear_clicked = x_col.button("✕", use_container_width=True, help="Clear")
    if clear_clicked:
        # Restore every default except the text box itself; its value is
        # cleared on the next run through the "_reset_query" flag.
        for k, v in _DEFAULTS.items():
            if k != "query_input":
                st.session_state[k] = v
        st.session_state["_reset_query"] = True
        st.rerun()
    if run_clicked and not query.strip():
        st.warning("Please enter a query before running.")
    prog_ph = st.empty()
    if run_clicked and query.strip():
        q = query.strip()
        # Drop any previous results before starting a new run.
        st.session_state.update({"last_query": q, "ran": True,
                                 "cmp_view": None, "res_single": None, "res_multi": None})
        prog_ph.markdown(_pbar_html(5), unsafe_allow_html=True)
        s_col, m_col = st.columns(2)
        with s_col:
            st.markdown(f"""
⚡ Single Agent{_badge("Running…","ab-run")}
{_single_pipe("active")}
""", unsafe_allow_html=True)
            s_ph = st.empty(); s_ph.caption("Thinking…")
        with m_col:
            st.markdown(f"""
🔬 Multi-Agent{_badge("Waiting…","ab-idle")}
{_multi_pipe()}
""", unsafe_allow_html=True)
            m_status_ph = st.empty(); m_ph = st.empty(); m_ph.caption("Waiting…")
        # Run single
        try:
            t0 = time.time()
            s_rpt, _ = run_single_agent(q)
            s_lat = round(time.time() - t0, 1)
            s_text = (s_rpt.body or s_rpt.title) if s_rpt else ""
            _stream(s_ph, s_text[:500] if s_text else "(no output)")
            # Scoring is best-effort: an empty answer or a failing evaluator
            # yields zero scores and a "Possible" hallucination verdict.
            try:
                ev = evaluate(q, s_text) if s_text else None
                s_rel, s_halu = (round(ev.relevance, 2), ev.hallucination) if ev else (0.0, "Possible")
                s_coh = round(ev.coherence, 2) if ev else 0.0
                s_comp = round(ev.completeness, 2) if ev else 0.0
                s_dep = round(ev.depth, 2) if ev else 0.0
            except Exception:
                s_rel, s_halu, s_coh, s_comp, s_dep = 0.0, "Possible", 0.0, 0.0, 0.0
            st.session_state.res_single = dict(
                text=s_text, lat=s_lat, words=len(s_text.split()), report=s_rpt,
                rel=s_rel, halu=s_halu, coherence=s_coh, completeness=s_comp, depth=s_dep)
        except Exception as exc:
            # Any agent failure aborts the whole run and clears the "ran" flag.
            s_ph.error(f"Single agent error: {exc}"); st.session_state.ran = False; return
        prog_ph.markdown(_pbar_html(18), unsafe_allow_html=True)
        # Run multi
        m_ph.caption("🔍 Pipeline running…")
        try:
            t1 = time.time()
            fs = _run_multi(q, str(uuid.uuid4()), m_status_ph, prog_ph)
            m_lat = round(time.time() - t1, 1)
            m_rpt = fs.get("report")
            m_text = (m_rpt.body or m_rpt.title) if m_rpt else ""
            _stream(m_ph, m_text[:500] if m_text else "(no output)")
            # NOTE(review): the fallback verdict here is "No", whereas the
            # single-agent fallback above is "Possible" — confirm the
            # asymmetry is intended.
            try:
                ev = evaluate(q, m_text) if m_text else None
                m_rel, m_halu = (round(ev.relevance, 2), ev.hallucination) if ev else (0.0, "No")
                m_coh = round(ev.coherence, 2) if ev else 0.0
                m_comp = round(ev.completeness, 2) if ev else 0.0
                m_dep = round(ev.depth, 2) if ev else 0.0
            except Exception:
                m_rel, m_halu, m_coh, m_comp, m_dep = 0.0, "No", 0.0, 0.0, 0.0
            st.session_state.res_multi = dict(
                text=m_text, lat=m_lat, words=len(m_text.split()), state=fs,
                rel=m_rel, halu=m_halu, coherence=m_coh, completeness=m_comp, depth=m_dep)
        except Exception as exc:
            m_ph.error(f"Multi-agent error: {exc}"); st.session_state.ran = False; return
        # Rerun so the stored results are drawn by the section below.
        st.rerun()
    if st.session_state.ran and st.session_state.res_single and st.session_state.res_multi:
        sr = st.session_state.res_single
        mr = st.session_state.res_multi
        s_col, m_col = st.columns(2)
        with s_col:
            st.markdown(_card_html(
                "⚡", "Single Agent", "Done", "ab-idle",
                _single_pipe("done"),
                sr["text"], f'{sr["lat"]}s', sr["rel"], sr["halu"],
                coherence=sr.get("coherence",0), completeness=sr.get("completeness",0),
                depth=sr.get("depth",0),
            ), unsafe_allow_html=True)
        # NOTE(review): the multi-agent card is always drawn with the winner
        # badge and winner=True, independent of the scores.
        with m_col:
            st.markdown(_card_html(
                "🔬", "Multi-Agent", "Winner 🏆", "ab-win",
                _multi_pipe(["done","done","done","done"]),
                mr["text"], f'{mr["lat"]}s', mr["rel"], mr["halu"],
                winner=True,
                coherence=mr.get("coherence",0), completeness=mr.get("completeness",0),
                depth=mr.get("depth",0),
            ), unsafe_allow_html=True)
        # ── Per-query metrics ─────────────────────────────────────────────────
        st.markdown("
", unsafe_allow_html=True)
        st.markdown("---")
        q_prev = _e(st.session_state.last_query[:50]) + ("…" if len(st.session_state.last_query) > 50 else "")
        st.markdown(f'**This query — breakdown** "{q_prev}"',
                    unsafe_allow_html=True)
        # Deltas are multi minus single.
        rel_d = round(mr["rel"] - sr["rel"], 2)
        d_str = (f"+{rel_d}" if rel_d >= 0 else str(rel_d))
        lat_d = round(mr["lat"] - sr["lat"], 1)
        multi_halu_ok = mr["halu"] == "No"
        single_halu_ok = sr["halu"] == "No"
        # NOTE(review): this banner text is fixed; it does not depend on the
        # values computed above.
        st.markdown(
            '👑 Multi-agent wins — lower hallucination, structured pipeline
',
            unsafe_allow_html=True)
        c1, c2, c3, c4, c5, c6 = st.columns(6)
        def qcard(col, label, val, sub, delta_cls=""):
            # Renders one metric card into ``col``; ``delta_cls`` is
            # interpolated directly after the value.
            col.markdown(f"""
{label}
{val}{delta_cls}
{sub}
""", unsafe_allow_html=True)
        qcard(c1, "Relevance (M)", mr["rel"], f"vs {sr['rel']} single",
              f'{d_str}')
        qcard(c2, "Hallucination", mr["halu"], f"vs {sr['halu']} single",
              f'Low' if multi_halu_ok else f'High')
        qcard(c3, "Coherence", mr.get("coherence",0), f"vs {sr.get('coherence',0)} single",
              f'{round(mr.get("coherence",0)-sr.get("coherence",0),2):+.2f}')
        qcard(c4, "Completeness", mr.get("completeness",0), f"vs {sr.get('completeness',0)} single",
              f'{round(mr.get("completeness",0)-sr.get("completeness",0),2):+.2f}')
        qcard(c5, "Depth", mr.get("depth",0), f"vs {sr.get('depth',0)} single",
              f'{round(mr.get("depth",0)-sr.get("depth",0),2):+.2f}')
        # NOTE(review): the "+" prefix is unconditional, so a negative
        # latency delta would render as "+-…".
        qcard(c6, "Latency (M)", f"{mr['lat']}s", f"vs {sr['lat']}s single",
              f'+{lat_d}s')
        cc1, cc2, cc3 = st.columns(3)
        with cc1:
            st.plotly_chart(bar_pair(["Single", "Multi"], [sr["rel"], mr["rel"]], ymax=1.0),
                            use_container_width=True)
        with cc2:
            st.plotly_chart(bar_pair(["Single", "Multi"], [sr["words"], mr["words"]]),
                            use_container_width=True)
        with cc3:
            st.plotly_chart(mini_radar_5(
                sr["rel"], single_halu_ok, mr["rel"], multi_halu_ok),
                use_container_width=True)
        st.markdown("---")
        st.markdown("**Read the full reports**")
        rb1, rb2 = st.columns(2)
        # The chosen view is kept in session state.
        if rb1.button("📄 Single Agent Report", use_container_width=True, key="btn_s"):
            st.session_state.cmp_view = "single"
        if rb2.button("🔬 Multi-Agent Pipeline Report", type="primary",
                      use_container_width=True, key="btn_m"):
            st.session_state.cmp_view = "multi"
        view = st.session_state.cmp_view
        if view == "single": _render_single(sr)
        elif view == "multi": _render_multi(mr)
def _render_single(sr):
    """Render the full single-agent report.

    Args:
        sr: the stored single-agent result dict; its ``"report"`` entry is
            rendered when truthy, otherwise the plain ``"text"`` is shown.
    """
    st.markdown("---")
    st.markdown('⚡ SINGLE AGENT',
                unsafe_allow_html=True)

    rpt = sr.get("report")
    if not rpt:
        # No structured report: fall back to the raw answer text.
        st.markdown(sr.get("text", ""))
        return

    st.markdown(f"### {rpt.title}")
    cols = st.columns(3)
    metrics = (
        ("Word Count", rpt.word_count),
        ("Sources", len(rpt.sources_cited)),
        ("Agent", "Single LLM"),
    )
    for col, (label, value) in zip(cols, metrics):
        col.metric(label, value)

    st.markdown("---")
    st.markdown(rpt.body or "")
    if rpt.sources_cited:
        with st.expander("📚 Sources"):
            for source in rpt.sources_cited:
                st.markdown(f"- {source}")
def _render_multi(mr):
    """Render the full multi-agent report with its plan and analyst insights.

    Args:
        mr: the stored multi-agent result dict; the ``"report"``, ``"plan"``
            and ``"analysis"`` entries of its ``"state"`` are rendered when
            a report is present, otherwise the plain ``"text"`` is shown.
    """
    state = mr.get("state", {})
    rpt = state.get("report")
    plan = state.get("plan")
    analysis = state.get("analysis")

    st.markdown("---")
    st.markdown('🔬 MULTI-AGENT PIPELINE',
                unsafe_allow_html=True)

    if not rpt:
        # No structured report: fall back to the raw answer text.
        st.markdown(mr.get("text", ""))
        return

    st.markdown(f"### {rpt.title}")
    cols = st.columns(4)
    metrics = (
        ("Word Count", rpt.word_count),
        ("Sources", len(rpt.sources_cited)),
        ("Confidence", analysis.confidence.upper() if analysis else "—"),
        ("Subtasks", len(plan.subtasks) if plan else "—"),
    )
    for col, (label, value) in zip(cols, metrics):
        col.metric(label, value)

    st.markdown("---")
    st.markdown(rpt.body or "")

    if rpt.sources_cited:
        with st.expander("📚 Sources"):
            for source in rpt.sources_cited:
                st.markdown(f"- {source}")
    if plan:
        with st.expander("🧠 Research Plan"):
            for task in plan.subtasks:
                st.markdown(f"- {task}")
            st.markdown("**Search queries:**")
            for search_query in plan.search_queries:
                st.code(search_query)
    if analysis:
        with st.expander("📊 Analyst Insights"):
            for insight in analysis.key_insights:
                st.markdown(f"- {insight}")
# ── Benchmark panel ───────────────────────────────────────────────────────────
def panel_bench():
    """Render the "Benchmarks" page.

    Loads the benchmark file, shows a hint when it has no ``summary``, then
    renders two rows of metric cards, an insight line, three rows of charts
    and a tabbed side-by-side comparison of the first ten queries. Every
    summary metric has a built-in fallback value used when its key is absent.

    NOTE(review): many string literals here are rendered with
    ``unsafe_allow_html=True`` yet contain only text, and several quoted
    literals are split across physical lines — the HTML markup appears to be
    missing from this copy; confirm against the original.
    """
    st.markdown("""
📊 Benchmarks
50 queries × 5 metrics — LLM-as-judge evaluation
50 queries
""", unsafe_allow_html=True)
    bench_data = _load_bench()
    summary = bench_data.get("summary", {})
    qs_all = bench_data.get("queries", [])
    has_real = bool(summary)
    if not has_real:
        st.info("Run `python bench_runner.py` to generate real benchmark data.", icon="ℹ️")
    # Pull metrics
    # Rates and success values are stored as fractions and shown as whole percentages.
    s_rel = summary.get("s_avg_rel", 0.877)
    m_rel = summary.get("m_avg_rel", 0.850)
    s_hpct = round(summary.get("s_halu_rate", 0.24) * 100)
    m_hpct = round(summary.get("m_halu_rate", 0.04) * 100)
    s_lat = summary.get("s_avg_lat", 5.5)
    m_lat = summary.get("m_avg_lat", 307.0)
    s_succ = round(summary.get("s_success", 0.78) * 100)
    m_succ = round(summary.get("m_success", 0.78) * 100)
    s_coh = summary.get("s_avg_coherence", 0.912)
    m_coh = summary.get("m_avg_coherence", 0.900)
    s_comp = summary.get("s_avg_completeness", 0.798)
    m_comp = summary.get("m_avg_completeness", 0.720)
    s_dep = summary.get("s_avg_depth", 0.658)
    m_dep = summary.get("m_avg_depth", 0.590)
    total_q = summary.get("total", len(qs_all))
    # ── Row 1 metric cards ────────────────────────────────────────────────────
    st.markdown('KEY METRICS
', unsafe_allow_html=True)
    a1, a2, a3, a4 = st.columns(4)
    def mc(col, label, val_str, delta_str, delta_cls, vs_str, bar_pct, bar_cls, card_cls, delay="0s"):
        # Renders one metric card into ``col``.
        # NOTE(review): delta_cls, bar_pct, bar_cls, card_cls and delay are not
        # referenced by the template as it stands.
        col.markdown(f"""
{label}
{val_str}{delta_str}
{vs_str}
""", unsafe_allow_html=True)
    # NOTE(review): wording such as "6× improvement", "tied with single" and
    # "SINGLE WINS" in the cards below is fixed text, not derived from the
    # loaded values.
    mc(a1, "HALLUCINATION — MULTI", f"{m_hpct}%",
       f"-{s_hpct - m_hpct}pp", "ab-mdelta-g",
       f"vs {s_hpct}% single · 6× improvement",
       m_hpct, "green", "green", "0s")
    mc(a2, "AVG RELEVANCE — MULTI", str(m_rel),
       f"{round(m_rel-s_rel,3):+.3f}", "ab-mdelta-r" if m_rel < s_rel else "ab-mdelta-g",
       f"vs {s_rel} single",
       int(m_rel * 100), "blue", "blue", "0.06s")
    mc(a3, "SUCCESS RATE", f"{m_succ}%",
       f"{m_succ - s_succ:+d}pp", "ab-mdelta-n",
       f"tied with single at {s_succ}%",
       m_succ, "purple", "purple", "0.12s")
    # The ratio is replaced by "?" when the single-agent latency is zero/falsy.
    mc(a4, "AVG LATENCY — MULTI", f"{m_lat:.0f}s",
       f"+{round(m_lat/s_lat,1) if s_lat else '?'}×", "ab-mdelta-r",
       f"vs {s_lat}s single",
       min(int(m_lat / 400 * 100), 100), "orange", "orange", "0.18s")
    b1, b2, b3, b4 = st.columns(4)
    mc(b1, "COHERENCE — SINGLE WINS", str(s_coh),
       f"vs {m_coh} multi", "ab-mdelta-n",
       "Single more structured",
       int(s_coh * 100), "orange", "orange", "0.24s")
    mc(b2, "COMPLETENESS — SINGLE WINS", str(s_comp),
       f"vs {m_comp} multi", "ab-mdelta-n",
       "Single covers more sub-topics",
       int(s_comp * 100), "orange", "orange", "0.30s")
    mc(b3, "DEPTH — SINGLE WINS", str(s_dep),
       f"vs {m_dep} multi", "ab-mdelta-n",
       "Single goes deeper technically",
       int(s_dep * 100), "orange", "orange", "0.36s")
    mc(b4, "QUERIES EVALUATED", str(total_q),
       "", "ab-mdelta-n",
       "5 metrics per query",
       100, "blue", "gray", "0.42s")
    # ── Insight bar ───────────────────────────────────────────────────────────
    st.markdown(f"""
👑 Multi-agent wins on hallucination ({m_hpct}% vs {s_hpct}% — 6× safer).
Single agent wins on coherence, completeness, and depth — a real tradeoff, not just latency.
{'(LLM-as-judge evaluated · llama-3.1-8b-instant)' if has_real else ''}
""", unsafe_allow_html=True)
    # ── Chart Row 1: Radar + Category ─────────────────────────────────────────
    rc1, rc2 = st.columns(2)
    with rc1:
        st.markdown('5-Metric Radar
'
'
Normalised 0–1, higher = better across all dimensions
',
        unsafe_allow_html=True)
        # The radar plots "no hallucination", i.e. one minus the hallucination rate.
        s_no_h = round(1 - summary.get("s_halu_rate", 0.24), 2)
        m_no_h = round(1 - summary.get("m_halu_rate", 0.04), 2)
        st.plotly_chart(radar_5(
            s_rel, s_coh, s_comp, s_dep, s_no_h,
            m_rel, m_coh, m_comp, m_dep, m_no_h,
        ), use_container_width=True)
    with rc2:
        # category_chart returns None when fewer than 50 queries are loaded;
        # the summary bar chart is shown instead in that case.
        fig_cat = category_chart(qs_all)
        if fig_cat:
            st.markdown('Category Breakdown
'
'
Avg relevance by topic — 5 queries per category
',
            unsafe_allow_html=True)
            st.plotly_chart(fig_cat, use_container_width=True)
        else:
            st.markdown('5-Metric Quality
'
'
Higher is better (0–1)
',
            unsafe_allow_html=True)
            st.plotly_chart(bar_two(
                ["Rel", "Coherence", "Completeness", "Depth"],
                [s_rel, s_coh, s_comp, s_dep],
                [m_rel, m_coh, m_comp, m_dep],
                ymax=1.0,
            ), use_container_width=True)
    # ── Chart Row 2: Trend + Scatter + Win donut ──────────────────────────────
    rd1, rd2, rd3 = st.columns([2, 2, 1.4])
    with rd1:
        st.markdown('Relevance Trend
'
'
Query-by-query score progression
',
        unsafe_allow_html=True)
        if qs_all:
            st.plotly_chart(trend_line(qs_all), use_container_width=True)
        else:
            st.info("Run benchmark to see trend data.")
    with rd2:
        st.markdown('Latency vs Relevance
'
'
Each dot = one query (hover for details)
',
        unsafe_allow_html=True)
        if qs_all:
            st.plotly_chart(scatter_lat_rel(qs_all), use_container_width=True)
        else:
            st.info("Run benchmark to see scatter data.")
    with rd3:
        st.markdown('Win Distribution
'
'
Who scored higher per query
',
        unsafe_allow_html=True)
        if qs_all:
            st.plotly_chart(win_donut(qs_all), use_container_width=True)
        else:
            st.info("Run benchmark first.")
    # ── Chart Row 3: Gauges + Latency dist ───────────────────────────────────
    re1, re2 = st.columns(2)
    with re1:
        st.markdown('Hallucination Gauge
'
'
Lower is better — green zone is target
',
        unsafe_allow_html=True)
        st.plotly_chart(gauge_duo(s_hpct, m_hpct), use_container_width=True)
    with re2:
        st.markdown('Latency Distribution
'
'
Query response time buckets
',
        unsafe_allow_html=True)
        if qs_all:
            st.plotly_chart(latency_dist(qs_all), use_container_width=True)
        else:
            st.info("Run benchmark first.")
    # ── Response comparison tabs ──────────────────────────────────────────────
    st.markdown("---")
    st.markdown('RESPONSE COMPARISON — FIRST 10 QUERIES
',
    unsafe_allow_html=True)
    bench_tabs = _bench_tabs_data()
    if not bench_tabs:
        st.info("Run `python bench_runner.py` to populate comparisons.")
        return
    tabs = st.tabs([bq["label"] for bq in bench_tabs])
    for tab, bq in zip(tabs, bench_tabs):
        with tab:
            # NOTE(review): this rebinds the local name ``mc`` (the card
            # helper defined above) to a column; all mc(...) calls happen
            # before this loop.
            sc, mc = st.columns(2)
            s, m = bq["s"], bq["m"]
            def resp_card(s_data, is_winner):
                # Builds one response card; ``is_winner`` selects the
                # multi-agent icon, name, pipeline and bar colour.
                # NOTE(review): ``h_cls`` and ``card`` are not referenced by
                # the template as it stands.
                h_cls = "ab-halu-bad" if s_data["halu"] == "Yes" else "ab-halu-ok"
                icon = "🔬" if is_winner else "⚡"
                name = "Multi-Agent" if is_winner else "Single Agent"
                pipe = (_multi_pipe(["done","done","done","done"]) if is_winner
                        else _step("Direct LLM call", "done"))
                card = "ab-resp-card winner" if is_winner else "ab-resp-card"
                mrow = ""
                # The score row is shown only when at least one of
                # coherence/completeness/depth is non-zero.
                if any(s_data.get(k, 0) for k in ["coherence","completeness","depth"]):
                    clr = "blue" if is_winner else "orange"
                    mrow = f"""
{_mini_bar("Rel", s_data.get("rel",0), clr)}
{_mini_bar("Coh", s_data.get("coherence",0), clr)}
{_mini_bar("Comp", s_data.get("completeness",0), clr)}
{_mini_bar("Depth", s_data.get("depth",0), clr)}
"""
                return f"""
{icon} {name}
{_e(s_data["halu"])}
{pipe}
{_e(s_data["text"])}
{mrow}
"""
            with sc: st.markdown(resp_card(s, False), unsafe_allow_html=True)
            with mc: st.markdown(resp_card(m, True), unsafe_allow_html=True)
# ── Sidebar ───────────────────────────────────────────────────────────────────
def sidebar_nav() -> str:
    """Render the sidebar and return the page chosen in its radio control.

    Returns:
        ``"Live query"`` or ``"Benchmarks"``.

    NOTE(review): the HTML string literals here contain only text (two are
    empty) — the markup appears to be missing from this copy; confirm against
    the original.
    """
    bench_data = _load_bench()
    summary = bench_data.get("summary", {})
    # Without a summary the stats fall back to 4 (%) and 0 queries.
    m_hpct = round(summary.get("m_halu_rate", 0.04) * 100) if summary else 4
    total_q = summary.get("total", 0) if summary else 0
    with st.sidebar:
        st.markdown(f"""
⚡
AgentBench
Multi-agent eval
""", unsafe_allow_html=True)
        st.markdown("", unsafe_allow_html=True)
        page = st.radio("nav", options=["Live query", "Benchmarks"], label_visibility="collapsed")
        st.markdown("", unsafe_allow_html=True)
        st.markdown("---")
        # Model info
        st.markdown(f"""
Model
llama-3.3-70b
Provider
Tavily Search · Groq
""", unsafe_allow_html=True)
        # The stats section is shown only when the summary reports a positive total.
        if total_q > 0:
            st.markdown("", unsafe_allow_html=True)
            st.markdown("---")
            st.markdown(f"""
Benchmark Stats
Queries evaluated{total_q}
Multi halluc. rate{m_hpct}%
Metrics per query5
""", unsafe_allow_html=True)
    return page
# ── Entry ─────────────────────────────────────────────────────────────────────
def main():
    """Draw the sidebar, then render whichever page it reports as selected."""
    # "Live query" maps to the live panel; any other selection shows benchmarks.
    renderers = {"Live query": panel_live}
    render = renderers.get(sidebar_nav(), panel_bench)
    render()


if __name__ == "__main__":
    main()