""" AgentBench — Multi-Agent Evaluation Dashboard (Streamlit) """ import html import json import os import time import uuid import plotly.graph_objects as go import streamlit as st from agents.single_agent import run_single_agent from graph import build_graph from evaluator import evaluate st.set_page_config( page_title="AgentBench — Multi-Agent Evaluation", page_icon="⚡", layout="wide", initial_sidebar_state="expanded", ) # ── CSS ─────────────────────────────────────────────────────────────────────── st.markdown(""" """, unsafe_allow_html=True) # ── Static data ─────────────────────────────────────────────────────────────── SUGGESTIONS = [ "What is retrieval-augmented generation?", "Explain the attention mechanism in transformers", "When should I use LangGraph over LangChain?", "How does LoRA reduce fine-tuning costs?", "What is data drift in machine learning?", ] CATEGORIES = [ ("GenAI Concepts", range(0, 5)), ("Agentic AI", range(5, 10)), ("Fine-tuning", range(10, 15)), ("Architectures", range(15, 20)), ("Retrieval", range(20, 25)), ("Eval & Safety", range(25, 30)), ("ML Fundamentals", range(30, 35)), ("Efficient ML", range(35, 40)), ("Applied AI", range(40, 45)), ("Trends & Future", range(45, 50)), ] _BENCH_FILE = os.path.join(os.path.dirname(__file__), "bench_results.json") S_CLR = "#f97316" M_CLR = "#2563eb" S_FILL = "rgba(249,115,22,0.12)" M_FILL = "rgba(37,99,235,0.12)" GR_CLR = "rgba(0,0,0,0.05)" TK_CLR = "rgba(0,0,0,0.40)" _BASE_CHART = dict( paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor ="rgba(0,0,0,0)", font=dict(size=10, color="#595959"), margin=dict(t=30, b=8, l=8, r=8), ) # ── Data helpers ────────────────────────────────────────────────────────────── def _load_bench() -> dict: if os.path.exists(_BENCH_FILE): try: with open(_BENCH_FILE) as f: return json.load(f) except Exception: pass return {} def _bench_tabs_data() -> list: data = _load_bench() tabs = [] for row in data.get("queries", [])[:10]: label = row["query"][:30].rstrip() + "…" if len(row["query"]) > 30 else row["query"] tabs.append({ "label": label, "query": row["query"], "s": { "text": row["single"]["text"], "lat": f'{row["single"]["lat"]}s', "rel": row["single"].get("rel", 0), "halu": row["single"].get("halu", "?"), "coherence": row["single"].get("coherence", 0), "completeness":row["single"].get("completeness", 0), "depth": row["single"].get("depth", 0), }, "m": { "text": row["multi"]["text"], "lat": f'{row["multi"]["lat"]}s', "rel": row["multi"].get("rel", 0), "halu": row["multi"].get("halu", "?"), "coherence": row["multi"].get("coherence", 0), "completeness":row["multi"].get("completeness", 0), "depth": row["multi"].get("depth", 0), }, }) return tabs # ── Session state ───────────────────────────────────────────────────────────── _DEFAULTS = { "ran": False, "last_query": "", "query_input": "", "res_single": None, "res_multi": None, "cmp_view": None, } for _k, _v in _DEFAULTS.items(): if _k not in st.session_state: st.session_state[_k] = _v @st.cache_resource def get_pipeline(): return build_graph() # ── Chart builders ──────────────────────────────────────────────────────────── def _chart_layout(**kw): d = dict(**_BASE_CHART) d.update(kw) return d def bar_two(labels, s_vals, m_vals, ymax=None): fig = go.Figure() fig.add_bar(name="Single", x=labels, y=s_vals, marker_color=S_CLR, marker_line_width=0, opacity=0.9, text=[f"{v:.2f}" if isinstance(v, float) else str(v) for v in s_vals], textposition="outside", textfont=dict(size=9)) fig.add_bar(name="Multi", x=labels, y=m_vals, marker_color=M_CLR, marker_line_width=0, opacity=0.9, text=[f"{v:.2f}" if isinstance(v, float) else str(v) for v in m_vals], textposition="outside", textfont=dict(size=9)) fig.update_layout(**_chart_layout( barmode="group", height=200, showlegend=True, legend=dict(orientation="h", y=1.2, x=0, font_size=9), xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR), range=[0, ymax] if ymax else None), )) return fig def bar_pair(labels, vals, ymax=None): fig = go.Figure(go.Bar( x=labels, y=vals, marker_color=[S_CLR, M_CLR], marker_line_width=0, opacity=0.9, text=[f"{v:.2f}" if isinstance(v, float) else str(v) for v in vals], textposition="outside", textfont=dict(size=9), )) fig.update_layout(**_chart_layout( height=200, showlegend=False, xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR), range=[0, ymax] if ymax else None), )) return fig def radar_5(s_rel, s_coh, s_comp, s_dep, s_no_halu, m_rel, m_coh, m_comp, m_dep, m_no_halu): cats = ["Relevance", "Coherence", "Completeness", "Depth", "No-Halluc."] s_vals = [s_rel, s_coh, s_comp, s_dep, s_no_halu] m_vals = [m_rel, m_coh, m_comp, m_dep, m_no_halu] closed = cats + [cats[0]] fig = go.Figure() fig.add_trace(go.Scatterpolar( r=s_vals + [s_vals[0]], theta=closed, fill="toself", name="Single", line=dict(color=S_CLR, width=2), fillcolor=S_FILL, )) fig.add_trace(go.Scatterpolar( r=m_vals + [m_vals[0]], theta=closed, fill="toself", name="Multi", line=dict(color=M_CLR, width=2), fillcolor=M_FILL, )) fig.update_layout( paper_bgcolor="rgba(0,0,0,0)", font=dict(size=10, color="#595959"), margin=dict(t=36, b=24, l=28, r=28), height=260, polar=dict( radialaxis=dict(visible=True, range=[0,1], gridcolor=GR_CLR, tickfont=dict(size=8, color=TK_CLR)), angularaxis=dict(gridcolor=GR_CLR), bgcolor="rgba(0,0,0,0)", ), showlegend=True, legend=dict(orientation="h", y=-0.1, x=0.2, font_size=9), ) return fig def mini_radar_5(s_rel, s_halu_ok, m_rel, m_halu_ok): cats = ["Relevance", "Trust", "Speed-adj", "Coverage"] s_v = [s_rel, 1 if s_halu_ok else 0.4, 0.92, min(s_rel + 0.04, 1.0)] m_v = [m_rel, 1 if m_halu_ok else 0.4, 0.15, min(m_rel + 0.07, 1.0)] closed = cats + [cats[0]] fig = go.Figure() fig.add_trace(go.Scatterpolar(r=s_v+[s_v[0]], theta=closed, fill="toself", name="Single", line=dict(color=S_CLR, width=1.5), fillcolor=S_FILL)) fig.add_trace(go.Scatterpolar(r=m_v+[m_v[0]], theta=closed, fill="toself", name="Multi", line=dict(color=M_CLR, width=1.5), fillcolor=M_FILL)) fig.update_layout( paper_bgcolor="rgba(0,0,0,0)", font=dict(size=9, color="#595959"), margin=dict(t=28, b=16, l=28, r=28), height=180, polar=dict( radialaxis=dict(visible=True, range=[0,1], gridcolor=GR_CLR, tickfont=dict(size=7, color=TK_CLR)), angularaxis=dict(gridcolor=GR_CLR), bgcolor="rgba(0,0,0,0)", ), showlegend=True, legend=dict(orientation="h", y=-0.15, x=0.15, font_size=8), ) return fig def trend_line(qs_data): idxs = list(range(1, len(qs_data) + 1)) s_rels = [q["single"].get("rel", 0) for q in qs_data] m_rels = [q["multi"].get("rel", 0) for q in qs_data] fig = go.Figure() fig.add_trace(go.Scatter(x=idxs, y=s_rels, mode="lines", name="Single", line=dict(color=S_CLR, width=2, shape="spline"), fill="tozeroy", fillcolor=S_FILL)) fig.add_trace(go.Scatter(x=idxs, y=m_rels, mode="lines", name="Multi", line=dict(color=M_CLR, width=2, shape="spline"), fill="tozeroy", fillcolor=M_FILL)) fig.update_layout(**_chart_layout( height=210, showlegend=True, legend=dict(orientation="h", y=1.2, x=0, font_size=9), xaxis=dict(title="Query #", range=[1, len(idxs)], gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), yaxis=dict(title="Relevance", range=[0, 1.15], gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), )) return fig def scatter_lat_rel(qs_data): s_x = [q["single"].get("lat", 0) for q in qs_data] s_y = [q["single"].get("rel", 0) for q in qs_data] m_x = [q["multi"].get("lat", 0) for q in qs_data] m_y = [q["multi"].get("rel", 0) for q in qs_data] q_labels = [q["query"][:45] + "…" for q in qs_data] fig = go.Figure() fig.add_trace(go.Scatter(x=s_x, y=s_y, mode="markers", name="Single", marker=dict(color=S_CLR, size=9, opacity=0.75, line=dict(width=1, color="white")), text=q_labels, hovertemplate="%{text}
Lat: %{x}s Rel: %{y}Single")) fig.add_trace(go.Scatter(x=m_x, y=m_y, mode="markers", name="Multi", marker=dict(color=M_CLR, size=9, opacity=0.75, line=dict(width=1, color="white")), text=q_labels, hovertemplate="%{text}
Lat: %{x}s Rel: %{y}Multi")) fig.update_layout(**_chart_layout( height=210, showlegend=True, legend=dict(orientation="h", y=1.2, x=0, font_size=9), xaxis=dict(title="Latency (s)", type="log", gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), yaxis=dict(title="Relevance", range=[0, 1.15], gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), )) return fig def gauge_duo(s_pct, m_pct): fig = go.Figure() steps_green = [ {"range": [0, 10], "color": "#f0fdf4"}, {"range": [10, 30], "color": "#fef9c3"}, {"range": [30, 100],"color": "#fef2f2"}, ] for val, label, clr, domain in [ (s_pct, "Single Agent", S_CLR, [0, 0.44]), (m_pct, "Multi-Agent", M_CLR, [0.56, 1.0]), ]: fig.add_trace(go.Indicator( mode="gauge+number", value=val, number={"suffix": "%", "font": {"size": 22, "color": clr}}, title={"text": label, "font": {"size": 12, "color": "#374151"}}, gauge={ "axis": {"range": [0, 100], "tickwidth": 1, "tickfont": {"size": 8}, "tickcolor": "#d1d5db"}, "bar": {"color": clr, "thickness": 0.28}, "bgcolor": "white", "steps": steps_green, "threshold": {"line": {"color": "red", "width": 3}, "thickness": 0.75, "value": 30}, }, domain={"x": domain, "y": [0, 1]}, )) fig.update_layout( paper_bgcolor="rgba(0,0,0,0)", font=dict(size=10, color="#595959"), margin=dict(t=40, b=10, l=20, r=20), height=200, ) return fig def category_chart(qs_data): if len(qs_data) < 50: return None names, s_avgs, m_avgs = [], [], [] for cat_name, idx_range in CATEGORIES: chunk = [qs_data[i] for i in idx_range if i < len(qs_data)] if not chunk: continue names.append(cat_name) s_avgs.append(round(sum(q["single"].get("rel", 0) for q in chunk) / len(chunk), 2)) m_avgs.append(round(sum(q["multi"].get("rel", 0) for q in chunk) / len(chunk), 2)) fig = go.Figure() fig.add_trace(go.Bar(name="Single", y=names, x=s_avgs, orientation="h", marker_color=S_CLR, marker_line_width=0, opacity=0.88, text=s_avgs, textposition="outside", textfont=dict(size=8))) fig.add_trace(go.Bar(name="Multi", y=names, x=m_avgs, orientation="h", marker_color=M_CLR, marker_line_width=0, opacity=0.88, text=m_avgs, textposition="outside", textfont=dict(size=8))) fig.update_layout( paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)", font=dict(size=9, color="#595959"), margin=dict(t=24, b=8, l=8, r=50), height=310, barmode="group", showlegend=True, legend=dict(orientation="h", y=1.08, x=0, font_size=9), xaxis=dict(range=[0, 1.15], gridcolor=GR_CLR, tickfont=dict(size=8, color=TK_CLR)), yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), ) return fig def win_donut(qs_data): multi_w = sum(1 for q in qs_data if q["multi"].get("rel", 0) > q["single"].get("rel", 0)) single_w = sum(1 for q in qs_data if q["single"].get("rel", 0) > q["multi"].get("rel", 0)) ties = len(qs_data) - multi_w - single_w total = len(qs_data) fig = go.Figure(go.Pie( labels=["Multi wins", "Single wins", "Tie"], values=[multi_w, single_w, ties], hole=0.62, marker=dict(colors=[M_CLR, S_CLR, "#d1d5db"], line=dict(width=2, color="white")), textfont=dict(size=9), textinfo="label+percent", hovertemplate="%{label}: %{value} queries", )) fig.update_layout( paper_bgcolor="rgba(0,0,0,0)", font=dict(size=9, color="#595959"), margin=dict(t=24, b=24, l=20, r=20), height=210, showlegend=False, annotations=[dict( text=f"{multi_w}/{total}
Multi", x=0.5, y=0.5, font_size=13, showarrow=False, font_color=M_CLR, )], ) return fig def latency_dist(qs_data): buckets = ["<2s", "2–5s", "5–10s", "10–60s", ">60s"] def bucket(lat): if lat < 2: return "<2s" if lat < 5: return "2–5s" if lat < 10: return "5–10s" if lat < 60: return "10–60s" return ">60s" s_b = {b: 0 for b in buckets} m_b = {b: 0 for b in buckets} for q in qs_data: s_b[bucket(q["single"].get("lat", 0))] += 1 m_b[bucket(q["multi"].get("lat", 0))] += 1 fig = go.Figure() fig.add_bar(name="Single", x=buckets, y=[s_b[b] for b in buckets], marker_color=S_CLR, marker_line_width=0, opacity=0.88) fig.add_bar(name="Multi", x=buckets, y=[m_b[b] for b in buckets], marker_color=M_CLR, marker_line_width=0, opacity=0.88) fig.update_layout(**_chart_layout( barmode="group", height=210, showlegend=True, legend=dict(orientation="h", y=1.2, x=0, font_size=9), xaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), yaxis=dict(gridcolor=GR_CLR, tickfont=dict(size=9, color=TK_CLR)), )) return fig # ── HTML helpers ────────────────────────────────────────────────────────────── def _e(t): return html.escape(str(t)) def _badge(label, cls): return f'{_e(label)}' def _halu_badge(val): cls = "ab-halu-bad" if val not in ("No", "") else "ab-halu-ok" return f'{_e(val)}' def _step(label, state=""): return f'{_e(label)}' def _arr(): return '' def _mini_bar(label, val, color="blue"): w = int(val * 100) if val <= 1.0 else int(val) disp = f"{val:.2f}" if val <= 1.0 else f"{val:.0f}" return f"""
{_e(label)}
{disp}
""" def _card_html(icon, title, badge_label, badge_cls, pipe_html, body_text, lat, rel, halu, winner=False, coherence=0, completeness=0, depth=0): preview = _e(body_text[:500]) + ("…" if len(body_text) > 500 else "") card_cls = "ab-card winner" if winner else "ab-card" mrow_html = "" if rel or coherence or completeness or depth: bar_clr = "blue" if winner else "orange" mrow_html = f"""
{_mini_bar("Rel", rel, bar_clr)} {_mini_bar("Coh", coherence, bar_clr)} {_mini_bar("Comp", completeness, bar_clr)} {_mini_bar("Depth", depth, bar_clr)}
""" return f"""
{icon} {_e(title)} {_badge(badge_label, badge_cls)}
{pipe_html}
{preview if body_text else 'Response will appear here…'}
{mrow_html}
Latency {_e(str(lat))} Relevance {_e(str(rel))} Hallucination {_halu_badge(halu)}
""" def _single_pipe(state="done"): return _step("Direct LLM call", state) def _multi_pipe(states=None): if states is None: states = ["", "", "", ""] labels = ["Planner", "Research", "Analyst", "Writer"] parts = [] for i, (lbl, st_) in enumerate(zip(labels, states)): parts.append(_step(lbl, st_)) if i < len(labels) - 1: parts.append(_arr()) return "".join(parts) # ── Pipeline runner ─────────────────────────────────────────────────────────── NODE_LABELS = { "planner": "🧠 Planner decomposing query…", "researcher": "🔍 Researcher retrieving sources…", "analyst": "📊 Analyst processing findings…", "writer": "✍️ Writer synthesizing report…", "memory": "💾 Saving to memory…", } NODE_PROGRESS = {"planner": 22, "researcher": 46, "analyst": 68, "writer": 88, "memory": 100} def _pbar_html(pct): return f'
' def _run_multi(query, session_id, step_ph, prog_ph=None): pipeline = get_pipeline() initial = {"query": query, "session_id": session_id, "plan": None, "research_text": None, "analysis": None, "report": None} final_state = {} for update in pipeline.stream(initial, stream_mode="updates"): node_name = next(iter(update)) node_data = update[node_name] if node_data: final_state.update(node_data) step_ph.info(NODE_LABELS.get(node_name, f"⚙️ Running {node_name}…")) if prog_ph is not None: prog_ph.markdown(_pbar_html(NODE_PROGRESS.get(node_name, 50)), unsafe_allow_html=True) step_ph.empty() if prog_ph is not None: time.sleep(0.4) prog_ph.empty() return final_state def _stream(ph, text, delay=0.012): words = text.split() buf = "" for word in words: buf += word + " " ph.markdown(buf + "▋") time.sleep(delay) ph.markdown(buf.strip()) # ── Live query panel ────────────────────────────────────────────────────────── def panel_live(): if st.session_state.pop("_reset_query", False): st.session_state.query_input = "" st.markdown("""
⚡ Live Query
Ask anything — both agents respond in real time
● Live
""", unsafe_allow_html=True) st.caption("SUGGESTIONS") r1, r2 = st.columns(3), st.columns(3) grid = [r1[0], r1[1], r1[2], r2[0], r2[1]] for col, sug in zip(grid, SUGGESTIONS): if col.button(sug, key=f"sug_{sug[:20]}", use_container_width=True): st.session_state.query_input = sug query = st.text_area("query", key="query_input", placeholder="e.g. What is retrieval-augmented generation?", height=80, max_chars=300, label_visibility="collapsed") c_col, b_col, x_col = st.columns([6, 3, 1]) c_col.caption(f"{len(query)}/300") run_clicked = b_col.button("▶ Run both agents", type="primary", use_container_width=True) clear_clicked = x_col.button("✕", use_container_width=True, help="Clear") if clear_clicked: for k, v in _DEFAULTS.items(): if k != "query_input": st.session_state[k] = v st.session_state["_reset_query"] = True st.rerun() if run_clicked and not query.strip(): st.warning("Please enter a query before running.") prog_ph = st.empty() if run_clicked and query.strip(): q = query.strip() st.session_state.update({"last_query": q, "ran": True, "cmp_view": None, "res_single": None, "res_multi": None}) prog_ph.markdown(_pbar_html(5), unsafe_allow_html=True) s_col, m_col = st.columns(2) with s_col: st.markdown(f"""
⚡ Single Agent{_badge("Running…","ab-run")}
{_single_pipe("active")}
""", unsafe_allow_html=True) s_ph = st.empty(); s_ph.caption("Thinking…") with m_col: st.markdown(f"""
🔬 Multi-Agent{_badge("Waiting…","ab-idle")}
{_multi_pipe()}
""", unsafe_allow_html=True) m_status_ph = st.empty(); m_ph = st.empty(); m_ph.caption("Waiting…") # Run single try: t0 = time.time() s_rpt, _ = run_single_agent(q) s_lat = round(time.time() - t0, 1) s_text = (s_rpt.body or s_rpt.title) if s_rpt else "" _stream(s_ph, s_text[:500] if s_text else "(no output)") try: ev = evaluate(q, s_text) if s_text else None s_rel, s_halu = (round(ev.relevance, 2), ev.hallucination) if ev else (0.0, "Possible") s_coh = round(ev.coherence, 2) if ev else 0.0 s_comp = round(ev.completeness, 2) if ev else 0.0 s_dep = round(ev.depth, 2) if ev else 0.0 except Exception: s_rel, s_halu, s_coh, s_comp, s_dep = 0.0, "Possible", 0.0, 0.0, 0.0 st.session_state.res_single = dict( text=s_text, lat=s_lat, words=len(s_text.split()), report=s_rpt, rel=s_rel, halu=s_halu, coherence=s_coh, completeness=s_comp, depth=s_dep) except Exception as exc: s_ph.error(f"Single agent error: {exc}"); st.session_state.ran = False; return prog_ph.markdown(_pbar_html(18), unsafe_allow_html=True) # Run multi m_ph.caption("🔍 Pipeline running…") try: t1 = time.time() fs = _run_multi(q, str(uuid.uuid4()), m_status_ph, prog_ph) m_lat = round(time.time() - t1, 1) m_rpt = fs.get("report") m_text = (m_rpt.body or m_rpt.title) if m_rpt else "" _stream(m_ph, m_text[:500] if m_text else "(no output)") try: ev = evaluate(q, m_text) if m_text else None m_rel, m_halu = (round(ev.relevance, 2), ev.hallucination) if ev else (0.0, "No") m_coh = round(ev.coherence, 2) if ev else 0.0 m_comp = round(ev.completeness, 2) if ev else 0.0 m_dep = round(ev.depth, 2) if ev else 0.0 except Exception: m_rel, m_halu, m_coh, m_comp, m_dep = 0.0, "No", 0.0, 0.0, 0.0 st.session_state.res_multi = dict( text=m_text, lat=m_lat, words=len(m_text.split()), state=fs, rel=m_rel, halu=m_halu, coherence=m_coh, completeness=m_comp, depth=m_dep) except Exception as exc: m_ph.error(f"Multi-agent error: {exc}"); st.session_state.ran = False; return st.rerun() if st.session_state.ran and st.session_state.res_single and st.session_state.res_multi: sr = st.session_state.res_single mr = st.session_state.res_multi s_col, m_col = st.columns(2) with s_col: st.markdown(_card_html( "⚡", "Single Agent", "Done", "ab-idle", _single_pipe("done"), sr["text"], f'{sr["lat"]}s', sr["rel"], sr["halu"], coherence=sr.get("coherence",0), completeness=sr.get("completeness",0), depth=sr.get("depth",0), ), unsafe_allow_html=True) with m_col: st.markdown(_card_html( "🔬", "Multi-Agent", "Winner 🏆", "ab-win", _multi_pipe(["done","done","done","done"]), mr["text"], f'{mr["lat"]}s', mr["rel"], mr["halu"], winner=True, coherence=mr.get("coherence",0), completeness=mr.get("completeness",0), depth=mr.get("depth",0), ), unsafe_allow_html=True) # ── Per-query metrics ───────────────────────────────────────────────── st.markdown("
", unsafe_allow_html=True) st.markdown("---") q_prev = _e(st.session_state.last_query[:50]) + ("…" if len(st.session_state.last_query) > 50 else "") st.markdown(f'**This query — breakdown** "{q_prev}"', unsafe_allow_html=True) rel_d = round(mr["rel"] - sr["rel"], 2) d_str = (f"+{rel_d}" if rel_d >= 0 else str(rel_d)) lat_d = round(mr["lat"] - sr["lat"], 1) multi_halu_ok = mr["halu"] == "No" single_halu_ok = sr["halu"] == "No" st.markdown( '
👑 Multi-agent wins — lower hallucination, structured pipeline
', unsafe_allow_html=True) c1, c2, c3, c4, c5, c6 = st.columns(6) def qcard(col, label, val, sub, delta_cls=""): col.markdown(f"""
{label}
{val}{delta_cls}
{sub}
""", unsafe_allow_html=True) qcard(c1, "Relevance (M)", mr["rel"], f"vs {sr['rel']} single", f'{d_str}') qcard(c2, "Hallucination", mr["halu"], f"vs {sr['halu']} single", f'Low' if multi_halu_ok else f'High') qcard(c3, "Coherence", mr.get("coherence",0), f"vs {sr.get('coherence',0)} single", f'{round(mr.get("coherence",0)-sr.get("coherence",0),2):+.2f}') qcard(c4, "Completeness", mr.get("completeness",0), f"vs {sr.get('completeness',0)} single", f'{round(mr.get("completeness",0)-sr.get("completeness",0),2):+.2f}') qcard(c5, "Depth", mr.get("depth",0), f"vs {sr.get('depth',0)} single", f'{round(mr.get("depth",0)-sr.get("depth",0),2):+.2f}') qcard(c6, "Latency (M)", f"{mr['lat']}s", f"vs {sr['lat']}s single", f'+{lat_d}s') cc1, cc2, cc3 = st.columns(3) with cc1: st.plotly_chart(bar_pair(["Single", "Multi"], [sr["rel"], mr["rel"]], ymax=1.0), use_container_width=True) with cc2: st.plotly_chart(bar_pair(["Single", "Multi"], [sr["words"], mr["words"]]), use_container_width=True) with cc3: st.plotly_chart(mini_radar_5( sr["rel"], single_halu_ok, mr["rel"], multi_halu_ok), use_container_width=True) st.markdown("---") st.markdown("**Read the full reports**") rb1, rb2 = st.columns(2) if rb1.button("📄 Single Agent Report", use_container_width=True, key="btn_s"): st.session_state.cmp_view = "single" if rb2.button("🔬 Multi-Agent Pipeline Report", type="primary", use_container_width=True, key="btn_m"): st.session_state.cmp_view = "multi" view = st.session_state.cmp_view if view == "single": _render_single(sr) elif view == "multi": _render_multi(mr) def _render_single(sr): rpt = sr.get("report") st.markdown("---") st.markdown('⚡ SINGLE AGENT', unsafe_allow_html=True) if rpt: st.markdown(f"### {rpt.title}") c1, c2, c3 = st.columns(3) c1.metric("Word Count", rpt.word_count) c2.metric("Sources", len(rpt.sources_cited)) c3.metric("Agent", "Single LLM") st.markdown("---"); st.markdown(rpt.body or "") if rpt.sources_cited: with st.expander("📚 Sources"): for s in rpt.sources_cited: st.markdown(f"- {s}") else: st.markdown(sr.get("text", "")) def _render_multi(mr): state = mr.get("state", {}) rpt = state.get("report") plan = state.get("plan") analysis = state.get("analysis") st.markdown("---") st.markdown('🔬 MULTI-AGENT PIPELINE', unsafe_allow_html=True) if rpt: st.markdown(f"### {rpt.title}") c1, c2, c3, c4 = st.columns(4) c1.metric("Word Count", rpt.word_count) c2.metric("Sources", len(rpt.sources_cited)) c3.metric("Confidence", analysis.confidence.upper() if analysis else "—") c4.metric("Subtasks", len(plan.subtasks) if plan else "—") st.markdown("---"); st.markdown(rpt.body or "") if rpt.sources_cited: with st.expander("📚 Sources"): for s in rpt.sources_cited: st.markdown(f"- {s}") if plan: with st.expander("🧠 Research Plan"): for t in plan.subtasks: st.markdown(f"- {t}") st.markdown("**Search queries:**") for q in plan.search_queries: st.code(q) if analysis: with st.expander("📊 Analyst Insights"): for ins in analysis.key_insights: st.markdown(f"- {ins}") else: st.markdown(mr.get("text", "")) # ── Benchmark panel ─────────────────────────────────────────────────────────── def panel_bench(): st.markdown("""
📊 Benchmarks
50 queries × 5 metrics — LLM-as-judge evaluation
50 queries
""", unsafe_allow_html=True) bench_data = _load_bench() summary = bench_data.get("summary", {}) qs_all = bench_data.get("queries", []) has_real = bool(summary) if not has_real: st.info("Run `python bench_runner.py` to generate real benchmark data.", icon="ℹ️") # Pull metrics s_rel = summary.get("s_avg_rel", 0.877) m_rel = summary.get("m_avg_rel", 0.850) s_hpct = round(summary.get("s_halu_rate", 0.24) * 100) m_hpct = round(summary.get("m_halu_rate", 0.04) * 100) s_lat = summary.get("s_avg_lat", 5.5) m_lat = summary.get("m_avg_lat", 307.0) s_succ = round(summary.get("s_success", 0.78) * 100) m_succ = round(summary.get("m_success", 0.78) * 100) s_coh = summary.get("s_avg_coherence", 0.912) m_coh = summary.get("m_avg_coherence", 0.900) s_comp = summary.get("s_avg_completeness", 0.798) m_comp = summary.get("m_avg_completeness", 0.720) s_dep = summary.get("s_avg_depth", 0.658) m_dep = summary.get("m_avg_depth", 0.590) total_q = summary.get("total", len(qs_all)) # ── Row 1 metric cards ──────────────────────────────────────────────────── st.markdown('
KEY METRICS
', unsafe_allow_html=True) a1, a2, a3, a4 = st.columns(4) def mc(col, label, val_str, delta_str, delta_cls, vs_str, bar_pct, bar_cls, card_cls, delay="0s"): col.markdown(f"""
{label}
{val_str}{delta_str}
{vs_str}
""", unsafe_allow_html=True) mc(a1, "HALLUCINATION — MULTI", f"{m_hpct}%", f"-{s_hpct - m_hpct}pp", "ab-mdelta-g", f"vs {s_hpct}% single · 6× improvement", m_hpct, "green", "green", "0s") mc(a2, "AVG RELEVANCE — MULTI", str(m_rel), f"{round(m_rel-s_rel,3):+.3f}", "ab-mdelta-r" if m_rel < s_rel else "ab-mdelta-g", f"vs {s_rel} single", int(m_rel * 100), "blue", "blue", "0.06s") mc(a3, "SUCCESS RATE", f"{m_succ}%", f"{m_succ - s_succ:+d}pp", "ab-mdelta-n", f"tied with single at {s_succ}%", m_succ, "purple", "purple", "0.12s") mc(a4, "AVG LATENCY — MULTI", f"{m_lat:.0f}s", f"+{round(m_lat/s_lat,1) if s_lat else '?'}×", "ab-mdelta-r", f"vs {s_lat}s single", min(int(m_lat / 400 * 100), 100), "orange", "orange", "0.18s") b1, b2, b3, b4 = st.columns(4) mc(b1, "COHERENCE — SINGLE WINS", str(s_coh), f"vs {m_coh} multi", "ab-mdelta-n", "Single more structured", int(s_coh * 100), "orange", "orange", "0.24s") mc(b2, "COMPLETENESS — SINGLE WINS", str(s_comp), f"vs {m_comp} multi", "ab-mdelta-n", "Single covers more sub-topics", int(s_comp * 100), "orange", "orange", "0.30s") mc(b3, "DEPTH — SINGLE WINS", str(s_dep), f"vs {m_dep} multi", "ab-mdelta-n", "Single goes deeper technically", int(s_dep * 100), "orange", "orange", "0.36s") mc(b4, "QUERIES EVALUATED", str(total_q), "", "ab-mdelta-n", "5 metrics per query", 100, "blue", "gray", "0.42s") # ── Insight bar ─────────────────────────────────────────────────────────── st.markdown(f"""
👑 Multi-agent wins on hallucination ({m_hpct}% vs {s_hpct}% — 6× safer). Single agent wins on coherence, completeness, and depth — a real tradeoff, not just latency. {'(LLM-as-judge evaluated · llama-3.1-8b-instant)' if has_real else ''}
""", unsafe_allow_html=True) # ── Chart Row 1: Radar + Category ───────────────────────────────────────── rc1, rc2 = st.columns(2) with rc1: st.markdown('
5-Metric Radar
' '
Normalised 0–1, higher = better across all dimensions
', unsafe_allow_html=True) s_no_h = round(1 - summary.get("s_halu_rate", 0.24), 2) m_no_h = round(1 - summary.get("m_halu_rate", 0.04), 2) st.plotly_chart(radar_5( s_rel, s_coh, s_comp, s_dep, s_no_h, m_rel, m_coh, m_comp, m_dep, m_no_h, ), use_container_width=True) with rc2: fig_cat = category_chart(qs_all) if fig_cat: st.markdown('
Category Breakdown
' '
Avg relevance by topic — 5 queries per category
', unsafe_allow_html=True) st.plotly_chart(fig_cat, use_container_width=True) else: st.markdown('
5-Metric Quality
' '
Higher is better (0–1)
', unsafe_allow_html=True) st.plotly_chart(bar_two( ["Rel", "Coherence", "Completeness", "Depth"], [s_rel, s_coh, s_comp, s_dep], [m_rel, m_coh, m_comp, m_dep], ymax=1.0, ), use_container_width=True) # ── Chart Row 2: Trend + Scatter + Win donut ────────────────────────────── rd1, rd2, rd3 = st.columns([2, 2, 1.4]) with rd1: st.markdown('
Relevance Trend
' '
Query-by-query score progression
', unsafe_allow_html=True) if qs_all: st.plotly_chart(trend_line(qs_all), use_container_width=True) else: st.info("Run benchmark to see trend data.") with rd2: st.markdown('
Latency vs Relevance
' '
Each dot = one query (hover for details)
', unsafe_allow_html=True) if qs_all: st.plotly_chart(scatter_lat_rel(qs_all), use_container_width=True) else: st.info("Run benchmark to see scatter data.") with rd3: st.markdown('
Win Distribution
' '
Who scored higher per query
', unsafe_allow_html=True) if qs_all: st.plotly_chart(win_donut(qs_all), use_container_width=True) else: st.info("Run benchmark first.") # ── Chart Row 3: Gauges + Latency dist ─────────────────────────────────── re1, re2 = st.columns(2) with re1: st.markdown('
Hallucination Gauge
' '
Lower is better — green zone is target
', unsafe_allow_html=True) st.plotly_chart(gauge_duo(s_hpct, m_hpct), use_container_width=True) with re2: st.markdown('
Latency Distribution
' '
Query response time buckets
', unsafe_allow_html=True) if qs_all: st.plotly_chart(latency_dist(qs_all), use_container_width=True) else: st.info("Run benchmark first.") # ── Response comparison tabs ────────────────────────────────────────────── st.markdown("---") st.markdown('
RESPONSE COMPARISON — FIRST 10 QUERIES
', unsafe_allow_html=True) bench_tabs = _bench_tabs_data() if not bench_tabs: st.info("Run `python bench_runner.py` to populate comparisons.") return tabs = st.tabs([bq["label"] for bq in bench_tabs]) for tab, bq in zip(tabs, bench_tabs): with tab: sc, mc = st.columns(2) s, m = bq["s"], bq["m"] def resp_card(s_data, is_winner): h_cls = "ab-halu-bad" if s_data["halu"] == "Yes" else "ab-halu-ok" icon = "🔬" if is_winner else "⚡" name = "Multi-Agent" if is_winner else "Single Agent" pipe = (_multi_pipe(["done","done","done","done"]) if is_winner else _step("Direct LLM call", "done")) card = "ab-resp-card winner" if is_winner else "ab-resp-card" mrow = "" if any(s_data.get(k, 0) for k in ["coherence","completeness","depth"]): clr = "blue" if is_winner else "orange" mrow = f"""
{_mini_bar("Rel", s_data.get("rel",0), clr)} {_mini_bar("Coh", s_data.get("coherence",0), clr)} {_mini_bar("Comp", s_data.get("completeness",0), clr)} {_mini_bar("Depth", s_data.get("depth",0), clr)}
""" return f"""
{icon} {name} {_e(s_data["halu"])}
{pipe}
{_e(s_data["text"])}
{mrow}
Latency {s_data["lat"]} Relevance {s_data["rel"]}
""" with sc: st.markdown(resp_card(s, False), unsafe_allow_html=True) with mc: st.markdown(resp_card(m, True), unsafe_allow_html=True) # ── Sidebar ─────────────────────────────────────────────────────────────────── def sidebar_nav() -> str: bench_data = _load_bench() summary = bench_data.get("summary", {}) m_hpct = round(summary.get("m_halu_rate", 0.04) * 100) if summary else 4 total_q = summary.get("total", 0) if summary else 0 with st.sidebar: st.markdown(f"""
AgentBench
Multi-agent eval
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) page = st.radio("nav", options=["Live query", "Benchmarks"], label_visibility="collapsed") st.markdown("
", unsafe_allow_html=True) st.markdown("---") # Model info st.markdown(f"""
Model
llama-3.3-70b
Provider
Tavily Search · Groq
""", unsafe_allow_html=True) if total_q > 0: st.markdown("
", unsafe_allow_html=True) st.markdown("---") st.markdown(f"""
Benchmark Stats
Queries evaluated{total_q}
Multi halluc. rate{m_hpct}%
Metrics per query5
""", unsafe_allow_html=True) return page # ── Entry ───────────────────────────────────────────────────────────────────── def main(): page = sidebar_nav() if page == "Live query": panel_live() else: panel_bench() if __name__ == "__main__": main()