| """CompToolBench Gradio Demo — Interactive Benchmark Explorer. | |
| Designed for HuggingFace Spaces (free CPU tier). | |
| Launch locally: python demo/app.py | |
| """ | |
| from __future__ import annotations | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| # --------------------------------------------------------------------------- | |
| # DATA — extracted verbatim from paper/tables/leaderboard.tex | |
| # Columns: Model, Provider, L0, L1, L2, L3, Overall, Delta, SelectionGap | |
| # Delta = L0 - L3 (positive = degradation). | |
| # SelectionGap (dagger) = L0 < avg(L1,L2,L3). | |
| # --------------------------------------------------------------------------- | |
CLOUD_MODELS = [
    # (Model, Provider, L0, L1, L2, L3, Overall, Delta, SelectionGap)
    ("Llama 3.1 8B", "Groq", 27.1, 75.8, 87.1, 76.0, 66.4, -48.9, True),
    ("Command A", "Cohere", 45.8, 62.7, 87.8, 40.8, 58.4, 5.1, True),
    ("Mistral Small", "Mistral", 45.8, 59.7, 87.6, 40.9, 57.5, 4.9, True),
    ("Command R+", "Cohere", 43.8, 57.5, 88.0, 40.3, 56.2, 3.4, True),
    ("Llama 3.1 8B", "Cerebras", 31.2, 66.1, 81.2, 46.4, 56.0, -15.1, True),
    ("Mistral Large", "Mistral", 39.6, 59.5, 87.9, 38.5, 55.4, 1.1, True),
    ("Mistral Medium", "Mistral", 43.8, 57.5, 87.9, 36.3, 55.2, 7.4, True),
    ("Gemini 2.0 Flash", "OpenRouter", 39.6, 52.4, 85.7, 39.0, 52.8, 0.6, True),
    ("GPT-OSS 120B", "Cerebras", 45.8, 56.3, 56.1, 29.0, 47.2, 16.8, True),
    ("Llama 4 Scout 17B", "Groq", 37.5, 49.6, 55.8, 7.0, 37.7, 30.5, False),
]

LOCAL_MODELS = [
    ("Granite4 3B", "Ollama", 45.8, 57.3, 56.1, 30.2, 47.8, 15.6, True),
    ("Granite4 1B", "Ollama", 41.7, 56.3, 55.9, 29.9, 46.4, 11.8, True),
    ("Mistral 7B", "Ollama", 43.8, 57.7, 49.2, 30.5, 46.1, 13.3, True),
    ("Llama 3.1 8B", "Ollama", 39.6, 56.7, 56.1, 29.5, 45.9, 10.1, True),
    ("Mistral Nemo 12B", "Ollama", 37.5, 58.4, 51.0, 31.8, 45.5, 5.7, True),
    ("Qwen 2.5 7B", "Ollama", 39.6, 56.7, 53.8, 25.8, 44.6, 13.8, True),
    ("Mistral Small 24B", "Ollama", 37.5, 51.1, 47.7, 22.6, 40.3, 14.9, True),
    ("Qwen3 8B", "Ollama", 35.4, 52.0, 36.9, 21.8, 37.7, 13.7, True),
]

# Averages from the table
AVERAGES = {
    "All models": {"L0": 40.0, "L1": 58.0, "L2": 67.3, "L3": 34.2, "Overall": 49.8, "Delta": 5.8},
    "Cloud avg": {"L0": 40.0, "L1": 59.7, "L2": 80.5, "L3": 39.4, "Overall": 54.3, "Delta": 0.6},
    "Local avg": {"L0": 40.1, "L1": 55.8, "L2": 50.8, "L3": 27.8, "Overall": 44.3, "Delta": 12.3},
}
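

# ---------------------------------------------------------------------------
# Optional consistency check (added for this demo; not part of the paper's
# pipeline). The derived columns follow directly from the per-level scores:
# Delta = L0 - L3 and SelectionGap = L0 < avg(L1, L2, L3), so they can be
# re-derived and compared against the hard-coded values. The tolerance absorbs
# the paper's rounding to one decimal place. Call manually or from tests.
# ---------------------------------------------------------------------------
def _check_table_consistency(tolerance: float = 0.15) -> None:
    """Sanity-check Delta and SelectionGap against the raw per-level scores."""
    for model, provider, l0, l1, l2, l3, _overall, delta, sgap in CLOUD_MODELS + LOCAL_MODELS:
        name = f"{model} ({provider})"
        # Delta should equal L0 - L3 up to rounding.
        assert abs(delta - (l0 - l3)) <= tolerance, f"Delta mismatch for {name}"
        # SelectionGap should mean "L0 below the composed average".
        assert sgap == (l0 < (l1 + l2 + l3) / 3.0), f"SelectionGap mismatch for {name}"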


def _build_display_name(model: str, provider: str) -> str:
    """Build a unique display name like 'Llama 3.1 8B (Groq)'."""
    return f"{model} ({provider})"


def build_full_dataframe() -> pd.DataFrame:
    """Build the full leaderboard DataFrame with all 18 models."""
    rows = []
    for model_list, model_type in ((CLOUD_MODELS, "Cloud"), (LOCAL_MODELS, "Local")):
        for model, provider, l0, l1, l2, l3, overall, delta, sgap in model_list:
            rows.append({
                "Rank": 0,
                "Model": _build_display_name(model, provider),
                "Provider": provider,
                "Type": model_type,
                "L0": l0,
                "L1": l1,
                "L2": l2,
                "L3": l3,
                "Overall": overall,
                "Delta": delta,
                "Selection Gap": sgap,
                "Composed Avg": round((l1 + l2 + l3) / 3.0, 1),
            })
    df = pd.DataFrame(rows)
    df = df.sort_values("Overall", ascending=False).reset_index(drop=True)
    df["Rank"] = df.index + 1
    return df
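

# Illustrative usage (derived from the tables above): after sorting, the top
# row is the model with the highest Overall score, i.e. Llama 3.1 8B (Groq)
# at 66.4 in the current data.
#   >>> build_full_dataframe().iloc[0]["Model"]
#   'Llama 3.1 8B (Groq)'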

# ---------------------------------------------------------------------------
# PLOTLY THEME CONSTANTS
# ---------------------------------------------------------------------------
BG_COLOR = "#1a1a2e"
CARD_BG = "#16213e"
GRID_COLOR = "#2a2a4a"
TEXT_COLOR = "#e0e0e0"
ACCENT_BLUE = "#4fc3f7"
ACCENT_GREEN = "#66bb6a"
ACCENT_ORANGE = "#ffa726"
ACCENT_RED = "#ef5350"
ACCENT_PURPLE = "#ab47bc"

LEVEL_COLORS = {
    "L0": ACCENT_BLUE,
    "L1": ACCENT_GREEN,
    "L2": ACCENT_ORANGE,
    "L3": ACCENT_RED,
}

PLOTLY_LAYOUT = dict(
    paper_bgcolor=BG_COLOR,
    plot_bgcolor=CARD_BG,
    font=dict(color=TEXT_COLOR, family="Inter, system-ui, sans-serif"),
    xaxis=dict(gridcolor=GRID_COLOR, zerolinecolor=GRID_COLOR),
    yaxis=dict(gridcolor=GRID_COLOR, zerolinecolor=GRID_COLOR),
    margin=dict(l=60, r=30, t=60, b=80),
    hoverlabel=dict(bgcolor=CARD_BG, font_color=TEXT_COLOR, bordercolor=GRID_COLOR),
)


def _apply_layout(fig: go.Figure, **kwargs) -> go.Figure:
    """Apply consistent dark theme to a plotly figure."""
    layout = {**PLOTLY_LAYOUT, **kwargs}
    fig.update_layout(**layout)
    return fig


# ---------------------------------------------------------------------------
# TAB 1: LEADERBOARD (styled DataFrame)
# ---------------------------------------------------------------------------
def format_leaderboard_html(df: pd.DataFrame) -> str:
    """Build a styled HTML leaderboard table with color-coded scores."""

    def _score_color(val: float, low: float = 20.0, high: float = 80.0) -> str:
        """Map a score to a green-yellow-red gradient."""
        ratio = max(0.0, min(1.0, (val - low) / (high - low)))
        if ratio > 0.5:
            # green zone
            r = int(255 * (1 - (ratio - 0.5) * 2))
            g = 200
        else:
            # red zone
            r = 240
            g = int(200 * ratio * 2)
        return f"rgb({r},{g},80)"

    def _gap_badge(has_gap: bool) -> str:
        if has_gap:
            return '<span style="color:#66bb6a;font-weight:600;">Yes</span>'
        return '<span style="color:#999;">No</span>'

    def _type_badge(model_type: str) -> str:
        if model_type == "Cloud":
            return '<span style="background:#1e3a5f;color:#4fc3f7;padding:2px 8px;border-radius:4px;font-size:0.8em;">Cloud</span>'
        return '<span style="background:#2e3a1f;color:#a5d6a7;padding:2px 8px;border-radius:4px;font-size:0.8em;">Local</span>'

    css = """
    <style>
    .lb-table {
        width: 100%;
        border-collapse: collapse;
        font-family: 'Inter', system-ui, sans-serif;
        font-size: 14px;
    }
    .lb-table th {
        background: #0d1b2a;
        color: #b0bec5;
        padding: 12px 10px;
        text-align: center;
        font-weight: 600;
        border-bottom: 2px solid #2a2a4a;
        cursor: pointer;
        user-select: none;
        white-space: nowrap;
    }
    .lb-table th:first-child, .lb-table th:nth-child(2) {
        text-align: left;
    }
    .lb-table td {
        padding: 10px 10px;
        text-align: center;
        border-bottom: 1px solid #1a1a3a;
    }
    .lb-table td:first-child {
        font-weight: 700;
        color: #ffd54f;
        text-align: center;
        width: 40px;
    }
    .lb-table td:nth-child(2) {
        text-align: left;
        font-weight: 500;
        color: #e0e0e0;
        max-width: 220px;
    }
    .lb-table tr:hover {
        background: #1e2d4a !important;
    }
    .lb-table tr:nth-child(even) {
        background: #111827;
    }
    .lb-table tr:nth-child(odd) {
        background: #0f1729;
    }
    .lb-table .score-cell {
        font-weight: 600;
        font-variant-numeric: tabular-nums;
    }
    .lb-table .overall-cell {
        font-weight: 700;
        font-size: 15px;
    }
    .lb-avg-row td {
        background: #1a1a2e !important;
        border-top: 2px solid #4fc3f7;
        font-style: italic;
        color: #90caf9;
    }
    .lb-divider td {
        background: #1a1a2e !important;
        border-top: 2px solid #2a2a4a;
        padding: 2px;
        height: 4px;
    }
    </style>
    """

    header = """
    <table class="lb-table">
    <thead>
    <tr>
        <th>#</th>
        <th>Model</th>
        <th>Type</th>
        <th>L0</th>
        <th>L1</th>
        <th>L2</th>
        <th>L3</th>
        <th>Overall</th>
        <th>Selection Gap</th>
    </tr>
    </thead>
    <tbody>
    """

    rows_html = ""
    for _, row in df.iterrows():
        l0_c = _score_color(row["L0"])
        l1_c = _score_color(row["L1"])
        l2_c = _score_color(row["L2"])
        l3_c = _score_color(row["L3"])
        ov_c = _score_color(row["Overall"])
        rows_html += f"""
        <tr>
            <td>{row['Rank']}</td>
            <td>{row['Model']}</td>
            <td>{_type_badge(row['Type'])}</td>
            <td class="score-cell" style="color:{l0_c}">{row['L0']:.1f}</td>
            <td class="score-cell" style="color:{l1_c}">{row['L1']:.1f}</td>
            <td class="score-cell" style="color:{l2_c}">{row['L2']:.1f}</td>
            <td class="score-cell" style="color:{l3_c}">{row['L3']:.1f}</td>
            <td class="overall-cell" style="color:{ov_c}">{row['Overall']:.1f}</td>
            <td>{_gap_badge(row['Selection Gap'])}</td>
        </tr>
        """

    # Divider
    rows_html += '<tr class="lb-divider"><td colspan="9"></td></tr>'

    # Averages
    for label, avg in AVERAGES.items():
        l0_c = _score_color(avg["L0"])
        l1_c = _score_color(avg["L1"])
        l2_c = _score_color(avg["L2"])
        l3_c = _score_color(avg["L3"])
        ov_c = _score_color(avg["Overall"])
        rows_html += f"""
        <tr class="lb-avg-row">
            <td></td>
            <td><em>{label}</em></td>
            <td></td>
            <td class="score-cell" style="color:{l0_c}">{avg['L0']:.1f}</td>
            <td class="score-cell" style="color:{l1_c}">{avg['L1']:.1f}</td>
            <td class="score-cell" style="color:{l2_c}">{avg['L2']:.1f}</td>
            <td class="score-cell" style="color:{l3_c}">{avg['L3']:.1f}</td>
            <td class="overall-cell" style="color:{ov_c}">{avg['Overall']:.1f}</td>
            <td></td>
        </tr>
        """

    footer = "</tbody></table>"
    return css + header + rows_html + footer


# ---------------------------------------------------------------------------
# TAB 2: SELECTION GAP VISUALIZATION
# ---------------------------------------------------------------------------
def plot_selection_gap(df: pd.DataFrame) -> go.Figure:
    """Bar chart: L0 vs Composed Average for each model, with gap labels."""
    df_sorted = df.sort_values("Overall", ascending=True)
    fig = go.Figure()
    # L0 bars
    fig.add_trace(go.Bar(
        y=df_sorted["Model"],
        x=df_sorted["L0"],
        name="L0 (Single Tool)",
        orientation="h",
        marker=dict(color=ACCENT_BLUE, line=dict(width=0)),
        text=[f"{v:.1f}" for v in df_sorted["L0"]],
        textposition="inside",
        textfont=dict(size=11, color="white"),
        hovertemplate="<b>%{y}</b><br>L0: %{x:.1f}%<extra></extra>",
    ))
    # Composed average bars
    fig.add_trace(go.Bar(
        y=df_sorted["Model"],
        x=df_sorted["Composed Avg"],
        name="Composed Avg (L1-L3)",
        orientation="h",
        marker=dict(color=ACCENT_ORANGE, line=dict(width=0)),
        text=[f"{v:.1f}" for v in df_sorted["Composed Avg"]],
        textposition="inside",
        textfont=dict(size=11, color="white"),
        hovertemplate="<b>%{y}</b><br>Composed Avg: %{x:.1f}%<extra></extra>",
    ))
    # Add gap annotations
    for _, row in df_sorted.iterrows():
        gap = row["Composed Avg"] - row["L0"]
        direction = "+" if gap > 0 else ""
        color = ACCENT_GREEN if gap > 0 else ACCENT_RED
        x_pos = max(row["L0"], row["Composed Avg"]) + 2
        fig.add_annotation(
            x=x_pos,
            y=row["Model"],
            text=f"<b>{direction}{gap:.1f}</b>",
            showarrow=False,
            font=dict(color=color, size=11),
            xanchor="left",
        )
    fig = _apply_layout(
        fig,
        title=dict(text="Selection Gap: L0 (Single Tool) vs Composed Average (L1-L3)", font=dict(size=16)),
        barmode="group",
        xaxis=dict(title="Accuracy (%)", range=[0, 100], gridcolor=GRID_COLOR),
        yaxis=dict(title="", gridcolor=GRID_COLOR, tickfont=dict(size=11)),
        legend=dict(
            orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
            bgcolor="rgba(0,0,0,0)",
        ),
        height=700,
    )
    return fig


# ---------------------------------------------------------------------------
# TAB 3: LEVEL COMPARISON
# ---------------------------------------------------------------------------
def plot_level_comparison(df: pd.DataFrame, model_type: str = "All") -> go.Figure:
    """Grouped bar chart: L0/L1/L2/L3 per model, filterable by type."""
    if model_type in ("Cloud", "Local"):
        df_plot = df[df["Type"] == model_type].copy()
    else:
        df_plot = df.copy()
    df_plot = df_plot.sort_values("Overall", ascending=True)
    fig = go.Figure()
    for level, color in LEVEL_COLORS.items():
        fig.add_trace(go.Bar(
            y=df_plot["Model"],
            x=df_plot[level],
            name=level,
            orientation="h",
            marker=dict(color=color, line=dict(width=0.5, color="#111")),
            text=[f"{v:.1f}" for v in df_plot[level]],
            textposition="outside",
            textfont=dict(size=9),
            hovertemplate=f"<b>%{{y}}</b><br>{level}: %{{x:.1f}}%<extra></extra>",
        ))
    n_models = len(df_plot)
    fig = _apply_layout(
        fig,
        title=dict(
            text=f"Performance by Composition Level ({model_type} Models)",
            font=dict(size=16),
        ),
        barmode="group",
        xaxis=dict(title="Accuracy (%)", range=[0, 105], gridcolor=GRID_COLOR),
        yaxis=dict(title="", gridcolor=GRID_COLOR, tickfont=dict(size=11)),
        legend=dict(
            orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
            bgcolor="rgba(0,0,0,0)",
        ),
        height=max(400, n_models * 50 + 150),
    )
    return fig


def plot_level_radar() -> go.Figure:
    """Radar/spider chart comparing cloud vs local averages."""
    categories = ["L0", "L1", "L2", "L3"]
    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=[AVERAGES["Cloud avg"]["L0"], AVERAGES["Cloud avg"]["L1"],
           AVERAGES["Cloud avg"]["L2"], AVERAGES["Cloud avg"]["L3"],
           AVERAGES["Cloud avg"]["L0"]],
        theta=categories + [categories[0]],
        fill="toself",
        name="Cloud Avg",
        line=dict(color=ACCENT_BLUE, width=2),
        fillcolor="rgba(79, 195, 247, 0.2)",
    ))
    fig.add_trace(go.Scatterpolar(
        r=[AVERAGES["Local avg"]["L0"], AVERAGES["Local avg"]["L1"],
           AVERAGES["Local avg"]["L2"], AVERAGES["Local avg"]["L3"],
           AVERAGES["Local avg"]["L0"]],
        theta=categories + [categories[0]],
        fill="toself",
        name="Local Avg",
        line=dict(color=ACCENT_PURPLE, width=2),
        fillcolor="rgba(171, 71, 188, 0.2)",
    ))
    fig.update_layout(
        polar=dict(
            bgcolor=CARD_BG,
            radialaxis=dict(
                visible=True, range=[0, 90],
                gridcolor=GRID_COLOR, linecolor=GRID_COLOR,
                tickfont=dict(color=TEXT_COLOR, size=10),
            ),
            angularaxis=dict(
                gridcolor=GRID_COLOR, linecolor=GRID_COLOR,
                tickfont=dict(color=TEXT_COLOR, size=13, family="Inter, system-ui, sans-serif"),
            ),
        ),
        paper_bgcolor=BG_COLOR,
        font=dict(color=TEXT_COLOR, family="Inter, system-ui, sans-serif"),
        title=dict(text="Cloud vs Local: Performance Profile", font=dict(size=16, color=TEXT_COLOR)),
        legend=dict(
            orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5,
            bgcolor="rgba(0,0,0,0)",
        ),
        height=500,
        margin=dict(l=80, r=80, t=80, b=80),
    )
    return fig


# ---------------------------------------------------------------------------
# TAB 4: ABOUT
# ---------------------------------------------------------------------------
ABOUT_MD = """
## CompToolBench: Measuring Compositional Tool-Use in LLMs

**CompToolBench** is a benchmark that measures *compositional tool-use generalization* in large
language models. The central question: if an LLM can use tools A, B, and C individually, can it
compose them into novel pipelines like `A(B(C(x)))`?

---

### Composition Levels

| Level | Topology | Description |
|:------|:---------|:------------|
| **L0 (Node)** | Single call | One tool, correct arguments -- the baseline |
| **L1 (Chain)** | A -> B -> C | Sequential: output of tool_i feeds tool_{i+1} |
| **L2 (Parallel)** | [A, B] -> C | Independent calls whose results merge downstream |
| **L3 (DAG)** | Complex graph | Branching, merging, and sequential edges combined |
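
As a schematic sketch (the tool names here are hypothetical, not drawn from the
benchmark's catalogue), the four topologies look like:

```python
# Hypothetical stub tools -- illustrative only, not the benchmark's catalogue.
def geocode(q): return {"lat": 33.6, "lon": -101.9}
def weather(loc): return {"temp_c": 21}
def news(q): return ["headline"]
def merge(*parts): return {"parts": parts}

r0 = geocode("Lubbock, TX")                                   # L0: single call
r1 = weather(geocode("Lubbock, TX"))                          # L1: chain A -> B
r2 = merge(weather({"lat": 0, "lon": 0}), news("Lubbock"))    # L2: parallel [A, B] -> C
r3 = merge(weather(geocode("Lubbock, TX")), news("Lubbock"))  # L3: DAG (branch + merge)
```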

---

### Key Finding: The Selection Gap

> **17 out of 18 models exhibit a Selection Gap**: their L0 (single-tool) accuracy is *lower*
> than their average accuracy on composed tasks (L1-L3).

This is counter-intuitive. Models are *better* at multi-step composition than at simple
single-tool selection. The explanation: L0 tests pure tool *selection* (choosing the right
tool from a large catalogue), while L1-L3 tasks provide more structural context that narrows
the search space. The hardest part of tool use is not execution -- it is *selection*.
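
Concretely, averaging the all-model row of the leaderboard gives
(58.0 + 67.3 + 34.2) / 3, roughly 53.2, on composed tasks, versus 40.0 on L0.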

---

### Benchmark Details

- **18 models** evaluated (10 cloud API, 8 local via Ollama)
- **106 deterministic tool simulations** across 15 categories
- **200 tasks** at 4 composition levels (L0-L3)
- **Deterministic scoring** with verifiable ground-truth execution traces

---

### Links

| Resource | Link |
|:---------|:-----|
| Paper | [ArXiv (coming soon)](#) |
| Code | [github.com/ronyrahmaan/comptoolbench](https://github.com/ronyrahmaan/comptoolbench) |
| Author | Md A Rahman, Texas Tech University |

---

<p style="text-align:center;color:#666;font-size:0.85em;">
CompToolBench -- February 2026
</p>
"""


# ---------------------------------------------------------------------------
# GRADIO APP
# ---------------------------------------------------------------------------
def create_app() -> gr.Blocks:
    """Build the full 4-tab Gradio Blocks application."""
    df = build_full_dataframe()

    custom_css = """
    .gradio-container {
        max-width: 1200px !important;
        margin: auto !important;
    }
    .main-header {
        text-align: center;
        padding: 20px 0 10px 0;
    }
    .main-header h1 {
        font-size: 2em;
        font-weight: 700;
        background: linear-gradient(135deg, #4fc3f7, #ab47bc);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        margin-bottom: 8px;
    }
    .main-header p {
        color: #aaa;
        font-size: 1.1em;
    }
    .stat-row {
        display: flex;
        justify-content: center;
        gap: 40px;
        padding: 15px 0;
        flex-wrap: wrap;
    }
    .stat-item {
        text-align: center;
    }
    .stat-num {
        font-size: 1.8em;
        font-weight: 700;
        color: #4fc3f7;
    }
    .stat-label {
        font-size: 0.85em;
        color: #888;
        text-transform: uppercase;
        letter-spacing: 1px;
    }
    footer {visibility: hidden;}
    """
    theme = gr.themes.Base(
        primary_hue=gr.themes.colors.blue,
        secondary_hue=gr.themes.colors.purple,
        neutral_hue=gr.themes.colors.gray,
        font=gr.themes.GoogleFont("Inter"),
    ).set(
        body_background_fill="#0f0f1a",
        body_background_fill_dark="#0f0f1a",
        block_background_fill="#1a1a2e",
        block_background_fill_dark="#1a1a2e",
        block_border_color="#2a2a4a",
        block_border_color_dark="#2a2a4a",
        block_label_text_color="#b0bec5",
        block_label_text_color_dark="#b0bec5",
        block_title_text_color="#e0e0e0",
        block_title_text_color_dark="#e0e0e0",
        body_text_color="#e0e0e0",
        body_text_color_dark="#e0e0e0",
        body_text_color_subdued="#888",
        body_text_color_subdued_dark="#888",
        background_fill_primary="#16213e",
        background_fill_primary_dark="#16213e",
        background_fill_secondary="#1a1a2e",
        background_fill_secondary_dark="#1a1a2e",
        border_color_accent="#4fc3f7",
        border_color_accent_dark="#4fc3f7",
        color_accent_soft="#1e3a5f",
        color_accent_soft_dark="#1e3a5f",
        button_primary_background_fill="#4fc3f7",
        button_primary_background_fill_dark="#4fc3f7",
        button_primary_text_color="#0f0f1a",
        button_primary_text_color_dark="#0f0f1a",
    )

    # Gradio 6+ moved theme/css from Blocks() to launch().
    # Detect version and pass params accordingly.
    _gradio_major = int(gr.__version__.split(".")[0])
    _blocks_kwargs: dict = {"title": "CompToolBench"}
    if _gradio_major < 6:
        _blocks_kwargs["theme"] = theme
        _blocks_kwargs["css"] = custom_css
    with gr.Blocks(**_blocks_kwargs) as app:
        # ── Header ──
        gr.HTML("""
        <div class="main-header">
            <h1>CompToolBench</h1>
            <p>Measuring Compositional Tool-Use Generalization in LLMs</p>
        </div>
        <div class="stat-row">
            <div class="stat-item">
                <div class="stat-num">18</div>
                <div class="stat-label">Models</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">106</div>
                <div class="stat-label">Tools</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">4</div>
                <div class="stat-label">Composition Levels</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">17/18</div>
                <div class="stat-label">Show Selection Gap</div>
            </div>
        </div>
        """)

        # ── Tab 1: Leaderboard ──
        with gr.Tab("Leaderboard", id="leaderboard"):
            gr.HTML(format_leaderboard_html(df))
            gr.Markdown(
                """
                **Reading the table:** Scores are accuracy percentages. Colors range from
                <span style="color:#ef5350">red</span> (low) to
                <span style="color:#66bb6a">green</span> (high).

                **Selection Gap** = the model's L0 is lower than its average of L1-L3
                (i.e., the model is *better* at composed tasks than single-tool selection).

                **Delta** in the paper = L0 minus L3 (positive means degradation from single to DAG).
                """,
                elem_classes=["block"],
            )

        # ── Tab 2: Selection Gap ──
        with gr.Tab("Selection Gap", id="selection-gap"):
            gr.Markdown(
                "### The Selection Gap: Why are models better at *composed* tasks than single-tool calls?"
            )
            gr.Plot(plot_selection_gap(df))
            gr.Markdown(
                """
                **How to read this chart:** For each model, the blue bar shows L0 accuracy
                (single-tool selection) and the orange bar shows the average of L1, L2, L3
                (composed tasks). The number on the right is the gap.

                A **positive gap** (green number) means the model performs *better* on composed
                tasks -- the Selection Gap. This happens because multi-step prompts provide
                richer structural context that narrows the tool search space.

                Only **Llama 4 Scout 17B** does not exhibit a Selection Gap, because its L3
                accuracy collapses to 7.0% (catastrophic DAG failure).
                """
            )
        # ── Tab 3: Level Comparison ──
        with gr.Tab("Level Comparison", id="level-comparison"):
            gr.Markdown("### Performance breakdown by composition level")
            model_filter = gr.Radio(
                choices=["All", "Cloud", "Local"],
                value="All",
                label="Filter by deployment type",
            )
            level_chart = gr.Plot(plot_level_comparison(df, "All"))
            model_filter.change(
                fn=lambda t: plot_level_comparison(df, t),
                inputs=[model_filter],
                outputs=[level_chart],
            )
            gr.Markdown("### Cloud vs Local: Aggregate Profile")
            gr.Plot(plot_level_radar())
            gr.Markdown(
                """
                **Key insight:** Cloud models massively outperform local models on L2
                (parallel composition): 80.5% vs 50.8%. This 30-point gap is the largest
                difference between deployment types at any level, suggesting that parallel
                tool orchestration is where API-served models have the biggest advantage.
                """
            )

        # ── Tab 4: About ──
        with gr.Tab("About", id="about"):
            gr.Markdown(ABOUT_MD)

    # Store launch kwargs for Gradio 6+ theme/css
    app._ctb_launch_kwargs = {}  # type: ignore[attr-defined]
    if _gradio_major >= 6:
        app._ctb_launch_kwargs["theme"] = theme  # type: ignore[attr-defined]
        app._ctb_launch_kwargs["css"] = custom_css  # type: ignore[attr-defined]
    return app


# ---------------------------------------------------------------------------
# ENTRY POINT
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    app = create_app()
    launch_kwargs = getattr(app, "_ctb_launch_kwargs", {})
    app.launch(share=False, **launch_kwargs)