Spaces:

mdarahmanxAI
/

comptoolbench-demo

Sleeping

App Files Files Community

mdarahmanxAI commited on 2 days ago

Commit

4d5e5f0

verified ·

1 Parent(s): 7f7f06f

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +735 -0

app.py ADDED Viewed

	@@ -0,0 +1,735 @@

+"""CompToolBench Gradio Demo — Interactive Benchmark Explorer.
+Designed for HuggingFace Spaces (free CPU tier).
+Launch locally:  python demo/app.py
+"""
+from __future__ import annotations
+import gradio as gr
+import pandas as pd
+import plotly.graph_objects as go
+# ---------------------------------------------------------------------------
+# DATA — extracted verbatim from paper/tables/leaderboard.tex
+# Columns: Model, Provider, L0, L1, L2, L3, Overall, Delta, SelectionGap
+# Delta = L0 - L3 (positive = degradation).
+# SelectionGap (dagger) = L0 < avg(L1,L2,L3).
+# ---------------------------------------------------------------------------
+CLOUD_MODELS = [
+    # (Model, Provider, L0, L1, L2, L3, Overall, Delta, SelectionGap)
+    ("Llama 3.1 8B",      "Groq",       27.1, 75.8, 87.1, 76.0, 66.4, -48.9, True),
+    ("Command A",          "Cohere",     45.8, 62.7, 87.8, 40.8, 58.4,   5.1, True),
+    ("Mistral Small",      "Mistral",    45.8, 59.7, 87.6, 40.9, 57.5,   4.9, True),
+    ("Command R+",         "Cohere",     43.8, 57.5, 88.0, 40.3, 56.2,   3.4, True),
+    ("Llama 3.1 8B",      "Cerebras",   31.2, 66.1, 81.2, 46.4, 56.0, -15.1, True),
+    ("Mistral Large",      "Mistral",    39.6, 59.5, 87.9, 38.5, 55.4,   1.1, True),
+    ("Mistral Medium",     "Mistral",    43.8, 57.5, 87.9, 36.3, 55.2,   7.4, True),
+    ("Gemini 2.0 Flash",   "OpenRouter", 39.6, 52.4, 85.7, 39.0, 52.8,   0.6, True),
+    ("GPT-OSS 120B",       "Cerebras",   45.8, 56.3, 56.1, 29.0, 47.2,  16.8, True),
+    ("Llama 4 Scout 17B",  "Groq",       37.5, 49.6, 55.8,  7.0, 37.7,  30.5, False),
+]
+LOCAL_MODELS = [
+    ("Granite4 3B",        "Ollama",     45.8, 57.3, 56.1, 30.2, 47.8,  15.6, True),
+    ("Granite4 1B",        "Ollama",     41.7, 56.3, 55.9, 29.9, 46.4,  11.8, True),
+    ("Mistral 7B",         "Ollama",     43.8, 57.7, 49.2, 30.5, 46.1,  13.3, True),
+    ("Llama 3.1 8B",      "Ollama",     39.6, 56.7, 56.1, 29.5, 45.9,  10.1, True),
+    ("Mistral Nemo 12B",   "Ollama",     37.5, 58.4, 51.0, 31.8, 45.5,   5.7, True),
+    ("Qwen 2.5 7B",        "Ollama",     39.6, 56.7, 53.8, 25.8, 44.6,  13.8, True),
+    ("Mistral Small 24B",  "Ollama",     37.5, 51.1, 47.7, 22.6, 40.3,  14.9, True),
+    ("Qwen3 8B",           "Ollama",     35.4, 52.0, 36.9, 21.8, 37.7,  13.7, True),
+]
+# Averages from the table
+AVERAGES = {
+    "All models":  {"L0": 40.0, "L1": 58.0, "L2": 67.3, "L3": 34.2, "Overall": 49.8, "Delta": 5.8},
+    "Cloud avg":   {"L0": 40.0, "L1": 59.7, "L2": 80.5, "L3": 39.4, "Overall": 54.3, "Delta": 0.6},
+    "Local avg":   {"L0": 40.1, "L1": 55.8, "L2": 50.8, "L3": 27.8, "Overall": 44.3, "Delta": 12.3},
+}
+def _build_display_name(model: str, provider: str) -> str:
+    """Build a unique display name like 'Llama 3.1 8B (Groq)'."""
+    return f"{model} ({provider})"
+def build_full_dataframe() -> pd.DataFrame:
+    """Build the full leaderboard DataFrame with all 18 models."""
+    rows = []
+    for model, provider, l0, l1, l2, l3, overall, delta, sgap in CLOUD_MODELS:
+        composed_avg = (l1 + l2 + l3) / 3.0
+        rows.append({
+            "Rank": 0,
+            "Model": _build_display_name(model, provider),
+            "Provider": provider,
+            "Type": "Cloud",
+            "L0": l0,
+            "L1": l1,
+            "L2": l2,
+            "L3": l3,
+            "Overall": overall,
+            "Delta": delta,
+            "Selection Gap": sgap,
+            "Composed Avg": round(composed_avg, 1),
+        })
+    for model, provider, l0, l1, l2, l3, overall, delta, sgap in LOCAL_MODELS:
+        composed_avg = (l1 + l2 + l3) / 3.0
+        rows.append({
+            "Rank": 0,
+            "Model": _build_display_name(model, provider),
+            "Provider": provider,
+            "Type": "Local",
+            "L0": l0,
+            "L1": l1,
+            "L2": l2,
+            "L3": l3,
+            "Overall": overall,
+            "Delta": delta,
+            "Selection Gap": sgap,
+            "Composed Avg": round(composed_avg, 1),
+        })
+    df = pd.DataFrame(rows)
+    df = df.sort_values("Overall", ascending=False).reset_index(drop=True)
+    df["Rank"] = df.index + 1
+    return df
+# ---------------------------------------------------------------------------
+# PLOTLY THEME CONSTANTS
+# ---------------------------------------------------------------------------
+BG_COLOR = "#1a1a2e"
+CARD_BG = "#16213e"
+GRID_COLOR = "#2a2a4a"
+TEXT_COLOR = "#e0e0e0"
+ACCENT_BLUE = "#4fc3f7"
+ACCENT_GREEN = "#66bb6a"
+ACCENT_ORANGE = "#ffa726"
+ACCENT_RED = "#ef5350"
+ACCENT_PURPLE = "#ab47bc"
+LEVEL_COLORS = {
+    "L0": ACCENT_BLUE,
+    "L1": ACCENT_GREEN,
+    "L2": ACCENT_ORANGE,
+    "L3": ACCENT_RED,
+}
+PLOTLY_LAYOUT = dict(
+    paper_bgcolor=BG_COLOR,
+    plot_bgcolor=CARD_BG,
+    font=dict(color=TEXT_COLOR, family="Inter, system-ui, sans-serif"),
+    xaxis=dict(gridcolor=GRID_COLOR, zerolinecolor=GRID_COLOR),
+    yaxis=dict(gridcolor=GRID_COLOR, zerolinecolor=GRID_COLOR),
+    margin=dict(l=60, r=30, t=60, b=80),
+    hoverlabel=dict(bgcolor=CARD_BG, font_color=TEXT_COLOR, bordercolor=GRID_COLOR),
+)
+def _apply_layout(fig: go.Figure, **kwargs) -> go.Figure:
+    """Apply consistent dark theme to a plotly figure."""
+    layout = {**PLOTLY_LAYOUT, **kwargs}
+    fig.update_layout(**layout)
+    return fig
+# ---------------------------------------------------------------------------
+# TAB 1: LEADERBOARD (styled DataFrame)
+# ---------------------------------------------------------------------------
+def format_leaderboard_html(df: pd.DataFrame) -> str:
+    """Build a styled HTML leaderboard table with color-coded scores."""
+    def _score_color(val: float, low: float = 20.0, high: float = 80.0) -> str:
+        """Map a score to a green-yellow-red gradient."""
+        ratio = max(0.0, min(1.0, (val - low) / (high - low)))
+        if ratio > 0.5:
+            # green zone
+            r = int(255 * (1 - (ratio - 0.5) * 2))
+            g = 200
+        else:
+            # red zone
+            r = 240
+            g = int(200 * ratio * 2)
+        return f"rgb({r},{g},80)"
+    def _gap_badge(has_gap: bool) -> str:
+        if has_gap:
+            return '<span style="color:#66bb6a;font-weight:600;">Yes</span>'
+        return '<span style="color:#999;">No</span>'
+    def _type_badge(model_type: str) -> str:
+        if model_type == "Cloud":
+            return '<span style="background:#1e3a5f;color:#4fc3f7;padding:2px 8px;border-radius:4px;font-size:0.8em;">Cloud</span>'
+        return '<span style="background:#2e3a1f;color:#a5d6a7;padding:2px 8px;border-radius:4px;font-size:0.8em;">Local</span>'
+    css = """
+    <style>
+    .lb-table {
+        width: 100%;
+        border-collapse: collapse;
+        font-family: 'Inter', system-ui, sans-serif;
+        font-size: 14px;
+    }
+    .lb-table th {
+        background: #0d1b2a;
+        color: #b0bec5;
+        padding: 12px 10px;
+        text-align: center;
+        font-weight: 600;
+        border-bottom: 2px solid #2a2a4a;
+        cursor: pointer;
+        user-select: none;
+        white-space: nowrap;
+    }
+    .lb-table th:first-child, .lb-table th:nth-child(2) {
+        text-align: left;
+    }
+    .lb-table td {
+        padding: 10px 10px;
+        text-align: center;
+        border-bottom: 1px solid #1a1a3a;
+    }
+    .lb-table td:first-child {
+        font-weight: 700;
+        color: #ffd54f;
+        text-align: center;
+        width: 40px;
+    }
+    .lb-table td:nth-child(2) {
+        text-align: left;
+        font-weight: 500;
+        color: #e0e0e0;
+        max-width: 220px;
+    }
+    .lb-table tr:hover {
+        background: #1e2d4a !important;
+    }
+    .lb-table tr:nth-child(even) {
+        background: #111827;
+    }
+    .lb-table tr:nth-child(odd) {
+        background: #0f1729;
+    }
+    .lb-table .score-cell {
+        font-weight: 600;
+        font-variant-numeric: tabular-nums;
+    }
+    .lb-table .overall-cell {
+        font-weight: 700;
+        font-size: 15px;
+    }
+    .lb-avg-row td {
+        background: #1a1a2e !important;
+        border-top: 2px solid #4fc3f7;
+        font-style: italic;
+        color: #90caf9;
+    }
+    .lb-divider td {
+        background: #1a1a2e !important;
+        border-top: 2px solid #2a2a4a;
+        padding: 2px;
+        height: 4px;
+    }
+    </style>
+    """
+    header = """
+    <table class="lb-table">
+    <thead>
+    <tr>
+        <th>#</th>
+        <th>Model</th>
+        <th>Type</th>
+        <th>L0</th>
+        <th>L1</th>
+        <th>L2</th>
+        <th>L3</th>
+        <th>Overall</th>
+        <th>Selection Gap</th>
+    </tr>
+    </thead>
+    <tbody>
+    """
+    rows_html = ""
+    for _, row in df.iterrows():
+        l0_c = _score_color(row["L0"])
+        l1_c = _score_color(row["L1"])
+        l2_c = _score_color(row["L2"])
+        l3_c = _score_color(row["L3"])
+        ov_c = _score_color(row["Overall"])
+        rows_html += f"""
+        <tr>
+            <td>{row['Rank']}</td>
+            <td>{row['Model']}</td>
+            <td>{_type_badge(row['Type'])}</td>
+            <td class="score-cell" style="color:{l0_c}">{row['L0']:.1f}</td>
+            <td class="score-cell" style="color:{l1_c}">{row['L1']:.1f}</td>
+            <td class="score-cell" style="color:{l2_c}">{row['L2']:.1f}</td>
+            <td class="score-cell" style="color:{l3_c}">{row['L3']:.1f}</td>
+            <td class="overall-cell" style="color:{ov_c}">{row['Overall']:.1f}</td>
+            <td>{_gap_badge(row['Selection Gap'])}</td>
+        </tr>
+        """
+    # Divider
+    rows_html += '<tr class="lb-divider"><td colspan="9"></td></tr>'
+    # Averages
+    for label, avg in AVERAGES.items():
+        l0_c = _score_color(avg["L0"])
+        l1_c = _score_color(avg["L1"])
+        l2_c = _score_color(avg["L2"])
+        l3_c = _score_color(avg["L3"])
+        ov_c = _score_color(avg["Overall"])
+        rows_html += f"""
+        <tr class="lb-avg-row">
+            <td></td>
+            <td><em>{label}</em></td>
+            <td></td>
+            <td class="score-cell" style="color:{l0_c}">{avg['L0']:.1f}</td>
+            <td class="score-cell" style="color:{l1_c}">{avg['L1']:.1f}</td>
+            <td class="score-cell" style="color:{l2_c}">{avg['L2']:.1f}</td>
+            <td class="score-cell" style="color:{l3_c}">{avg['L3']:.1f}</td>
+            <td class="overall-cell" style="color:{ov_c}">{avg['Overall']:.1f}</td>
+            <td></td>
+        </tr>
+        """
+    footer = "</tbody></table>"
+    return css + header + rows_html + footer
+# ---------------------------------------------------------------------------
+# TAB 2: SELECTION GAP VISUALIZATION
+# ---------------------------------------------------------------------------
+def plot_selection_gap(df: pd.DataFrame) -> go.Figure:
+    """Bar chart: L0 vs Composed Average for each model, with gap arrows."""
+    df_sorted = df.sort_values("Overall", ascending=True)
+    fig = go.Figure()
+    # L0 bars
+    fig.add_trace(go.Bar(
+        y=df_sorted["Model"],
+        x=df_sorted["L0"],
+        name="L0 (Single Tool)",
+        orientation="h",
+        marker=dict(color=ACCENT_BLUE, line=dict(width=0)),
+        text=[f"{v:.1f}" for v in df_sorted["L0"]],
+        textposition="inside",
+        textfont=dict(size=11, color="white"),
+        hovertemplate="<b>%{y}</b><br>L0: %{x:.1f}%<extra></extra>",
+    ))
+    # Composed average bars
+    fig.add_trace(go.Bar(
+        y=df_sorted["Model"],
+        x=df_sorted["Composed Avg"],
+        name="Composed Avg (L1-L3)",
+        orientation="h",
+        marker=dict(color=ACCENT_ORANGE, line=dict(width=0)),
+        text=[f"{v:.1f}" for v in df_sorted["Composed Avg"]],
+        textposition="inside",
+        textfont=dict(size=11, color="white"),
+        hovertemplate="<b>%{y}</b><br>Composed Avg: %{x:.1f}%<extra></extra>",
+    ))
+    # Add gap annotations
+    for _, row in df_sorted.iterrows():
+        gap = row["Composed Avg"] - row["L0"]
+        direction = "+" if gap > 0 else ""
+        color = ACCENT_GREEN if gap > 0 else ACCENT_RED
+        x_pos = max(row["L0"], row["Composed Avg"]) + 2
+        fig.add_annotation(
+            x=x_pos,
+            y=row["Model"],
+            text=f"<b>{direction}{gap:.1f}</b>",
+            showarrow=False,
+            font=dict(color=color, size=11),
+            xanchor="left",
+        )
+    fig = _apply_layout(
+        fig,
+        title=dict(text="Selection Gap: L0 (Single Tool) vs Composed Average (L1-L3)", font=dict(size=16)),
+        barmode="group",
+        xaxis=dict(title="Accuracy (%)", range=[0, 100], gridcolor=GRID_COLOR),
+        yaxis=dict(title="", gridcolor=GRID_COLOR, tickfont=dict(size=11)),
+        legend=dict(
+            orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
+            bgcolor="rgba(0,0,0,0)",
+        ),
+        height=700,
+    )
+    return fig
+# ---------------------------------------------------------------------------
+# TAB 3: LEVEL COMPARISON
+# ---------------------------------------------------------------------------
+def plot_level_comparison(df: pd.DataFrame, model_type: str = "All") -> go.Figure:
+    """Grouped bar chart: L0/L1/L2/L3 per model, filterable by type."""
+    if model_type == "Cloud":
+        df_plot = df[df["Type"] == "Cloud"].copy()
+    elif model_type == "Local":
+        df_plot = df[df["Type"] == "Local"].copy()
+    else:
+        df_plot = df.copy()
+    df_plot = df_plot.sort_values("Overall", ascending=True)
+    fig = go.Figure()
+    for level, color in LEVEL_COLORS.items():
+        fig.add_trace(go.Bar(
+            y=df_plot["Model"],
+            x=df_plot[level],
+            name=level,
+            orientation="h",
+            marker=dict(color=color, line=dict(width=0.5, color="#111")),
+            text=[f"{v:.1f}" for v in df_plot[level]],
+            textposition="outside",
+            textfont=dict(size=9),
+            hovertemplate=f"<b>%{{y}}</b><br>{level}: %{{x:.1f}}%<extra></extra>",
+        ))
+    n_models = len(df_plot)
+    fig = _apply_layout(
+        fig,
+        title=dict(
+            text=f"Performance by Composition Level ({model_type} Models)",
+            font=dict(size=16),
+        ),
+        barmode="group",
+        xaxis=dict(title="Accuracy (%)", range=[0, 105], gridcolor=GRID_COLOR),
+        yaxis=dict(title="", gridcolor=GRID_COLOR, tickfont=dict(size=11)),
+        legend=dict(
+            orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
+            bgcolor="rgba(0,0,0,0)",
+        ),
+        height=max(400, n_models * 50 + 150),
+    )
+    return fig
+def plot_level_radar() -> go.Figure:
+    """Radar/spider chart comparing cloud vs local averages."""
+    categories = ["L0", "L1", "L2", "L3"]
+    fig = go.Figure()
+    fig.add_trace(go.Scatterpolar(
+        r=[AVERAGES["Cloud avg"]["L0"], AVERAGES["Cloud avg"]["L1"],
+           AVERAGES["Cloud avg"]["L2"], AVERAGES["Cloud avg"]["L3"],
+           AVERAGES["Cloud avg"]["L0"]],
+        theta=categories + [categories[0]],
+        fill="toself",
+        name="Cloud Avg",
+        line=dict(color=ACCENT_BLUE, width=2),
+        fillcolor="rgba(79, 195, 247, 0.2)",
+    ))
+    fig.add_trace(go.Scatterpolar(
+        r=[AVERAGES["Local avg"]["L0"], AVERAGES["Local avg"]["L1"],
+           AVERAGES["Local avg"]["L2"], AVERAGES["Local avg"]["L3"],
+           AVERAGES["Local avg"]["L0"]],
+        theta=categories + [categories[0]],
+        fill="toself",
+        name="Local Avg",
+        line=dict(color=ACCENT_PURPLE, width=2),
+        fillcolor="rgba(171, 71, 188, 0.2)",
+    ))
+    fig.update_layout(
+        polar=dict(
+            bgcolor=CARD_BG,
+            radialaxis=dict(
+                visible=True, range=[0, 90],
+                gridcolor=GRID_COLOR, linecolor=GRID_COLOR,
+                tickfont=dict(color=TEXT_COLOR, size=10),
+            ),
+            angularaxis=dict(
+                gridcolor=GRID_COLOR, linecolor=GRID_COLOR,
+                tickfont=dict(color=TEXT_COLOR, size=13, family="Inter, system-ui, sans-serif"),
+            ),
+        ),
+        paper_bgcolor=BG_COLOR,
+        font=dict(color=TEXT_COLOR, family="Inter, system-ui, sans-serif"),
+        title=dict(text="Cloud vs Local: Performance Profile", font=dict(size=16, color=TEXT_COLOR)),
+        legend=dict(
+            orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5,
+            bgcolor="rgba(0,0,0,0)",
+        ),
+        height=500,
+        margin=dict(l=80, r=80, t=80, b=80),
+    )
+    return fig
+# ---------------------------------------------------------------------------
+# TAB 4: ABOUT
+# ---------------------------------------------------------------------------
+ABOUT_MD = """
+## CompToolBench: Measuring Compositional Tool-Use in LLMs
+**CompToolBench** is a benchmark that measures *compositional tool-use generalization* in large
+language models. The central question: if an LLM can use tools A, B, and C individually, can it
+compose them into novel pipelines like `A(B(C(x)))`?
+---
+### Composition Levels
+| Level | Topology | Description |
+|:------|:---------|:------------|
+| **L0 (Node)** | Single call | One tool, correct arguments -- the baseline |
+| **L1 (Chain)** | A -> B -> C | Sequential: output of tool_i feeds tool_{i+1} |
+| **L2 (Parallel)** | [A, B] -> C | Independent calls whose results merge downstream |
+| **L3 (DAG)** | Complex graph | Branching, merging, and sequential edges combined |
+---
+### Key Finding: The Selection Gap
+> **17 out of 18 models exhibit a Selection Gap**: their L0 (single-tool) accuracy is *lower*
+> than their average accuracy on composed tasks (L1-L3).
+This is counter-intuitive. Models are *better* at multi-step composition than at simple
+single-tool selection. The explanation: L0 tests pure tool *selection* (choosing the right
+tool from a large catalogue), while L1-L3 tasks provide more structural context that narrows
+the search space. The hardest part of tool use is not execution -- it is *selection*.
+---
+### Benchmark Details
+- **18 models** evaluated (10 cloud API, 8 local via Ollama)
+- **106 deterministic tool simulations** across 15 categories
+- **200 tasks** at 4 composition levels (L0-L3)
+- **Deterministic scoring** with verifiable ground-truth execution traces
+---
+### Links
+| Resource | Link |
+|:---------|:-----|
+| Paper | [ArXiv (coming soon)](#) |
+| Code | [github.com/ronyrahmaan/comptoolbench](https://github.com/ronyrahmaan/comptoolbench) |
+| Author | Md A Rahman, Texas Tech University |
+---
+<p style="text-align:center;color:#666;font-size:0.85em;">
+CompToolBench -- February 2026
+</p>
+"""
+# ---------------------------------------------------------------------------
+# GRADIO APP
+# ---------------------------------------------------------------------------
+def create_app() -> gr.Blocks:
+    """Build the full 4-tab Gradio Blocks application."""
+    df = build_full_dataframe()
+    custom_css = """
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: auto !important;
+    }
+    .main-header {
+        text-align: center;
+        padding: 20px 0 10px 0;
+    }
+    .main-header h1 {
+        font-size: 2em;
+        font-weight: 700;
+        background: linear-gradient(135deg, #4fc3f7, #ab47bc);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        margin-bottom: 8px;
+    }
+    .main-header p {
+        color: #aaa;
+        font-size: 1.1em;
+    }
+    .stat-row {
+        display: flex;
+        justify-content: center;
+        gap: 40px;
+        padding: 15px 0;
+        flex-wrap: wrap;
+    }
+    .stat-item {
+        text-align: center;
+    }
+    .stat-num {
+        font-size: 1.8em;
+        font-weight: 700;
+        color: #4fc3f7;
+    }
+    .stat-label {
+        font-size: 0.85em;
+        color: #888;
+        text-transform: uppercase;
+        letter-spacing: 1px;
+    }
+    footer {visibility: hidden;}
+    """
+    theme = gr.themes.Base(
+        primary_hue=gr.themes.colors.blue,
+        secondary_hue=gr.themes.colors.purple,
+        neutral_hue=gr.themes.colors.gray,
+        font=gr.themes.GoogleFont("Inter"),
+    ).set(
+        body_background_fill="#0f0f1a",
+        body_background_fill_dark="#0f0f1a",
+        block_background_fill="#1a1a2e",
+        block_background_fill_dark="#1a1a2e",
+        block_border_color="#2a2a4a",
+        block_border_color_dark="#2a2a4a",
+        block_label_text_color="#b0bec5",
+        block_label_text_color_dark="#b0bec5",
+        block_title_text_color="#e0e0e0",
+        block_title_text_color_dark="#e0e0e0",
+        body_text_color="#e0e0e0",
+        body_text_color_dark="#e0e0e0",
+        body_text_color_subdued="#888",
+        body_text_color_subdued_dark="#888",
+        background_fill_primary="#16213e",
+        background_fill_primary_dark="#16213e",
+        background_fill_secondary="#1a1a2e",
+        background_fill_secondary_dark="#1a1a2e",
+        border_color_accent="#4fc3f7",
+        border_color_accent_dark="#4fc3f7",
+        color_accent_soft="#1e3a5f",
+        color_accent_soft_dark="#1e3a5f",
+        button_primary_background_fill="#4fc3f7",
+        button_primary_background_fill_dark="#4fc3f7",
+        button_primary_text_color="#0f0f1a",
+        button_primary_text_color_dark="#0f0f1a",
+    )
+    # Gradio 6+ moved theme/css from Blocks() to launch().
+    # Detect version and pass params accordingly.
+    _gradio_major = int(gr.__version__.split(".")[0])
+    _blocks_kwargs: dict = {"title": "CompToolBench"}
+    if _gradio_major < 6:
+        _blocks_kwargs["theme"] = theme
+        _blocks_kwargs["css"] = custom_css
+    with gr.Blocks(**_blocks_kwargs) as app:
+        # ── Header ──
+        gr.HTML("""
+        <div class="main-header">
+            <h1>CompToolBench</h1>
+            <p>Measuring Compositional Tool-Use Generalization in LLMs</p>
+        </div>
+        <div class="stat-row">
+            <div class="stat-item">
+                <div class="stat-num">18</div>
+                <div class="stat-label">Models</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-num">106</div>
+                <div class="stat-label">Tools</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-num">4</div>
+                <div class="stat-label">Composition Levels</div>
+            </div>
+            <div class="stat-item">
+                <div class="stat-num">17/18</div>
+                <div class="stat-label">Show Selection Gap</div>
+            </div>
+        </div>
+        """)
+        # ── Tab 1: Leaderboard ──
+        with gr.Tab("Leaderboard", id="leaderboard"):
+            gr.HTML(format_leaderboard_html(df))
+            gr.Markdown(
+                """
+                **Reading the table:** Scores are accuracy percentages. Colors range from
+                <span style="color:#ef5350">red</span> (low) to
+                <span style="color:#66bb6a">green</span> (high).
+                **Selection Gap** = model's L0 is lower than its average of L1-L3
+                (i.e., models are *better* at composed tasks than single-tool selection).
+                **Delta** in the paper = L0 minus L3 (positive means degradation from single to DAG).
+                """,
+                elem_classes=["block"],
+            )
+        # ── Tab 2: Selection Gap ──
+        with gr.Tab("Selection Gap", id="selection-gap"):
+            gr.Markdown(
+                "### The Selection Gap: Why are models better at *composed* tasks than single-tool calls?"
+            )
+            gr.Plot(plot_selection_gap(df))
+            gr.Markdown(
+                """
+                **How to read this chart:** For each model, the blue bar shows L0 accuracy
+                (single-tool selection) and the orange bar shows the average of L1, L2, L3
+                (composed tasks). The number on the right is the gap.
+                A **positive gap** (green number) means the model performs *better* on composed
+                tasks -- the Selection Gap. This happens because multi-step prompts provide
+                richer structural context that narrows the tool search space.
+                Only **Llama 4 Scout 17B** does not exhibit a Selection Gap, because its L3
+                accuracy collapses to 7.0% (catastrophic DAG failure).
+                """
+            )
+        # ── Tab 3: Level Comparison ──
+        with gr.Tab("Level Comparison", id="level-comparison"):
+            gr.Markdown("### Performance breakdown by composition level")
+            model_filter = gr.Radio(
+                choices=["All", "Cloud", "Local"],
+                value="All",
+                label="Filter by deployment type",
+            )
+            level_chart = gr.Plot(plot_level_comparison(df, "All"))
+            model_filter.change(
+                fn=lambda t: plot_level_comparison(df, t),
+                inputs=[model_filter],
+                outputs=[level_chart],
+            )
+            gr.Markdown("### Cloud vs Local: Aggregate Profile")
+            gr.Plot(plot_level_radar())
+            gr.Markdown(
+                """
+                **Key insight:** Cloud models massively outperform local models on L2
+                (parallel composition): 80.5% vs 50.8%. This 30-point gap is the largest
+                difference between deployment types at any level, suggesting that parallel
+                tool orchestration is where API-served models have the biggest advantage.
+                """
+            )
+        # ── Tab 4: About ──
+        with gr.Tab("About", id="about"):
+            gr.Markdown(ABOUT_MD)
+    # Store launch kwargs for Gradio 6+ theme/css
+    app._ctb_launch_kwargs = {}  # type: ignore[attr-defined]
+    if _gradio_major >= 6:
+        app._ctb_launch_kwargs["theme"] = theme  # type: ignore[attr-defined]
+        app._ctb_launch_kwargs["css"] = custom_css  # type: ignore[attr-defined]
+    return app
+# ---------------------------------------------------------------------------
+# ENTRY POINT
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    app = create_app()
+    launch_kwargs = getattr(app, "_ctb_launch_kwargs", {})
+    app.launch(share=False, **launch_kwargs)