"""CompToolBench Gradio Demo — Interactive Benchmark Explorer. Designed for HuggingFace Spaces (free CPU tier). Launch locally: python demo/app.py """ from __future__ import annotations import gradio as gr import pandas as pd import plotly.graph_objects as go # --------------------------------------------------------------------------- # DATA — extracted verbatim from paper/tables/leaderboard.tex # Columns: Model, Provider, L0, L1, L2, L3, Overall, Delta, SelectionGap # Delta = L0 - L3 (positive = degradation). # SelectionGap (dagger) = L0 < avg(L1,L2,L3). # --------------------------------------------------------------------------- CLOUD_MODELS = [ # (Model, Provider, L0, L1, L2, L3, Overall, Delta, SelectionGap) ("Llama 3.1 8B", "Groq", 27.1, 75.8, 87.1, 76.0, 66.4, -48.9, True), ("Command A", "Cohere", 45.8, 62.7, 87.8, 40.8, 58.4, 5.1, True), ("Mistral Small", "Mistral", 45.8, 59.7, 87.6, 40.9, 57.5, 4.9, True), ("Command R+", "Cohere", 43.8, 57.5, 88.0, 40.3, 56.2, 3.4, True), ("Llama 3.1 8B", "Cerebras", 31.2, 66.1, 81.2, 46.4, 56.0, -15.1, True), ("Mistral Large", "Mistral", 39.6, 59.5, 87.9, 38.5, 55.4, 1.1, True), ("Mistral Medium", "Mistral", 43.8, 57.5, 87.9, 36.3, 55.2, 7.4, True), ("Gemini 2.0 Flash", "OpenRouter", 39.6, 52.4, 85.7, 39.0, 52.8, 0.6, True), ("GPT-OSS 120B", "Cerebras", 45.8, 56.3, 56.1, 29.0, 47.2, 16.8, True), ("Llama 4 Scout 17B", "Groq", 37.5, 49.6, 55.8, 7.0, 37.7, 30.5, False), ] LOCAL_MODELS = [ ("Granite4 3B", "Ollama", 45.8, 57.3, 56.1, 30.2, 47.8, 15.6, True), ("Granite4 1B", "Ollama", 41.7, 56.3, 55.9, 29.9, 46.4, 11.8, True), ("Mistral 7B", "Ollama", 43.8, 57.7, 49.2, 30.5, 46.1, 13.3, True), ("Llama 3.1 8B", "Ollama", 39.6, 56.7, 56.1, 29.5, 45.9, 10.1, True), ("Mistral Nemo 12B", "Ollama", 37.5, 58.4, 51.0, 31.8, 45.5, 5.7, True), ("Qwen 2.5 7B", "Ollama", 39.6, 56.7, 53.8, 25.8, 44.6, 13.8, True), ("Mistral Small 24B", "Ollama", 37.5, 51.1, 47.7, 22.6, 40.3, 14.9, True), ("Qwen3 8B", "Ollama", 35.4, 52.0, 36.9, 21.8, 37.7, 13.7, True), ] # Averages from the table AVERAGES = { "All models": {"L0": 40.0, "L1": 58.0, "L2": 67.3, "L3": 34.2, "Overall": 49.8, "Delta": 5.8}, "Cloud avg": {"L0": 40.0, "L1": 59.7, "L2": 80.5, "L3": 39.4, "Overall": 54.3, "Delta": 0.6}, "Local avg": {"L0": 40.1, "L1": 55.8, "L2": 50.8, "L3": 27.8, "Overall": 44.3, "Delta": 12.3}, } def _build_display_name(model: str, provider: str) -> str: """Build a unique display name like 'Llama 3.1 8B (Groq)'.""" return f"{model} ({provider})" def build_full_dataframe() -> pd.DataFrame: """Build the full leaderboard DataFrame with all 18 models.""" rows = [] for model, provider, l0, l1, l2, l3, overall, delta, sgap in CLOUD_MODELS: composed_avg = (l1 + l2 + l3) / 3.0 rows.append({ "Rank": 0, "Model": _build_display_name(model, provider), "Provider": provider, "Type": "Cloud", "L0": l0, "L1": l1, "L2": l2, "L3": l3, "Overall": overall, "Delta": delta, "Selection Gap": sgap, "Composed Avg": round(composed_avg, 1), }) for model, provider, l0, l1, l2, l3, overall, delta, sgap in LOCAL_MODELS: composed_avg = (l1 + l2 + l3) / 3.0 rows.append({ "Rank": 0, "Model": _build_display_name(model, provider), "Provider": provider, "Type": "Local", "L0": l0, "L1": l1, "L2": l2, "L3": l3, "Overall": overall, "Delta": delta, "Selection Gap": sgap, "Composed Avg": round(composed_avg, 1), }) df = pd.DataFrame(rows) df = df.sort_values("Overall", ascending=False).reset_index(drop=True) df["Rank"] = df.index + 1 return df # --------------------------------------------------------------------------- # PLOTLY 
# ---------------------------------------------------------------------------
# PLOTLY THEME CONSTANTS
# ---------------------------------------------------------------------------
BG_COLOR = "#1a1a2e"
CARD_BG = "#16213e"
GRID_COLOR = "#2a2a4a"
TEXT_COLOR = "#e0e0e0"
ACCENT_BLUE = "#4fc3f7"
ACCENT_GREEN = "#66bb6a"
ACCENT_ORANGE = "#ffa726"
ACCENT_RED = "#ef5350"
ACCENT_PURPLE = "#ab47bc"

LEVEL_COLORS = {
    "L0": ACCENT_BLUE,
    "L1": ACCENT_GREEN,
    "L2": ACCENT_ORANGE,
    "L3": ACCENT_RED,
}

PLOTLY_LAYOUT = dict(
    paper_bgcolor=BG_COLOR,
    plot_bgcolor=CARD_BG,
    font=dict(color=TEXT_COLOR, family="Inter, system-ui, sans-serif"),
    xaxis=dict(gridcolor=GRID_COLOR, zerolinecolor=GRID_COLOR),
    yaxis=dict(gridcolor=GRID_COLOR, zerolinecolor=GRID_COLOR),
    margin=dict(l=60, r=30, t=60, b=80),
    hoverlabel=dict(bgcolor=CARD_BG, font_color=TEXT_COLOR, bordercolor=GRID_COLOR),
)


def _apply_layout(fig: go.Figure, **kwargs) -> go.Figure:
    """Apply the consistent dark theme to a plotly figure."""
    layout = {**PLOTLY_LAYOUT, **kwargs}
    fig.update_layout(**layout)
    return fig


# ---------------------------------------------------------------------------
# TAB 1: LEADERBOARD (styled HTML table)
# ---------------------------------------------------------------------------
def format_leaderboard_html(df: pd.DataFrame) -> str:
    """Build a styled HTML leaderboard table with color-coded scores."""

    def _score_color(val: float, low: float = 20.0, high: float = 80.0) -> str:
        """Map a score to a red -> yellow -> green gradient."""
        ratio = max(0.0, min(1.0, (val - low) / (high - low)))
        if ratio > 0.5:  # green zone
            r = int(255 * (1 - (ratio - 0.5) * 2))
            g = 200
        else:  # red zone
            r = 240
            g = int(200 * ratio * 2)
        return f"rgb({r},{g},80)"

    def _gap_badge(has_gap: bool) -> str:
        if has_gap:
            return '<span class="badge badge-yes">Yes</span>'
        return '<span class="badge badge-no">No</span>'

    def _type_badge(model_type: str) -> str:
        if model_type == "Cloud":
            return '<span class="badge badge-cloud">Cloud</span>'
        return '<span class="badge badge-local">Local</span>'

    # Minimal dark-theme stylesheet for the table; class names are demo-local.
    css = """
    <style>
    .lb-table { width: 100%; border-collapse: collapse; font-size: 0.9em; }
    .lb-table th { text-align: left; padding: 8px 10px; color: #b0bec5;
                   border-bottom: 2px solid #2a2a4a; }
    .lb-table td { padding: 6px 10px; border-bottom: 1px solid #2a2a4a; }
    .lb-table .avg-row td { font-style: italic; color: #b0bec5; }
    .badge { padding: 2px 8px; border-radius: 10px; font-size: 0.8em; }
    .badge-yes { background: #1e3a5f; color: #4fc3f7; }
    .badge-no { background: #3a1e1e; color: #ef5350; }
    .badge-cloud { background: #1e3a5f; color: #4fc3f7; }
    .badge-local { background: #2e1e3a; color: #ab47bc; }
    </style>
    """

    header = """
    <table class="lb-table">
      <thead>
        <tr>
          <th>#</th><th>Model</th><th>Type</th>
          <th>L0</th><th>L1</th><th>L2</th><th>L3</th>
          <th>Overall</th><th>Selection Gap</th>
        </tr>
      </thead>
      <tbody>
    """

    rows_html = ""
    for _, row in df.iterrows():
        l0_c = _score_color(row["L0"])
        l1_c = _score_color(row["L1"])
        l2_c = _score_color(row["L2"])
        l3_c = _score_color(row["L3"])
        ov_c = _score_color(row["Overall"])
        rows_html += f"""
        <tr>
          <td>{row['Rank']}</td>
          <td>{row['Model']}</td>
          <td>{_type_badge(row['Type'])}</td>
          <td style="color:{l0_c}">{row['L0']:.1f}</td>
          <td style="color:{l1_c}">{row['L1']:.1f}</td>
          <td style="color:{l2_c}">{row['L2']:.1f}</td>
          <td style="color:{l3_c}">{row['L3']:.1f}</td>
          <td style="color:{ov_c}"><b>{row['Overall']:.1f}</b></td>
          <td>{_gap_badge(row['Selection Gap'])}</td>
        </tr>
        """

    # Divider between per-model rows and the averages
    rows_html += '<tr><td colspan="9"></td></tr>'

    # Averages
    for label, avg in AVERAGES.items():
        l0_c = _score_color(avg["L0"])
        l1_c = _score_color(avg["L1"])
        l2_c = _score_color(avg["L2"])
        l3_c = _score_color(avg["L3"])
        ov_c = _score_color(avg["Overall"])
        rows_html += f"""
        <tr class="avg-row">
          <td></td>
          <td colspan="2">{label}</td>
          <td style="color:{l0_c}">{avg['L0']:.1f}</td>
          <td style="color:{l1_c}">{avg['L1']:.1f}</td>
          <td style="color:{l2_c}">{avg['L2']:.1f}</td>
          <td style="color:{l3_c}">{avg['L3']:.1f}</td>
          <td style="color:{ov_c}">{avg['Overall']:.1f}</td>
          <td></td>
        </tr>
        """
" return css + header + rows_html + footer # --------------------------------------------------------------------------- # TAB 2: SELECTION GAP VISUALIZATION # --------------------------------------------------------------------------- def plot_selection_gap(df: pd.DataFrame) -> go.Figure: """Bar chart: L0 vs Composed Average for each model, with gap arrows.""" df_sorted = df.sort_values("Overall", ascending=True) fig = go.Figure() # L0 bars fig.add_trace(go.Bar( y=df_sorted["Model"], x=df_sorted["L0"], name="L0 (Single Tool)", orientation="h", marker=dict(color=ACCENT_BLUE, line=dict(width=0)), text=[f"{v:.1f}" for v in df_sorted["L0"]], textposition="inside", textfont=dict(size=11, color="white"), hovertemplate="%{y}
L0: %{x:.1f}%", )) # Composed average bars fig.add_trace(go.Bar( y=df_sorted["Model"], x=df_sorted["Composed Avg"], name="Composed Avg (L1-L3)", orientation="h", marker=dict(color=ACCENT_ORANGE, line=dict(width=0)), text=[f"{v:.1f}" for v in df_sorted["Composed Avg"]], textposition="inside", textfont=dict(size=11, color="white"), hovertemplate="%{y}
    # Gap annotations to the right of each bar pair
    for _, row in df_sorted.iterrows():
        gap = row["Composed Avg"] - row["L0"]
        direction = "+" if gap > 0 else ""
        color = ACCENT_GREEN if gap > 0 else ACCENT_RED
        x_pos = max(row["L0"], row["Composed Avg"]) + 2
        fig.add_annotation(
            x=x_pos,
            y=row["Model"],
            text=f"{direction}{gap:.1f}",
            showarrow=False,
            font=dict(color=color, size=11),
            xanchor="left",
        )

    fig = _apply_layout(
        fig,
        title=dict(
            text="Selection Gap: L0 (Single Tool) vs Composed Average (L1-L3)",
            font=dict(size=16),
        ),
        barmode="group",
        xaxis=dict(title="Accuracy (%)", range=[0, 100], gridcolor=GRID_COLOR),
        yaxis=dict(title="", gridcolor=GRID_COLOR, tickfont=dict(size=11)),
        legend=dict(
            orientation="h", yanchor="bottom", y=1.02,
            xanchor="right", x=1, bgcolor="rgba(0,0,0,0)",
        ),
        height=700,
    )
    return fig


# ---------------------------------------------------------------------------
# TAB 3: LEVEL COMPARISON
# ---------------------------------------------------------------------------
def plot_level_comparison(df: pd.DataFrame, model_type: str = "All") -> go.Figure:
    """Grouped bar chart: L0/L1/L2/L3 per model, filterable by type."""
    if model_type == "Cloud":
        df_plot = df[df["Type"] == "Cloud"].copy()
    elif model_type == "Local":
        df_plot = df[df["Type"] == "Local"].copy()
    else:
        df_plot = df.copy()
    df_plot = df_plot.sort_values("Overall", ascending=True)

    fig = go.Figure()
    for level, color in LEVEL_COLORS.items():
        fig.add_trace(go.Bar(
            y=df_plot["Model"],
            x=df_plot[level],
            name=level,
            orientation="h",
            marker=dict(color=color, line=dict(width=0.5, color="#111")),
            text=[f"{v:.1f}" for v in df_plot[level]],
            textposition="outside",
            textfont=dict(size=9),
            hovertemplate=f"%{{y}}<br>{level}: %{{x:.1f}}%",
        ))
{level}: %{{x:.1f}}%", )) n_models = len(df_plot) fig = _apply_layout( fig, title=dict( text=f"Performance by Composition Level ({model_type} Models)", font=dict(size=16), ), barmode="group", xaxis=dict(title="Accuracy (%)", range=[0, 105], gridcolor=GRID_COLOR), yaxis=dict(title="", gridcolor=GRID_COLOR, tickfont=dict(size=11)), legend=dict( orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, bgcolor="rgba(0,0,0,0)", ), height=max(400, n_models * 50 + 150), ) return fig def plot_level_radar() -> go.Figure: """Radar/spider chart comparing cloud vs local averages.""" categories = ["L0", "L1", "L2", "L3"] fig = go.Figure() fig.add_trace(go.Scatterpolar( r=[AVERAGES["Cloud avg"]["L0"], AVERAGES["Cloud avg"]["L1"], AVERAGES["Cloud avg"]["L2"], AVERAGES["Cloud avg"]["L3"], AVERAGES["Cloud avg"]["L0"]], theta=categories + [categories[0]], fill="toself", name="Cloud Avg", line=dict(color=ACCENT_BLUE, width=2), fillcolor="rgba(79, 195, 247, 0.2)", )) fig.add_trace(go.Scatterpolar( r=[AVERAGES["Local avg"]["L0"], AVERAGES["Local avg"]["L1"], AVERAGES["Local avg"]["L2"], AVERAGES["Local avg"]["L3"], AVERAGES["Local avg"]["L0"]], theta=categories + [categories[0]], fill="toself", name="Local Avg", line=dict(color=ACCENT_PURPLE, width=2), fillcolor="rgba(171, 71, 188, 0.2)", )) fig.update_layout( polar=dict( bgcolor=CARD_BG, radialaxis=dict( visible=True, range=[0, 90], gridcolor=GRID_COLOR, linecolor=GRID_COLOR, tickfont=dict(color=TEXT_COLOR, size=10), ), angularaxis=dict( gridcolor=GRID_COLOR, linecolor=GRID_COLOR, tickfont=dict(color=TEXT_COLOR, size=13, family="Inter, system-ui, sans-serif"), ), ), paper_bgcolor=BG_COLOR, font=dict(color=TEXT_COLOR, family="Inter, system-ui, sans-serif"), title=dict(text="Cloud vs Local: Performance Profile", font=dict(size=16, color=TEXT_COLOR)), legend=dict( orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5, bgcolor="rgba(0,0,0,0)", ), height=500, margin=dict(l=80, r=80, t=80, b=80), ) return fig # --------------------------------------------------------------------------- # TAB 4: ABOUT # --------------------------------------------------------------------------- ABOUT_MD = """ ## CompToolBench: Measuring Compositional Tool-Use in LLMs **CompToolBench** is a benchmark that measures *compositional tool-use generalization* in large language models. The central question: if an LLM can use tools A, B, and C individually, can it compose them into novel pipelines like `A(B(C(x)))`? --- ### Composition Levels | Level | Topology | Description | |:------|:---------|:------------| | **L0 (Node)** | Single call | One tool, correct arguments -- the baseline | | **L1 (Chain)** | A -> B -> C | Sequential: output of tool_i feeds tool_{i+1} | | **L2 (Parallel)** | [A, B] -> C | Independent calls whose results merge downstream | | **L3 (DAG)** | Complex graph | Branching, merging, and sequential edges combined | --- ### Key Finding: The Selection Gap > **17 out of 18 models exhibit a Selection Gap**: their L0 (single-tool) accuracy is *lower* > than their average accuracy on composed tasks (L1-L3). This is counter-intuitive. Models are *better* at multi-step composition than at simple single-tool selection. The explanation: L0 tests pure tool *selection* (choosing the right tool from a large catalogue), while L1-L3 tasks provide more structural context that narrows the search space. The hardest part of tool use is not execution -- it is *selection*. 
---

### Benchmark Details

- **18 models** evaluated (10 cloud API, 8 local via Ollama)
- **106 deterministic tool simulations** across 15 categories
- **200 tasks** at 4 composition levels (L0-L3)
- **Deterministic scoring** with verifiable ground-truth execution traces

---

### Links

| Resource | Link |
|:---------|:-----|
| Paper | [ArXiv (coming soon)](#) |
| Code | [github.com/ronyrahmaan/comptoolbench](https://github.com/ronyrahmaan/comptoolbench) |
| Author | Md A Rahman, Texas Tech University |

---

<div align="center">

CompToolBench -- February 2026

</div>
""" # --------------------------------------------------------------------------- # GRADIO APP # --------------------------------------------------------------------------- def create_app() -> gr.Blocks: """Build the full 4-tab Gradio Blocks application.""" df = build_full_dataframe() custom_css = """ .gradio-container { max-width: 1200px !important; margin: auto !important; } .main-header { text-align: center; padding: 20px 0 10px 0; } .main-header h1 { font-size: 2em; font-weight: 700; background: linear-gradient(135deg, #4fc3f7, #ab47bc); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin-bottom: 8px; } .main-header p { color: #aaa; font-size: 1.1em; } .stat-row { display: flex; justify-content: center; gap: 40px; padding: 15px 0; flex-wrap: wrap; } .stat-item { text-align: center; } .stat-num { font-size: 1.8em; font-weight: 700; color: #4fc3f7; } .stat-label { font-size: 0.85em; color: #888; text-transform: uppercase; letter-spacing: 1px; } footer {visibility: hidden;} """ theme = gr.themes.Base( primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.purple, neutral_hue=gr.themes.colors.gray, font=gr.themes.GoogleFont("Inter"), ).set( body_background_fill="#0f0f1a", body_background_fill_dark="#0f0f1a", block_background_fill="#1a1a2e", block_background_fill_dark="#1a1a2e", block_border_color="#2a2a4a", block_border_color_dark="#2a2a4a", block_label_text_color="#b0bec5", block_label_text_color_dark="#b0bec5", block_title_text_color="#e0e0e0", block_title_text_color_dark="#e0e0e0", body_text_color="#e0e0e0", body_text_color_dark="#e0e0e0", body_text_color_subdued="#888", body_text_color_subdued_dark="#888", background_fill_primary="#16213e", background_fill_primary_dark="#16213e", background_fill_secondary="#1a1a2e", background_fill_secondary_dark="#1a1a2e", border_color_accent="#4fc3f7", border_color_accent_dark="#4fc3f7", color_accent_soft="#1e3a5f", color_accent_soft_dark="#1e3a5f", button_primary_background_fill="#4fc3f7", button_primary_background_fill_dark="#4fc3f7", button_primary_text_color="#0f0f1a", button_primary_text_color_dark="#0f0f1a", ) # Gradio 6+ moved theme/css from Blocks() to launch(). # Detect version and pass params accordingly. _gradio_major = int(gr.__version__.split(".")[0]) _blocks_kwargs: dict = {"title": "CompToolBench"} if _gradio_major < 6: _blocks_kwargs["theme"] = theme _blocks_kwargs["css"] = custom_css with gr.Blocks(**_blocks_kwargs) as app: # ── Header ── gr.HTML("""

        <div class="main-header">
            <h1>CompToolBench</h1>
            <p>Measuring Compositional Tool-Use Generalization in LLMs</p>
        </div>
        <div class="stat-row">
            <div class="stat-item">
                <div class="stat-num">18</div>
                <div class="stat-label">Models</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">106</div>
                <div class="stat-label">Tools</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">4</div>
                <div class="stat-label">Composition Levels</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">17/18</div>
                <div class="stat-label">Show Selection Gap</div>
            </div>
        </div>
""") # ── Tab 1: Leaderboard ── with gr.Tab("Leaderboard", id="leaderboard"): gr.HTML(format_leaderboard_html(df)) gr.Markdown( """ **Reading the table:** Scores are accuracy percentages. Colors range from red (low) to green (high). **Selection Gap** = model's L0 is lower than its average of L1-L3 (i.e., models are *better* at composed tasks than single-tool selection). **Delta** in the paper = L0 minus L3 (positive means degradation from single to DAG). """, elem_classes=["block"], ) # ── Tab 2: Selection Gap ── with gr.Tab("Selection Gap", id="selection-gap"): gr.Markdown( "### The Selection Gap: Why are models better at *composed* tasks than single-tool calls?" ) gr.Plot(plot_selection_gap(df)) gr.Markdown( """ **How to read this chart:** For each model, the blue bar shows L0 accuracy (single-tool selection) and the orange bar shows the average of L1, L2, L3 (composed tasks). The number on the right is the gap. A **positive gap** (green number) means the model performs *better* on composed tasks -- the Selection Gap. This happens because multi-step prompts provide richer structural context that narrows the tool search space. Only **Llama 4 Scout 17B** does not exhibit a Selection Gap, because its L3 accuracy collapses to 7.0% (catastrophic DAG failure). """ ) # ── Tab 3: Level Comparison ── with gr.Tab("Level Comparison", id="level-comparison"): gr.Markdown("### Performance breakdown by composition level") model_filter = gr.Radio( choices=["All", "Cloud", "Local"], value="All", label="Filter by deployment type", ) level_chart = gr.Plot(plot_level_comparison(df, "All")) model_filter.change( fn=lambda t: plot_level_comparison(df, t), inputs=[model_filter], outputs=[level_chart], ) gr.Markdown("### Cloud vs Local: Aggregate Profile") gr.Plot(plot_level_radar()) gr.Markdown( """ **Key insight:** Cloud models massively outperform local models on L2 (parallel composition): 80.5% vs 50.8%. This 30-point gap is the largest difference between deployment types at any level, suggesting that parallel tool orchestration is where API-served models have the biggest advantage. """ ) # ── Tab 4: About ── with gr.Tab("About", id="about"): gr.Markdown(ABOUT_MD) # Store launch kwargs for Gradio 6+ theme/css app._ctb_launch_kwargs = {} # type: ignore[attr-defined] if _gradio_major >= 6: app._ctb_launch_kwargs["theme"] = theme # type: ignore[attr-defined] app._ctb_launch_kwargs["css"] = custom_css # type: ignore[attr-defined] return app # --------------------------------------------------------------------------- # ENTRY POINT # --------------------------------------------------------------------------- if __name__ == "__main__": app = create_app() launch_kwargs = getattr(app, "_ctb_launch_kwargs", {}) app.launch(share=False, **launch_kwargs)