mdarahmanxAI committed on
Commit
4d5e5f0
·
verified ·
1 Parent(s): 7f7f06f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +735 -0
app.py ADDED
@@ -0,0 +1,735 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CompToolBench Gradio Demo — Interactive Benchmark Explorer.
2
+
3
+ Designed for HuggingFace Spaces (free CPU tier).
4
+ Launch locally: python demo/app.py
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+ import plotly.graph_objects as go
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # DATA — extracted verbatim from paper/tables/leaderboard.tex
15
+ # Columns: Model, Provider, L0, L1, L2, L3, Overall, Delta, SelectionGap
16
+ # Delta = L0 - L3 (positive = degradation).
17
+ # SelectionGap (dagger) = L0 < avg(L1,L2,L3).
18
+ # ---------------------------------------------------------------------------
19
+
20
# Leaderboard records.  Each entry is a 9-tuple laid out as
#   (model, provider, L0, L1, L2, L3, overall, delta, selection_gap)
# where the numeric fields are accuracy percentages, delta is L0 - L3 and
# selection_gap is True when L0 is below the mean of L1, L2 and L3.

# Models reached through a hosted API.
CLOUD_MODELS = [
    ("Llama 3.1 8B",      "Groq",       27.1, 75.8, 87.1, 76.0, 66.4, -48.9, True),
    ("Command A",         "Cohere",     45.8, 62.7, 87.8, 40.8, 58.4,   5.1, True),
    ("Mistral Small",     "Mistral",    45.8, 59.7, 87.6, 40.9, 57.5,   4.9, True),
    ("Command R+",        "Cohere",     43.8, 57.5, 88.0, 40.3, 56.2,   3.4, True),
    ("Llama 3.1 8B",      "Cerebras",   31.2, 66.1, 81.2, 46.4, 56.0, -15.1, True),
    ("Mistral Large",     "Mistral",    39.6, 59.5, 87.9, 38.5, 55.4,   1.1, True),
    ("Mistral Medium",    "Mistral",    43.8, 57.5, 87.9, 36.3, 55.2,   7.4, True),
    ("Gemini 2.0 Flash",  "OpenRouter", 39.6, 52.4, 85.7, 39.0, 52.8,   0.6, True),
    ("GPT-OSS 120B",      "Cerebras",   45.8, 56.3, 56.1, 29.0, 47.2,  16.8, True),
    ("Llama 4 Scout 17B", "Groq",       37.5, 49.6, 55.8,  7.0, 37.7,  30.5, False),
]

# Models run locally (all served through Ollama).
LOCAL_MODELS = [
    ("Granite4 3B",       "Ollama", 45.8, 57.3, 56.1, 30.2, 47.8, 15.6, True),
    ("Granite4 1B",       "Ollama", 41.7, 56.3, 55.9, 29.9, 46.4, 11.8, True),
    ("Mistral 7B",        "Ollama", 43.8, 57.7, 49.2, 30.5, 46.1, 13.3, True),
    ("Llama 3.1 8B",      "Ollama", 39.6, 56.7, 56.1, 29.5, 45.9, 10.1, True),
    ("Mistral Nemo 12B",  "Ollama", 37.5, 58.4, 51.0, 31.8, 45.5,  5.7, True),
    ("Qwen 2.5 7B",       "Ollama", 39.6, 56.7, 53.8, 25.8, 44.6, 13.8, True),
    ("Mistral Small 24B", "Ollama", 37.5, 51.1, 47.7, 22.6, 40.3, 14.9, True),
    ("Qwen3 8B",          "Ollama", 35.4, 52.0, 36.9, 21.8, 37.7, 13.7, True),
]

# Aggregate rows as reported in the paper's table (not recomputed here).
AVERAGES = {
    "All models": {
        "L0": 40.0, "L1": 58.0, "L2": 67.3, "L3": 34.2, "Overall": 49.8, "Delta": 5.8,
    },
    "Cloud avg": {
        "L0": 40.0, "L1": 59.7, "L2": 80.5, "L3": 39.4, "Overall": 54.3, "Delta": 0.6,
    },
    "Local avg": {
        "L0": 40.1, "L1": 55.8, "L2": 50.8, "L3": 27.8, "Overall": 44.3, "Delta": 12.3,
    },
}
51
+
52
+
53
+ def _build_display_name(model: str, provider: str) -> str:
54
+ """Build a unique display name like 'Llama 3.1 8B (Groq)'."""
55
+ return f"{model} ({provider})"
56
+
57
+
58
def build_full_dataframe() -> pd.DataFrame:
    """Build the ranked leaderboard DataFrame from the module-level tables.

    Records are taken from ``CLOUD_MODELS`` (tagged ``"Cloud"``) followed by
    ``LOCAL_MODELS`` (tagged ``"Local"``).  For each record a
    ``Composed Avg`` column is derived as the mean of L1, L2 and L3, rounded
    to one decimal place.

    Returns:
        A DataFrame sorted by ``Overall`` in descending order with a fresh
        0-based index and the columns ``Rank``, ``Model``, ``Provider``,
        ``Type``, ``L0``, ``L1``, ``L2``, ``L3``, ``Overall``, ``Delta``,
        ``Selection Gap`` and ``Composed Avg``.  ``Rank`` is 1-based.
    """
    rows = []
    # One loop handles both tables; only the "Type" tag differs between them.
    for model_type, records in (("Cloud", CLOUD_MODELS), ("Local", LOCAL_MODELS)):
        for model, provider, l0, l1, l2, l3, overall, delta, sgap in records:
            rows.append({
                "Rank": 0,  # placeholder, overwritten once the rows are sorted
                "Model": _build_display_name(model, provider),
                "Provider": provider,
                "Type": model_type,
                "L0": l0,
                "L1": l1,
                "L2": l2,
                "L3": l3,
                "Overall": overall,
                "Delta": delta,
                "Selection Gap": sgap,
                "Composed Avg": round((l1 + l2 + l3) / 3.0, 1),
            })

    df = pd.DataFrame(rows)
    # Mergesort is stable, so models tied on Overall keep their table order
    # (cloud before local) and ranks are reproducible across runs/platforms.
    df = df.sort_values("Overall", ascending=False, kind="mergesort")
    df = df.reset_index(drop=True)
    df["Rank"] = df.index + 1
    return df
98
+
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # PLOTLY THEME CONSTANTS
102
+ # ---------------------------------------------------------------------------
103
# Surface colours shared by every chart.
BG_COLOR = "#1a1a2e"      # figure (paper) background
CARD_BG = "#16213e"       # plot-area and hover-label background
GRID_COLOR = "#2a2a4a"    # grid lines, zero lines, hover-label border
TEXT_COLOR = "#e0e0e0"    # default font colour

# Accent palette used for traces and annotations.
ACCENT_BLUE = "#4fc3f7"
ACCENT_GREEN = "#66bb6a"
ACCENT_ORANGE = "#ffa726"
ACCENT_RED = "#ef5350"
ACCENT_PURPLE = "#ab47bc"

# One accent per composition level; insertion order is the legend order.
LEVEL_COLORS = {
    "L0": ACCENT_BLUE,
    "L1": ACCENT_GREEN,
    "L2": ACCENT_ORANGE,
    "L3": ACCENT_RED,
}

# Base keyword arguments for ``Figure.update_layout`` (dark theme).
PLOTLY_LAYOUT = {
    "paper_bgcolor": BG_COLOR,
    "plot_bgcolor": CARD_BG,
    "font": {"color": TEXT_COLOR, "family": "Inter, system-ui, sans-serif"},
    "xaxis": {"gridcolor": GRID_COLOR, "zerolinecolor": GRID_COLOR},
    "yaxis": {"gridcolor": GRID_COLOR, "zerolinecolor": GRID_COLOR},
    "margin": {"l": 60, "r": 30, "t": 60, "b": 80},
    "hoverlabel": {
        "bgcolor": CARD_BG,
        "font_color": TEXT_COLOR,
        "bordercolor": GRID_COLOR,
    },
}
129
+
130
+
131
def _apply_layout(fig: go.Figure, **kwargs) -> go.Figure:
    """Apply the shared dark theme to *fig*, plus per-chart overrides.

    ``PLOTLY_LAYOUT`` supplies the defaults and ``kwargs`` take precedence.
    When both sides provide a dict for the same key (for example ``xaxis``),
    the two dicts are merged key by key rather than the override replacing
    the themed dict wholesale; otherwise a caller that only sets an axis
    title or range would silently lose themed settings such as
    ``zerolinecolor``.

    Args:
        fig: Figure to update in place.
        **kwargs: Layout properties forwarded to ``fig.update_layout``.

    Returns:
        The same ``fig`` object, to allow call chaining.
    """
    layout = dict(PLOTLY_LAYOUT)  # shallow copy: the shared theme is never mutated
    for key, override in kwargs.items():
        themed = layout.get(key)
        if isinstance(themed, dict) and isinstance(override, dict):
            layout[key] = {**themed, **override}
        else:
            layout[key] = override
    fig.update_layout(**layout)
    return fig
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # TAB 1: LEADERBOARD (styled DataFrame)
140
+ # ---------------------------------------------------------------------------
141
def format_leaderboard_html(df: pd.DataFrame) -> str:
    """Render the leaderboard as a styled HTML table with colour-coded scores.

    Args:
        df: Leaderboard frame providing the columns ``Rank``, ``Model``,
            ``Type``, ``L0``, ``L1``, ``L2``, ``L3``, ``Overall`` and
            ``Selection Gap``.  Rows are emitted in the frame's order.

    Returns:
        An HTML fragment consisting of a ``<style>`` block followed by a
        ``<table>`` that holds one row per model, a divider row, and one
        row per entry of the module-level ``AVERAGES`` mapping.
    """
    # Function-scope import so the file's top-level import block is untouched.
    from html import escape

    def _score_color(val: float, low: float = 20.0, high: float = 80.0) -> str:
        """Map a score to a red -> yellow -> green ``rgb()`` colour.

        The score is clamped to ``[low, high]`` before interpolation.
        """
        ratio = max(0.0, min(1.0, (val - low) / (high - low)))
        if ratio > 0.5:
            # Upper half: fade the red channel out towards green.
            r = int(255 * (1 - (ratio - 0.5) * 2))
            g = 200
        else:
            # Lower half: fade the green channel in from red.
            r = 240
            g = int(200 * ratio * 2)
        return f"rgb({r},{g},80)"

    def _gap_badge(has_gap: bool) -> str:
        """Return the Yes/No marker for the Selection Gap column."""
        if has_gap:
            return '<span style="color:#66bb6a;font-weight:600;">Yes</span>'
        return '<span style="color:#999;">No</span>'

    def _type_badge(model_type: str) -> str:
        """Return the Cloud/Local pill; anything other than "Cloud" is Local."""
        if model_type == "Cloud":
            return '<span style="background:#1e3a5f;color:#4fc3f7;padding:2px 8px;border-radius:4px;font-size:0.8em;">Cloud</span>'
        return '<span style="background:#2e3a1f;color:#a5d6a7;padding:2px 8px;border-radius:4px;font-size:0.8em;">Local</span>'

    def _score_cells(scores) -> str:
        """Render the L0-L3 cells and the Overall cell for one row.

        *scores* is any mapping with the keys ``L0``..``L3`` and ``Overall``;
        model rows and average rows share this markup.
        """
        cells = []
        for level in ("L0", "L1", "L2", "L3"):
            value = scores[level]
            cells.append(
                f'<td class="score-cell" style="color:{_score_color(value)}">{value:.1f}</td>'
            )
        overall = scores["Overall"]
        cells.append(
            f'<td class="overall-cell" style="color:{_score_color(overall)}">{overall:.1f}</td>'
        )
        return "\n".join(cells)

    css = """
<style>
.lb-table {
    width: 100%;
    border-collapse: collapse;
    font-family: 'Inter', system-ui, sans-serif;
    font-size: 14px;
}
.lb-table th {
    background: #0d1b2a;
    color: #b0bec5;
    padding: 12px 10px;
    text-align: center;
    font-weight: 600;
    border-bottom: 2px solid #2a2a4a;
    cursor: pointer;
    user-select: none;
    white-space: nowrap;
}
.lb-table th:first-child, .lb-table th:nth-child(2) {
    text-align: left;
}
.lb-table td {
    padding: 10px 10px;
    text-align: center;
    border-bottom: 1px solid #1a1a3a;
}
.lb-table td:first-child {
    font-weight: 700;
    color: #ffd54f;
    text-align: center;
    width: 40px;
}
.lb-table td:nth-child(2) {
    text-align: left;
    font-weight: 500;
    color: #e0e0e0;
    max-width: 220px;
}
.lb-table tr:hover {
    background: #1e2d4a !important;
}
.lb-table tr:nth-child(even) {
    background: #111827;
}
.lb-table tr:nth-child(odd) {
    background: #0f1729;
}
.lb-table .score-cell {
    font-weight: 600;
    font-variant-numeric: tabular-nums;
}
.lb-table .overall-cell {
    font-weight: 700;
    font-size: 15px;
}
.lb-avg-row td {
    background: #1a1a2e !important;
    border-top: 2px solid #4fc3f7;
    font-style: italic;
    color: #90caf9;
}
.lb-divider td {
    background: #1a1a2e !important;
    border-top: 2px solid #2a2a4a;
    padding: 2px;
    height: 4px;
}
</style>
"""

    header = """
<table class="lb-table">
<thead>
<tr>
<th>#</th>
<th>Model</th>
<th>Type</th>
<th>L0</th>
<th>L1</th>
<th>L2</th>
<th>L3</th>
<th>Overall</th>
<th>Selection Gap</th>
</tr>
</thead>
<tbody>
"""

    # Collect fragments and join once instead of growing a string with +=.
    parts = [css, header]

    for rec in df.to_dict("records"):
        parts.append(
            "\n<tr>\n"
            f"<td>{rec['Rank']}</td>\n"
            # Escape free text so a name containing <, > or & cannot break the markup.
            f"<td>{escape(str(rec['Model']))}</td>\n"
            f"<td>{_type_badge(rec['Type'])}</td>\n"
            f"{_score_cells(rec)}\n"
            f"<td>{_gap_badge(rec['Selection Gap'])}</td>\n"
            "</tr>\n"
        )

    # Thin divider between the model rows and the aggregate rows; the
    # colspan matches the nine header columns.
    parts.append('<tr class="lb-divider"><td colspan="9"></td></tr>')

    # Aggregate rows have no rank, type badge or gap marker.
    for label, avg in AVERAGES.items():
        parts.append(
            '\n<tr class="lb-avg-row">\n'
            "<td></td>\n"
            f"<td><em>{escape(str(label))}</em></td>\n"
            "<td></td>\n"
            f"{_score_cells(avg)}\n"
            "<td></td>\n"
            "</tr>\n"
        )

    parts.append("</tbody></table>")
    return "".join(parts)
304
+
305
+
306
+ # ---------------------------------------------------------------------------
307
+ # TAB 2: SELECTION GAP VISUALIZATION
308
+ # ---------------------------------------------------------------------------
309
def plot_selection_gap(df: pd.DataFrame) -> go.Figure:
    """Plot L0 against the composed average (L1-L3) for every model.

    Draws two horizontal bar series per model, grouped side by side, and
    writes the signed difference ``Composed Avg - L0`` to the right of the
    longer bar (green when positive, red otherwise).

    Args:
        df: Leaderboard frame with ``Model``, ``L0``, ``Composed Avg`` and
            ``Overall`` columns.

    Returns:
        The themed plotly figure.
    """
    # Ascending order puts the best model at the top of a horizontal chart.
    ordered = df.sort_values("Overall", ascending=True)
    model_names = ordered["Model"]

    figure = go.Figure()

    # (column, legend name, bar colour); the column name doubles as the
    # label shown in the hover text.
    series = (
        ("L0", "L0 (Single Tool)", ACCENT_BLUE),
        ("Composed Avg", "Composed Avg (L1-L3)", ACCENT_ORANGE),
    )
    for column, legend_name, bar_color in series:
        values = ordered[column]
        figure.add_trace(
            go.Bar(
                y=model_names,
                x=values,
                name=legend_name,
                orientation="h",
                marker=dict(color=bar_color, line=dict(width=0)),
                text=[f"{v:.1f}" for v in values],
                textposition="inside",
                textfont=dict(size=11, color="white"),
                hovertemplate=f"<b>%{{y}}</b><br>{column}: %{{x:.1f}}%<extra></extra>",
            )
        )

    # Signed gap label next to each model's bars.
    for name, single, composed in zip(model_names, ordered["L0"], ordered["Composed Avg"]):
        gap = composed - single
        is_positive = gap > 0
        sign = "+" if is_positive else ""
        figure.add_annotation(
            x=max(single, composed) + 2,
            y=name,
            text=f"<b>{sign}{gap:.1f}</b>",
            showarrow=False,
            font=dict(color=ACCENT_GREEN if is_positive else ACCENT_RED, size=11),
            xanchor="left",
        )

    return _apply_layout(
        figure,
        title=dict(
            text="Selection Gap: L0 (Single Tool) vs Composed Average (L1-L3)",
            font=dict(size=16),
        ),
        barmode="group",
        xaxis=dict(title="Accuracy (%)", range=[0, 100], gridcolor=GRID_COLOR),
        yaxis=dict(title="", gridcolor=GRID_COLOR, tickfont=dict(size=11)),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            bgcolor="rgba(0,0,0,0)",
        ),
        height=700,
    )
369
+
370
+
371
+ # ---------------------------------------------------------------------------
372
+ # TAB 3: LEVEL COMPARISON
373
+ # ---------------------------------------------------------------------------
374
def plot_level_comparison(df: pd.DataFrame, model_type: str = "All") -> go.Figure:
    """Plot L0/L1/L2/L3 accuracy per model as grouped horizontal bars.

    Args:
        df: Leaderboard frame with ``Model``, ``Type``, ``Overall`` and the
            four level columns.
        model_type: ``"Cloud"`` or ``"Local"`` keeps only rows of that type;
            any other value (including the default ``"All"``) keeps every row.

    Returns:
        The themed plotly figure; its height grows with the number of
        models shown.
    """
    if model_type in ("Cloud", "Local"):
        subset = df[df["Type"] == model_type].copy()
    else:
        subset = df.copy()

    # Ascending order puts the best model at the top of a horizontal chart.
    subset = subset.sort_values("Overall", ascending=True)

    figure = go.Figure()
    for level, level_color in LEVEL_COLORS.items():
        scores = subset[level]
        figure.add_trace(
            go.Bar(
                y=subset["Model"],
                x=scores,
                name=level,
                orientation="h",
                marker=dict(color=level_color, line=dict(width=0.5, color="#111")),
                text=[f"{v:.1f}" for v in scores],
                textposition="outside",
                textfont=dict(size=9),
                hovertemplate=f"<b>%{{y}}</b><br>{level}: %{{x:.1f}}%<extra></extra>",
            )
        )

    # 50 px per model plus fixed chrome, never shorter than 400 px.
    chart_height = max(400, len(subset) * 50 + 150)

    return _apply_layout(
        figure,
        title=dict(
            text=f"Performance by Composition Level ({model_type} Models)",
            font=dict(size=16),
        ),
        barmode="group",
        xaxis=dict(title="Accuracy (%)", range=[0, 105], gridcolor=GRID_COLOR),
        yaxis=dict(title="", gridcolor=GRID_COLOR, tickfont=dict(size=11)),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            bgcolor="rgba(0,0,0,0)",
        ),
        height=chart_height,
    )
417
+
418
+
419
def plot_level_radar() -> go.Figure:
    """Draw a radar chart of the cloud and local averages across L0-L3.

    Values are read from the ``"Cloud avg"`` and ``"Local avg"`` entries of
    the module-level ``AVERAGES`` mapping.

    Returns:
        A plotly figure with one filled polar trace per deployment type.
    """
    categories = ["L0", "L1", "L2", "L3"]
    # Repeat the first axis at the end so each polygon is drawn closed.
    closed_axes = categories + [categories[0]]

    # (AVERAGES key, legend name, line colour, translucent fill colour)
    profiles = (
        ("Cloud avg", "Cloud Avg", ACCENT_BLUE, "rgba(79, 195, 247, 0.2)"),
        ("Local avg", "Local Avg", ACCENT_PURPLE, "rgba(171, 71, 188, 0.2)"),
    )

    figure = go.Figure()
    for avg_key, legend_name, line_color, fill_color in profiles:
        averages = AVERAGES[avg_key]
        figure.add_trace(
            go.Scatterpolar(
                r=[averages[level] for level in closed_axes],
                theta=closed_axes,
                fill="toself",
                name=legend_name,
                line=dict(color=line_color, width=2),
                fillcolor=fill_color,
            )
        )

    radial_axis = dict(
        visible=True,
        range=[0, 90],
        gridcolor=GRID_COLOR,
        linecolor=GRID_COLOR,
        tickfont=dict(color=TEXT_COLOR, size=10),
    )
    angular_axis = dict(
        gridcolor=GRID_COLOR,
        linecolor=GRID_COLOR,
        tickfont=dict(color=TEXT_COLOR, size=13, family="Inter, system-ui, sans-serif"),
    )

    figure.update_layout(
        polar=dict(bgcolor=CARD_BG, radialaxis=radial_axis, angularaxis=angular_axis),
        paper_bgcolor=BG_COLOR,
        font=dict(color=TEXT_COLOR, family="Inter, system-ui, sans-serif"),
        title=dict(
            text="Cloud vs Local: Performance Profile",
            font=dict(size=16, color=TEXT_COLOR),
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            bgcolor="rgba(0,0,0,0)",
        ),
        height=500,
        margin=dict(l=80, r=80, t=80, b=80),
    )
    return figure
471
+
472
+
473
+ # ---------------------------------------------------------------------------
474
+ # TAB 4: ABOUT
475
+ # ---------------------------------------------------------------------------
476
+ ABOUT_MD = """
477
+ ## CompToolBench: Measuring Compositional Tool-Use in LLMs
478
+
479
+ **CompToolBench** is a benchmark that measures *compositional tool-use generalization* in large
480
+ language models. The central question: if an LLM can use tools A, B, and C individually, can it
481
+ compose them into novel pipelines like `A(B(C(x)))`?
482
+
483
+ ---
484
+
485
+ ### Composition Levels
486
+
487
+ | Level | Topology | Description |
488
+ |:------|:---------|:------------|
489
+ | **L0 (Node)** | Single call | One tool, correct arguments -- the baseline |
490
+ | **L1 (Chain)** | A -> B -> C | Sequential: output of tool_i feeds tool_{i+1} |
491
+ | **L2 (Parallel)** | [A, B] -> C | Independent calls whose results merge downstream |
492
+ | **L3 (DAG)** | Complex graph | Branching, merging, and sequential edges combined |
493
+
494
+ ---
495
+
496
+ ### Key Finding: The Selection Gap
497
+
498
+ > **17 out of 18 models exhibit a Selection Gap**: their L0 (single-tool) accuracy is *lower*
499
+ > than their average accuracy on composed tasks (L1-L3).
500
+
501
+ This is counter-intuitive. Models are *better* at multi-step composition than at simple
502
+ single-tool selection. The explanation: L0 tests pure tool *selection* (choosing the right
503
+ tool from a large catalogue), while L1-L3 tasks provide more structural context that narrows
504
+ the search space. The hardest part of tool use is not execution -- it is *selection*.
505
+
506
+ ---
507
+
508
+ ### Benchmark Details
509
+
510
+ - **18 models** evaluated (10 cloud API, 8 local via Ollama)
511
+ - **106 deterministic tool simulations** across 15 categories
512
+ - **200 tasks** at 4 composition levels (L0-L3)
513
+ - **Deterministic scoring** with verifiable ground-truth execution traces
514
+
515
+ ---
516
+
517
+ ### Links
518
+
519
+ | Resource | Link |
520
+ |:---------|:-----|
521
+ | Paper | [ArXiv (coming soon)](#) |
522
+ | Code | [github.com/ronyrahmaan/comptoolbench](https://github.com/ronyrahmaan/comptoolbench) |
523
+ | Author | Md A Rahman, Texas Tech University |
524
+
525
+ ---
526
+
527
+ <p style="text-align:center;color:#666;font-size:0.85em;">
528
+ CompToolBench -- February 2026
529
+ </p>
530
+ """
531
+
532
+
533
+ # ---------------------------------------------------------------------------
534
+ # GRADIO APP
535
+ # ---------------------------------------------------------------------------
536
def create_app() -> gr.Blocks:
    """Assemble the four-tab CompToolBench demo as a Gradio ``Blocks`` app.

    Tabs: Leaderboard, Selection Gap, Level Comparison and About.

    The theme and custom CSS are handed to ``gr.Blocks`` when the installed
    Gradio major version is below 6.  For Gradio 6 and later they are stored
    on the returned object as ``_ctb_launch_kwargs`` so the caller can
    forward them to ``launch()``; on older versions that attribute is an
    empty dict.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` instance.
    """
    leaderboard = build_full_dataframe()

    page_css = """
    .gradio-container {
        max-width: 1200px !important;
        margin: auto !important;
    }
    .main-header {
        text-align: center;
        padding: 20px 0 10px 0;
    }
    .main-header h1 {
        font-size: 2em;
        font-weight: 700;
        background: linear-gradient(135deg, #4fc3f7, #ab47bc);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        margin-bottom: 8px;
    }
    .main-header p {
        color: #aaa;
        font-size: 1.1em;
    }
    .stat-row {
        display: flex;
        justify-content: center;
        gap: 40px;
        padding: 15px 0;
        flex-wrap: wrap;
    }
    .stat-item {
        text-align: center;
    }
    .stat-num {
        font-size: 1.8em;
        font-weight: 700;
        color: #4fc3f7;
    }
    .stat-label {
        font-size: 0.85em;
        color: #888;
        text-transform: uppercase;
        letter-spacing: 1px;
    }
    footer {visibility: hidden;}
    """

    # Every override uses the same value for light and dark mode, so the
    # ``*_dark`` twin of each variable is derived rather than listed twice.
    palette = {
        "body_background_fill": "#0f0f1a",
        "block_background_fill": "#1a1a2e",
        "block_border_color": "#2a2a4a",
        "block_label_text_color": "#b0bec5",
        "block_title_text_color": "#e0e0e0",
        "body_text_color": "#e0e0e0",
        "body_text_color_subdued": "#888",
        "background_fill_primary": "#16213e",
        "background_fill_secondary": "#1a1a2e",
        "border_color_accent": "#4fc3f7",
        "color_accent_soft": "#1e3a5f",
        "button_primary_background_fill": "#4fc3f7",
        "button_primary_text_color": "#0f0f1a",
    }
    theme_overrides = {}
    for variable, value in palette.items():
        theme_overrides[variable] = value
        theme_overrides[f"{variable}_dark"] = value

    base_theme = gr.themes.Base(
        primary_hue=gr.themes.colors.blue,
        secondary_hue=gr.themes.colors.purple,
        neutral_hue=gr.themes.colors.gray,
        font=gr.themes.GoogleFont("Inter"),
    )
    theme = base_theme.set(**theme_overrides)

    # Gradio 6+ takes theme/css in launch() instead of Blocks(), so the
    # installed major version decides where they are passed.
    gradio_major = int(gr.__version__.split(".")[0])
    theme_on_blocks = gradio_major < 6
    blocks_kwargs: dict = {"title": "CompToolBench"}
    if theme_on_blocks:
        blocks_kwargs.update(theme=theme, css=page_css)

    def _redraw_levels(selected_type):
        """Rebuild the level-comparison chart for the chosen deployment type."""
        return plot_level_comparison(leaderboard, selected_type)

    with gr.Blocks(**blocks_kwargs) as app:
        # Header banner with headline statistics.
        gr.HTML("""
        <div class="main-header">
            <h1>CompToolBench</h1>
            <p>Measuring Compositional Tool-Use Generalization in LLMs</p>
        </div>
        <div class="stat-row">
            <div class="stat-item">
                <div class="stat-num">18</div>
                <div class="stat-label">Models</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">106</div>
                <div class="stat-label">Tools</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">4</div>
                <div class="stat-label">Composition Levels</div>
            </div>
            <div class="stat-item">
                <div class="stat-num">17/18</div>
                <div class="stat-label">Show Selection Gap</div>
            </div>
        </div>
        """)

        # Tab 1: static HTML leaderboard plus a reading guide.
        with gr.Tab("Leaderboard", id="leaderboard"):
            gr.HTML(format_leaderboard_html(leaderboard))
            gr.Markdown(
                """
                **Reading the table:** Scores are accuracy percentages. Colors range from
                <span style="color:#ef5350">red</span> (low) to
                <span style="color:#66bb6a">green</span> (high).
                **Selection Gap** = model's L0 is lower than its average of L1-L3
                (i.e., models are *better* at composed tasks than single-tool selection).
                **Delta** in the paper = L0 minus L3 (positive means degradation from single to DAG).
                """,
                elem_classes=["block"],
            )

        # Tab 2: L0 vs composed-average chart with explanation.
        with gr.Tab("Selection Gap", id="selection-gap"):
            gr.Markdown(
                "### The Selection Gap: Why are models better at *composed* tasks than single-tool calls?"
            )
            gr.Plot(plot_selection_gap(leaderboard))
            gr.Markdown(
                """
                **How to read this chart:** For each model, the blue bar shows L0 accuracy
                (single-tool selection) and the orange bar shows the average of L1, L2, L3
                (composed tasks). The number on the right is the gap.

                A **positive gap** (green number) means the model performs *better* on composed
                tasks -- the Selection Gap. This happens because multi-step prompts provide
                richer structural context that narrows the tool search space.

                Only **Llama 4 Scout 17B** does not exhibit a Selection Gap, because its L3
                accuracy collapses to 7.0% (catastrophic DAG failure).
                """
            )

        # Tab 3: per-level bars (filterable) and the cloud/local radar.
        with gr.Tab("Level Comparison", id="level-comparison"):
            gr.Markdown("### Performance breakdown by composition level")
            type_filter = gr.Radio(
                choices=["All", "Cloud", "Local"],
                value="All",
                label="Filter by deployment type",
            )
            level_plot = gr.Plot(plot_level_comparison(leaderboard, "All"))
            type_filter.change(
                fn=_redraw_levels,
                inputs=[type_filter],
                outputs=[level_plot],
            )

            gr.Markdown("### Cloud vs Local: Aggregate Profile")
            gr.Plot(plot_level_radar())
            gr.Markdown(
                """
                **Key insight:** Cloud models massively outperform local models on L2
                (parallel composition): 80.5% vs 50.8%. This 30-point gap is the largest
                difference between deployment types at any level, suggesting that parallel
                tool orchestration is where API-served models have the biggest advantage.
                """
            )

        # Tab 4: project description.
        with gr.Tab("About", id="about"):
            gr.Markdown(ABOUT_MD)

    # Stash the launch()-time arguments needed on Gradio 6+; empty otherwise.
    if theme_on_blocks:
        app._ctb_launch_kwargs = {}  # type: ignore[attr-defined]
    else:
        app._ctb_launch_kwargs = {"theme": theme, "css": page_css}  # type: ignore[attr-defined]

    return app
727
+
728
+
729
+ # ---------------------------------------------------------------------------
730
+ # ENTRY POINT
731
+ # ---------------------------------------------------------------------------
732
if __name__ == "__main__":
    # Build the UI, then forward any version-specific launch arguments that
    # create_app() attached to it (absent attribute -> no extra arguments).
    app = create_app()
    app.launch(share=False, **getattr(app, "_ctb_launch_kwargs", {}))