OpenCaptchaWorld committed on
Commit a042349 · 1 Parent(s): f312abf

Add application file

Files changed (1)
  1. app.py +1190 -0
app.py ADDED
@@ -0,0 +1,1190 @@
+ # leaderboard/app.py
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import gradio as gr
+ import pathlib
+ import json
+ import csv
+
+ CATEGORY_MAP = {
+     "Overall": ["Overall Pass Rate"],
+     # You can define sets, e.g. "Vision-hard": ["Squiggle", "Shadow_Plausible"]
+ }
+
+ def get_results_path():
+     """Get the path to results.csv, resolving relative to this file's location."""
+     this_file = pathlib.Path(__file__).resolve()
+     results_path = this_file.parent / "results.csv"
+     return results_path
+
+ def get_runs_path():
+     """Get the path to the runs directory, resolving relative to this file's location."""
+     this_file = pathlib.Path(__file__).resolve()
+     runs_path = this_file.parent / "runs"
+     runs_path.mkdir(parents=True, exist_ok=True)
+     return runs_path
+
+ def infer_type(row):
+     """Infer model type (Proprietary/Open source) from Provider or Model name."""
+     provider = str(row.get("Provider", "")).lower()
+     model = str(row.get("Model", "")).lower()
+
+     # Open source indicators
+     open_source_keywords = [
+         "llama", "mistral", "qwen", "phi", "gemma", "falcon", "mpt",
+         "vicuna", "alpaca", "wizard", "openchat", "neural-chat",
+         "browser-use", "browseruse", "open source", "opensource"
+     ]
+
+     # Check if any open source keyword appears
+     for keyword in open_source_keywords:
+         if keyword in provider or keyword in model:
+             return "Open source"
+
+     # Default to Proprietary if not found
+     return "Proprietary"
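+
+ # Example (illustrative): infer_type is applied row-wise via df.apply(..., axis=1)
+ # and accepts anything with a dict-like .get:
+ #   infer_type({"Provider": "Meta", "Model": "Llama-3-70B"})  # "Open source"
+ #   infer_type({"Provider": "OpenAI", "Model": "gpt-4o"})     # "Proprietary"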
+
+ def load_df(path=None):
+     """Load the results CSV, creating empty dataframe if file doesn't exist."""
+     if path is None:
+         path = get_results_path()
+
+     metadata_cols = ["Model", "Provider", "Agent Framework", "Type"]
+     metric_cols = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
+     expected_cols = metadata_cols + metric_cols
+
+     if not pathlib.Path(path).exists():
+         # Return empty dataframe with expected columns
+         return pd.DataFrame(columns=expected_cols)
+
+     try:
+         df = pd.read_csv(path)
+         # Handle empty CSV (only headers)
+         if len(df) == 0:
+             return pd.DataFrame(columns=expected_cols)
+
+         # Ensure required columns exist
+         if "Agent Framework" not in df.columns:
+             # Try legacy "Notes" column
+             if "Notes" in df.columns:
+                 df["Agent Framework"] = df["Notes"]
+             else:
+                 df["Agent Framework"] = ""
+
+         # Handle legacy "Overall" column
+         if "Overall" in df.columns and "Overall Pass Rate" not in df.columns:
+             df["Overall Pass Rate"] = df["Overall"]
+
+         # Add Type column if missing, infer from Provider/Model
+         if "Type" not in df.columns:
+             df["Type"] = df.apply(infer_type, axis=1)
+
+         # Convert numeric columns
+         numeric_cols = metric_cols + [c for c in df.columns if c not in metadata_cols + metric_cols]
+         for c in numeric_cols:
+             if c in df.columns:
+                 df[c] = pd.to_numeric(df[c], errors="coerce")
+
+         return df
+     except Exception as e:
+         print(f"Error loading results.csv: {e}")
+         return pd.DataFrame(columns=expected_cols)
+
+ def compute_score(df, category):
+     # Get columns to compute score from
+     # Map "Overall" category to "Overall Pass Rate" column
+     if category == "Overall":
+         # Use CATEGORY_MAP which maps "Overall" to ["Overall Pass Rate"]
+         cols = CATEGORY_MAP.get("Overall", ["Overall Pass Rate"])
+     elif category in CATEGORY_MAP:
+         # Use predefined category mapping
+         cols = CATEGORY_MAP[category]
+     elif category in df.columns:
+         # Category is a direct column name
+         cols = [category]
+     else:
+         # Fallback: use "Overall Pass Rate" if it exists, otherwise all numeric columns
+         if "Overall Pass Rate" in df.columns:
+             cols = ["Overall Pass Rate"]
+         else:
+             numeric_cols = [c for c in df.columns if c not in ["Model", "Provider", "Agent Framework", "Type"]]
+             cols = numeric_cols if numeric_cols else []
+
+     # Filter to only existing columns
+     cols = [c for c in cols if c in df.columns]
+
+     # If no valid columns found, use all numeric columns except metadata/metrics
+     if not cols:
+         exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Avg Duration (s)", "Avg Cost ($)"]
+         numeric_cols = [c for c in df.columns if c not in exclude_cols]
+         cols = numeric_cols if numeric_cols else []
+         # If still no columns, create a zero score
+         if not cols:
+             df = df.copy()
+             df["Category Pass Rate"] = 0.0
+             return df
+
+     df = df.copy()
+     if cols:
+         df["Category Pass Rate"] = df[cols].mean(axis=1, skipna=True)
+     else:
+         df["Category Pass Rate"] = 0.0
+     return df
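+
+ # Example (illustrative): for a df with columns ["Model", "Overall Pass Rate", "Dice_Count"],
+ # compute_score(df, "Overall") copies "Overall Pass Rate" into a new "Category Pass Rate"
+ # column, while compute_score(df, "Dice_Count") averages just that per-type column.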
+
+ def table_html(df):
+     if len(df) == 0:
+         return """
+         <style>
+         .leaderboard-container {
+             background: #ffffff;
+             border-radius: 8px;
+             box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+             overflow: hidden;
+             margin: 20px 0;
+         }
+         table.lb {
+             width: 100%;
+             border-collapse: collapse;
+             font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
+             font-size: 14px;
+         }
+         table.lb thead {
+             background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
+             color: white;
+         }
+         table.lb th {
+             padding: 16px 20px;
+             text-align: left;
+             font-weight: 600;
+             font-size: 13px;
+             text-transform: uppercase;
+             letter-spacing: 0.5px;
+         }
+         table.lb td {
+             padding: 16px 20px;
+             border-bottom: 1px solid #e5e7eb;
+             color: #374151;
+         }
+         table.lb tbody tr {
+             transition: background-color 0.2s ease;
+         }
+         table.lb tbody tr:hover {
+             background: #f9fafb;
+         }
+         table.lb tbody tr:last-child td {
+             border-bottom: none;
+         }
+         .rank-badge {
+             display: inline-block;
+             width: 32px;
+             height: 32px;
+             line-height: 32px;
+             text-align: center;
+             border-radius: 50%;
+             font-weight: 700;
+             font-size: 14px;
+         }
+         .rank-1 { background: linear-gradient(135deg, #ffd700 0%, #ffed4e 100%); color: #000; box-shadow: 0 2px 8px rgba(255, 215, 0, 0.4); }
+         .rank-2 { background: linear-gradient(135deg, #c0c0c0 0%, #e8e8e8 100%); color: #000; box-shadow: 0 2px 8px rgba(192, 192, 192, 0.4); }
+         .rank-3 { background: linear-gradient(135deg, #cd7f32 0%, #e6a55d 100%); color: #fff; box-shadow: 0 2px 8px rgba(205, 127, 50, 0.4); }
+         .rank-other { background: #f1f5f9; color: #64748b; }
+         .pass-rate-cell {
+             font-weight: 600;
+             font-size: 15px;
+         }
+         .metric-cell {
+             font-weight: 500;
+             font-size: 14px;
+             color: #6b7280;
+         }
+         </style>
+         <div class="leaderboard-container">
+             <table class="lb">
+                 <thead><tr><th>#</th><th>Model</th><th>Provider</th><th>Type</th><th>Agent Framework</th><th>Pass Rate</th><th>Avg Duration (s)</th><th>Avg Cost ($)</th></tr></thead>
+                 <tbody><tr><td colspan="8" style="text-align:center;padding:40px;color:#9ca3af;font-size:16px;">No results yet. Run evaluations to populate the leaderboard.</td></tr></tbody>
+             </table>
+         </div>
+         """
+     rows = []
+     for i, r in df.iterrows():
+         rank = i + 1
+         rank_class = "rank-1" if rank == 1 else "rank-2" if rank == 2 else "rank-3" if rank == 3 else "rank-other"
+         pass_rate = r['Category Pass Rate']
+         pass_rate_color = "#10b981" if pass_rate >= 0.7 else "#f59e0b" if pass_rate >= 0.4 else "#ef4444"
+
+         # Format duration and cost
+         duration = r.get('Avg Duration (s)', None)
+         duration_str = f"{duration:.2f}" if pd.notna(duration) and duration is not None else "N/A"
+
+         cost = r.get('Avg Cost ($)', None)
+         cost_str = f"${cost:.4f}" if pd.notna(cost) and cost is not None else "N/A"
+
+         type_val = r.get('Type', 'Proprietary')
+         type_color = "#10b981" if type_val == "Open source" else "#6366f1"
+
+         rows.append(f"""
+             <tr>
+                 <td><span class="rank-badge {rank_class}">{rank}</span></td>
+                 <td><strong style="color: #111827;">{r['Model']}</strong></td>
+                 <td style="color: #6b7280;">{r.get('Provider','')}</td>
+                 <td><span style="color: {type_color}; font-weight: 600;">{type_val}</span></td>
+                 <td style="color: #6b7280;">{r.get('Agent Framework','')}</td>
+                 <td class="pass-rate-cell" style="color: {pass_rate_color};">{pass_rate:.3f}</td>
+                 <td class="metric-cell">{duration_str}</td>
+                 <td class="metric-cell">{cost_str}</td>
+             </tr>""")
+     return f"""
+     <style>
+     .leaderboard-container {{
+         background: #ffffff;
+         border-radius: 8px;
+         box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+         overflow: hidden;
+         margin: 20px 0;
+     }}
+     table.lb {{
+         width: 100%;
+         border-collapse: collapse;
+         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
+         font-size: 14px;
+     }}
+     table.lb thead {{
+         background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
+         color: white;
+     }}
+     table.lb th {{
+         padding: 16px 20px;
+         text-align: left;
+         font-weight: 600;
+         font-size: 13px;
+         text-transform: uppercase;
+         letter-spacing: 0.5px;
+     }}
+     table.lb td {{
+         padding: 16px 20px;
+         border-bottom: 1px solid #e5e7eb;
+         color: #374151;
+     }}
+     table.lb tbody tr {{
+         transition: background-color 0.2s ease;
+     }}
+     table.lb tbody tr:hover {{
+         background: #f9fafb;
+     }}
+     table.lb tbody tr:last-child td {{
+         border-bottom: none;
+     }}
+     .rank-badge {{
+         display: inline-block;
+         width: 32px;
+         height: 32px;
+         line-height: 32px;
+         text-align: center;
+         border-radius: 50%;
+         font-weight: 700;
+         font-size: 14px;
+     }}
+     .rank-1 {{ background: linear-gradient(135deg, #ffd700 0%, #ffed4e 100%); color: #000; box-shadow: 0 2px 8px rgba(255, 215, 0, 0.4); }}
+     .rank-2 {{ background: linear-gradient(135deg, #c0c0c0 0%, #e8e8e8 100%); color: #000; box-shadow: 0 2px 8px rgba(192, 192, 192, 0.4); }}
+     .rank-3 {{ background: linear-gradient(135deg, #cd7f32 0%, #e6a55d 100%); color: #fff; box-shadow: 0 2px 8px rgba(205, 127, 50, 0.4); }}
+     .rank-other {{ background: #f1f5f9; color: #64748b; }}
+     .pass-rate-cell {{
+         font-weight: 600;
+         font-size: 15px;
+     }}
+     .metric-cell {{
+         font-weight: 500;
+         font-size: 14px;
+         color: #6b7280;
+     }}
+     </style>
+     <div class="leaderboard-container">
+         <table class="lb">
+             <thead><tr><th>#</th><th>Model</th><th>Provider</th><th>Type</th><th>Agent Framework</th><th>Pass Rate</th><th>Avg Duration (s)</th><th>Avg Cost ($)</th></tr></thead>
+             <tbody>{''.join(rows)}</tbody>
+         </table>
+     </div>
+     """
+
+ def perf_bar(df):
+     plt.close("all")
+     if len(df) == 0:
+         fig, ax = plt.subplots(figsize=(10, 4), facecolor='white', dpi=150)
+         ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
+         ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
+         fig.tight_layout(); return fig
+     d = df.sort_values("Category Pass Rate", ascending=True)
+     fig, ax = plt.subplots(figsize=(10, max(4, 0.5*len(d))), facecolor='white', dpi=150)
+
+     # Create gradient colors based on pass rate - CAPTCHA themed
+     colors = []
+     for pass_rate in d["Category Pass Rate"]:
+         if pass_rate >= 0.7:
+             colors.append('#10b981')  # verification green
+         elif pass_rate >= 0.4:
+             colors.append('#f59e0b')  # warning amber
+         else:
+             colors.append('#ef4444')  # error red
+
+     bars = ax.barh(range(len(d)), d["Category Pass Rate"], color=colors, alpha=0.8, edgecolor='white', linewidth=1.5)
+
+     # Add value labels on bars
+     for i, (bar, pass_rate) in enumerate(zip(bars, d["Category Pass Rate"])):
+         width = bar.get_width()
+         ax.text(width + 0.01, bar.get_y() + bar.get_height()/2,
+                 f'{pass_rate:.3f}', ha='left', va='center', fontsize=11, fontweight='600')
+
+     ax.set_yticks(range(len(d)))
+     ax.set_yticklabels(d["Model"], fontsize=12)
+     ax.set_xlabel("Pass Rate", fontsize=12, fontweight='600', color='#374151')
+     ax.set_xlim(0, 1.1)
+     ax.set_title("Performance Comparison", fontsize=16, fontweight='700', color='#111827', pad=20)
+     ax.spines['top'].set_visible(False)
+     ax.spines['right'].set_visible(False)
+     ax.spines['left'].set_color('#e5e7eb')
+     ax.spines['bottom'].set_color('#e5e7eb')
+     ax.grid(axis='x', alpha=0.3, linestyle='--')
+     ax.set_facecolor('#fafafa')
+     fig.tight_layout()
+     return fig
+
+ def perf_by_type(df_full, model_filter="Models Avg"):
+     """
+     Show average performance by puzzle type.
+
+     Args:
+         df_full: Full dataframe with all models
+         model_filter: "Models Avg" for average across all models, or a specific model name
+     """
+     plt.close("all")
+
+     # Filter by model if specified
+     if model_filter and model_filter != "Models Avg":
+         df_filtered = df_full[df_full["Model"] == model_filter].copy()
+         if len(df_filtered) == 0:
+             fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
+             ax.text(0.5, 0.5, f"No data available for model: {model_filter}", ha="center", va="center", fontsize=14, color="gray")
+             ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
+             fig.tight_layout(); return fig
+         df_plot = df_filtered
+         plot_title = f"Performance by Type - {model_filter}"
+     else:
+         df_plot = df_full
+         plot_title = "Average Performance by CAPTCHA Type (All Models)"
+
+     if len(df_plot) == 0:
+         fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
+         ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
+         ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
+         fig.tight_layout(); return fig
+
+     # Average each per-type column across models (exclude metadata and metric columns)
+     exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)", "Category Pass Rate"]
+     numeric_cols = [c for c in df_plot.columns if c not in exclude_cols]
+     type_cols = [c for c in numeric_cols if df_plot[c].notna().any() and df_plot[c].dtype in ['float64', 'int64', 'float32', 'int32']]
+
+     if len(type_cols) == 0:
+         fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
+         ax.text(0.5, 0.5, "No per-type data available", ha="center", va="center", fontsize=14, color="gray")
+         ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
+         fig.tight_layout(); return fig
+
+     # Calculate means, handling NaN values properly
+     if model_filter == "Models Avg":
+         # Average across all models
+         means = df_plot[type_cols].mean(numeric_only=True)
+     else:
+         # For a single model, just get its values (should be one row)
+         if len(df_plot) == 1:
+             means = df_plot[type_cols].iloc[0]
+         else:
+             # If multiple rows (shouldn't happen), average them
+             means = df_plot[type_cols].mean(numeric_only=True)
+
+     # Filter out any NaN means
+     means = means.dropna()
+
+     if len(means) == 0:
+         fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
+         ax.text(0.5, 0.5, "No valid per-type data available", ha="center", va="center", fontsize=14, color="gray")
+         ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
+         fig.tight_layout(); return fig
+
+     fig, ax = plt.subplots(figsize=(max(12, len(means) * 0.8), 6), facecolor='white', dpi=150)
+
+     # Create gradient colors based on performance - CAPTCHA themed
+     colors = []
+     for val in means.values:
+         if pd.isna(val):
+             colors.append('#94a3b8')  # slate gray for NaN
+         elif val >= 0.7:
+             colors.append('#10b981')  # verification green
+         elif val >= 0.4:
+             colors.append('#f59e0b')  # warning amber
+         else:
+             colors.append('#ef4444')  # error red
+
+     bars = ax.bar(range(len(means)), means.values, color=colors, alpha=0.8, edgecolor='white', linewidth=1.5)
+
+     # Add value labels on bars
+     for bar, val in zip(bars, means.values):
+         if not pd.isna(val):
+             height = bar.get_height()
+             ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
+                     f'{val:.2f}', ha='center', va='bottom', fontsize=10, fontweight='600')
+
+     ax.set_xticks(range(len(means)))
+     ax.set_xticklabels(means.index, rotation=45, ha="right", fontsize=10)
+     ax.set_ylim(0, max(1.1, means.max() * 1.1) if not means.empty else 1.1)
+     ax.set_ylabel("Average Pass Rate", fontsize=12, fontweight='600', color='#374151')
+     ax.set_title(plot_title, fontsize=16, fontweight='700', color='#111827', pad=20)
+     ax.spines['top'].set_visible(False)
+     ax.spines['right'].set_visible(False)
+     ax.spines['left'].set_color('#e5e7eb')
+     ax.spines['bottom'].set_color('#e5e7eb')
+     ax.grid(axis='y', alpha=0.3, linestyle='--')
+     ax.set_facecolor('#fafafa')
+     fig.tight_layout()
+     return fig
+
+ def cost_effectiveness_plot(df):
+     """
+     Create a cost-effectiveness scatter plot: Performance (X) vs Cost (Y).
+     Color-coded by Type (Proprietary vs Open source).
+     """
+     plt.close("all")
+     if len(df) == 0:
+         fig, ax = plt.subplots(figsize=(10, 6), facecolor='white', dpi=150)
+         ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
+         ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
+         fig.tight_layout(); return fig
+
+     # Filter to rows with valid performance and cost data
+     df_plot = df.copy()
+     df_plot = df_plot[df_plot['Category Pass Rate'].notna() & df_plot['Avg Cost ($)'].notna()]
+
+     if len(df_plot) == 0:
+         fig, ax = plt.subplots(figsize=(10, 6), facecolor='white', dpi=150)
+         ax.text(0.5, 0.5, "No data with both performance and cost metrics", ha="center", va="center", fontsize=14, color="gray")
+         ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
+         fig.tight_layout(); return fig
+
+     # Create figure with higher DPI for better resolution
+     fig, ax = plt.subplots(figsize=(14, 9), facecolor='white', dpi=150)
+
+     # Separate by type
+     proprietary = df_plot[df_plot.get('Type', 'Proprietary') == 'Proprietary']
+     open_source = df_plot[df_plot.get('Type', 'Proprietary') == 'Open source']
+
+     # Plot points
+     if len(proprietary) > 0:
+         ax.scatter(proprietary['Category Pass Rate'], proprietary['Avg Cost ($)'],
+                    c='#6366f1', s=200, alpha=0.75, edgecolors='white', linewidth=2.5,
+                    label='Proprietary', zorder=3)
+         # Add labels for proprietary models
+         for idx, row in proprietary.iterrows():
+             ax.annotate(row['Model'],
+                         (row['Category Pass Rate'], row['Avg Cost ($)']),
+                         fontsize=10, alpha=0.85, ha='left', va='bottom',
+                         bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7, edgecolor='none'))
+
+     if len(open_source) > 0:
+         ax.scatter(open_source['Category Pass Rate'], open_source['Avg Cost ($)'],
+                    c='#10b981', s=200, alpha=0.75, edgecolors='white', linewidth=2.5,
+                    label='Open source', zorder=3)
+         # Add labels for open source models
+         for idx, row in open_source.iterrows():
+             ax.annotate(row['Model'],
+                         (row['Category Pass Rate'], row['Avg Cost ($)']),
+                         fontsize=10, alpha=0.85, ha='left', va='bottom',
+                         bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7, edgecolor='none'))
+
+     # Calculate thresholds for quadrants (median or fixed thresholds)
+     perf_threshold = df_plot['Category Pass Rate'].median() if len(df_plot) > 1 else 0.4
+     cost_threshold = df_plot['Avg Cost ($)'].median() if len(df_plot) > 1 else 0.01
+
+     # Add quadrant lines
+     ax.axvline(x=perf_threshold, color='gray', linestyle='--', linewidth=1.5, alpha=0.5, zorder=1)
+     ax.axhline(y=cost_threshold, color='gray', linestyle='--', linewidth=1.5, alpha=0.5, zorder=1)
+
+     # Add quadrant annotations
+     x_range = df_plot['Category Pass Rate'].max() - df_plot['Category Pass Rate'].min()
+     y_range = df_plot['Avg Cost ($)'].max() - df_plot['Avg Cost ($)'].min()
+
+     # Top-left: Low Performance, High Cost
+     ax.text(df_plot['Category Pass Rate'].min() + x_range * 0.05,
+             df_plot['Avg Cost ($)'].max() - y_range * 0.05,
+             '▲ Low Performance\nHigh Cost',
+             fontsize=12, color='#ef4444', weight='bold',
+             ha='left', va='top', alpha=0.8,
+             bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8, edgecolor='#ef4444', linewidth=1.5))
+
+     # Bottom-right: High Performance, Low Cost
+     ax.text(df_plot['Category Pass Rate'].max() - x_range * 0.05,
+             df_plot['Avg Cost ($)'].min() + y_range * 0.05,
+             '▼ High Performance\nLow Cost',
+             fontsize=12, color='#10b981', weight='bold',
+             ha='right', va='bottom', alpha=0.8,
+             bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8, edgecolor='#10b981', linewidth=1.5))
+
+     # Styling
+     ax.set_xlabel("Performance (Pass Rate)", fontsize=14, fontweight='600', color='#374151')
+     ax.set_ylabel("Avg Cost ($)", fontsize=14, fontweight='600', color='#374151')
+     ax.set_title("Cost-Effectiveness Analysis", fontsize=17, fontweight='700', color='#111827', pad=25)
+
+     # Add padding to axes (more padding on right for legend space)
+     x_pad = x_range * 0.15 if x_range > 0 else 0.1
+     y_pad = y_range * 0.15 if y_range > 0 else 0.001
+     ax.set_xlim(df_plot['Category Pass Rate'].min() - x_pad * 0.5, df_plot['Category Pass Rate'].max() + x_pad)
+     ax.set_ylim(max(0, df_plot['Avg Cost ($)'].min() - y_pad * 0.5), df_plot['Avg Cost ($)'].max() + y_pad)
+
+     ax.spines['top'].set_visible(False)
+     ax.spines['right'].set_visible(False)
+     ax.spines['left'].set_color('#e5e7eb')
+     ax.spines['bottom'].set_color('#e5e7eb')
+     ax.grid(alpha=0.3, linestyle='--', zorder=0, linewidth=1)
+     ax.set_facecolor('#fafafa')
+
+     # Add legend - position it outside the plot area to avoid covering data
+     # Use bbox_to_anchor to place it outside the plot
+     ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1), frameon=True,
+               fancybox=True, shadow=True, fontsize=12, framealpha=0.95,
+               edgecolor='#e5e7eb', facecolor='white')
+
+     # Adjust layout to make room for legend
+     fig.tight_layout(rect=[0, 0, 0.95, 1])
+     return fig
+
+ def convert_benchmark_results_json(file_path, model_name=None, provider=None, agent_framework=None):
+     """
+     Convert benchmark_results.json format (per-puzzle results) to aggregated format.
+
+     Args:
+         file_path: Path to benchmark_results.json file (Path object or string)
+         model_name: Model name (if None, will try to infer from filename or use "Unknown")
+         provider: Provider name (if None, will try to infer from model_name)
+         agent_framework: Agent framework name (if None, will use "browser-use" as default)
+
+     Returns:
+         dict: Aggregated record with Model, Provider, Agent Framework, Type, metrics, and per-type pass rates
+     """
+     # Convert to Path object if needed
+     file_path = pathlib.Path(file_path) if not isinstance(file_path, pathlib.Path) else file_path
+
+     # Read the file - it's a JSONL file (one JSON object per line)
+     puzzle_results = []
+     with open(file_path, 'r') as f:
+         for line in f:
+             line = line.strip()
+             if line:
+                 try:
+                     puzzle_results.append(json.loads(line))
+                 except json.JSONDecodeError:
+                     continue
+
+     if not puzzle_results:
+         raise ValueError("No valid puzzle results found in file")
+
+     # Try to extract model/provider from puzzle results first (if they were included)
+     extracted_model = None
+     extracted_provider = None
+     extracted_agent_framework = None
+
+     for result in puzzle_results[:10]:  # Check first 10 results
+         if 'model' in result and result['model']:
+             extracted_model = result['model']
+         if 'provider' in result and result['provider']:
+             extracted_provider = result['provider']
+         if 'agent_framework' in result and result['agent_framework']:
+             extracted_agent_framework = result['agent_framework']
+         # Also check camelCase variants
+         if 'agentFramework' in result and result['agentFramework']:
+             extracted_agent_framework = result['agentFramework']
+
+     # Use extracted values if available, otherwise use provided parameters
+     if model_name is None:
+         model_name = extracted_model
+
+     if provider is None:
+         provider = extracted_provider
+
+     if agent_framework is None:
+         agent_framework = extracted_agent_framework
+
+     # Infer model/provider if still not available
+     if model_name is None:
+         # Try to infer from filename (e.g., "gpt-4_results.json" -> "gpt-4")
+         filename = file_path.stem.lower()
+         if 'benchmark_results' in filename:
+             model_name = "Unknown Model"
+         else:
+             # Try to extract model name from filename
+             model_name = filename.replace('_results', '').replace('_benchmark', '').replace('-', ' ').title()
+
+     if provider is None:
+         # Try to infer provider from model name
+         model_lower = model_name.lower()
+         if any(x in model_lower for x in ['gpt', 'openai']):
+             provider = "OpenAI"
+         elif any(x in model_lower for x in ['claude', 'anthropic']):
+             provider = "Anthropic"
+         elif any(x in model_lower for x in ['gemini', 'google']):
+             provider = "Google"
+         elif any(x in model_lower for x in ['llama', 'mistral', 'qwen', 'phi', 'gemma']):
+             provider = "Open Source"
+         else:
+             provider = "Unknown"
+
+     if agent_framework is None:
+         agent_framework = "browser-use"  # Default assumption
+
+     # Aggregate results
+     # Group by puzzle_type
+     puzzle_type_stats = {}
+     total_correct = 0
+     total_attempts = len(puzzle_results)
+     total_duration = 0.0
+     total_cost = 0.0
+     cost_count = 0
+
+     for result in puzzle_results:
+         puzzle_type = result.get('puzzle_type', 'Unknown')
+
+         # Initialize puzzle type stats if needed
+         if puzzle_type not in puzzle_type_stats:
+             puzzle_type_stats[puzzle_type] = {'correct': 0, 'total': 0}
+
+         puzzle_type_stats[puzzle_type]['total'] += 1
+         if result.get('correct', False):
+             puzzle_type_stats[puzzle_type]['correct'] += 1
+             total_correct += 1
+
+         # Aggregate duration
+         elapsed_time = result.get('elapsed_time')
+         if elapsed_time is not None:
+             try:
+                 total_duration += float(elapsed_time)
+             except (ValueError, TypeError):
+                 pass
+
+         # Aggregate cost
+         cost = result.get('cost')
+         if cost is not None:
+             try:
+                 total_cost += float(cost)
+                 cost_count += 1
+             except (ValueError, TypeError):
+                 pass
+
+     # Calculate overall pass rate
+     overall_pass_rate = total_correct / total_attempts if total_attempts > 0 else 0.0
+
+     # Calculate average duration
+     avg_duration = total_duration / total_attempts if total_attempts > 0 else None
+
+     # Calculate average cost
+     avg_cost = total_cost / cost_count if cost_count > 0 else None
+
+     # Build aggregated record
+     record = {
+         "Model": model_name,
+         "Provider": provider,
+         "Agent Framework": agent_framework,
+         "Overall Pass Rate": overall_pass_rate,
+         "Avg Duration (s)": avg_duration,
+         "Avg Cost ($)": avg_cost,
+     }
+
+     # Add per-type pass rates
+     for puzzle_type, stats in puzzle_type_stats.items():
+         pass_rate = stats['correct'] / stats['total'] if stats['total'] > 0 else 0.0
+         record[puzzle_type] = pass_rate
+
+     # Infer Type
+     record["Type"] = infer_type(record)
+
+     return record
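+
+ # Example (illustrative): a JSONL file with the two lines
+ #   {"puzzle_type": "Dice_Count", "puzzle_id": "d1.png", "correct": true, "elapsed_time": "12.5", "cost": 0.002, "model": "gpt-4o", "provider": "OpenAI"}
+ #   {"puzzle_type": "Mirror", "puzzle_id": "m1.png", "correct": false, "elapsed_time": "8.0", "cost": 0.001, "model": "gpt-4o", "provider": "OpenAI"}
+ # aggregates to a record with "Overall Pass Rate" 0.5, "Avg Duration (s)" 10.25,
+ # "Avg Cost ($)" 0.0015, and per-type rates "Dice_Count" 1.0 and "Mirror" 0.0.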
+
+ def is_benchmark_results_format(data):
+     """
+     Check if the data is in benchmark_results.json format (per-puzzle results).
+
+     Args:
+         data: List of dictionaries or single dictionary
+
+     Returns:
+         bool: True if data appears to be in benchmark_results format
+     """
+     if isinstance(data, dict):
+         data = [data]
+
+     if not isinstance(data, list) or len(data) == 0:
+         return False
+
+     # Check if first record has benchmark_results.json structure
+     first = data[0]
+     required_fields = ['puzzle_type', 'puzzle_id', 'correct']
+     has_required = all(field in first for field in required_fields)
+
+     # Check if it's NOT the aggregated format (which would have Model, Provider, etc.)
+     aggregated_fields = ['Model', 'Provider', 'Overall Pass Rate']
+     is_not_aggregated = not any(field in first for field in aggregated_fields)
+
+     return has_required and is_not_aggregated
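+
+ # Example (illustrative):
+ #   is_benchmark_results_format([{"puzzle_type": "Mirror", "puzzle_id": "m1.png", "correct": True}])    # True
+ #   is_benchmark_results_format([{"Model": "gpt-4o", "Provider": "OpenAI", "Overall Pass Rate": 0.6}])  # False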
+
+ def process_uploaded_file(file, model_name=None, provider=None, agent_framework=None):
+     """
+     Process an uploaded CSV or JSON file and merge with existing results.
+
+     Args:
+         file: File path string (from Gradio File component with type="filepath")
+         model_name: Optional model name (for benchmark_results.json conversion)
+         provider: Optional provider name (for benchmark_results.json conversion)
+         agent_framework: Optional agent framework name (for benchmark_results.json conversion)
+
+     Returns:
+         tuple: (success_message, error_message)
+     """
+     if file is None:
+         return None, "No file uploaded"
+
+     try:
+         # Gradio returns a file path string when type="filepath"
+         file_path = pathlib.Path(file) if isinstance(file, str) else pathlib.Path(file.name)
+
+         # Read the file based on extension
+         if file_path.suffix.lower() == '.json':
+             # Try reading as JSONL first (benchmark_results.json format)
+             try:
+                 # Read first few lines to detect format
+                 with open(file_path, 'r') as f:
+                     first_lines = [f.readline().strip() for _ in range(5)]
+                     f.seek(0)
+
+                 # Try to parse as JSONL (one JSON object per line)
+                 puzzle_results = []
+                 for line in first_lines:
+                     if line:
+                         try:
+                             puzzle_results.append(json.loads(line))
+                         except json.JSONDecodeError:
+                             break
+
+                 # Check if it's benchmark_results format
+                 if puzzle_results and is_benchmark_results_format(puzzle_results):
+                     # Read entire file as JSONL
+                     puzzle_results = []
+                     with open(file_path, 'r') as f:
+                         for line in f:
+                             line = line.strip()
+                             if line:
+                                 try:
+                                     puzzle_results.append(json.loads(line))
+                                 except json.JSONDecodeError:
+                                     continue
+
+                     # Convert to aggregated format
+                     record = convert_benchmark_results_json(
+                         file_path,
+                         model_name=model_name,
+                         provider=provider,
+                         agent_framework=agent_framework
+                     )
+                     records = [record]
+                 else:
+                     # Try reading as regular JSON
+                     with open(file_path, 'r') as f:
+                         data = json.load(f)
+
+                     # Normalize to list of records
+                     if isinstance(data, dict):
+                         records = [data]
+                     elif isinstance(data, list):
+                         records = data
+                     else:
+                         return None, f"Invalid JSON format: expected object or array, got {type(data).__name__}"
+
+                     # Check if it's benchmark_results format
+                     if is_benchmark_results_format(records):
+                         # Convert to aggregated format
+                         record = convert_benchmark_results_json(
+                             file_path,
+                             model_name=model_name,
+                             provider=provider,
+                             agent_framework=agent_framework
+                         )
+                         records = [record]
+             except Exception as e:
+                 # Fallback: try reading as regular JSON
+                 try:
+                     with open(file_path, 'r') as f:
+                         data = json.load(f)
+
+                     # Normalize to list of records
+                     if isinstance(data, dict):
+                         records = [data]
+                     elif isinstance(data, list):
+                         records = data
+                     else:
+                         return None, f"Invalid JSON format: expected object or array, got {type(data).__name__}"
+
+                     # Check if it's benchmark_results format
+                     if is_benchmark_results_format(records):
+                         # Convert to aggregated format
+                         record = convert_benchmark_results_json(
+                             file_path,
+                             model_name=model_name,
+                             provider=provider,
+                             agent_framework=agent_framework
+                         )
+                         records = [record]
+                 except Exception as json_err:
+                     return None, f"Error reading JSON file: {str(json_err)}"
+
+             # Handle legacy column names
+             legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
+             for record in records:
+                 for old_key, new_key in legacy_map.items():
+                     if old_key in record and new_key not in record:
+                         record[new_key] = record.pop(old_key)
+
+                 # Infer Type if not present
+                 if "Type" not in record:
+                     record["Type"] = infer_type(record)
+
+             # Save individual JSON files to runs directory for aggregation
+             runs_path = get_runs_path()
+             import time
+             for record in records:
+                 run_file = runs_path / f"run_{int(time.time() * 1000)}.json"
+                 with open(run_file, 'w') as f:
+                     json.dump(record, f, indent=2)
+
+             num_records = len(records)
+
+         elif file_path.suffix.lower() == '.csv':
+             # Handle CSV file
+             df_uploaded = pd.read_csv(file_path)
+
+             # Handle legacy column names
+             if "Notes" in df_uploaded.columns and "Agent Framework" not in df_uploaded.columns:
+                 df_uploaded["Agent Framework"] = df_uploaded["Notes"]
+             if "Overall" in df_uploaded.columns and "Overall Pass Rate" not in df_uploaded.columns:
+                 df_uploaded["Overall Pass Rate"] = df_uploaded["Overall"]
+
+             # Add Type column if missing
+             if "Type" not in df_uploaded.columns:
+                 df_uploaded["Type"] = df_uploaded.apply(infer_type, axis=1)
+
+             # Convert to records and save as JSON files (for consistency with aggregation script)
+             records = df_uploaded.to_dict('records')
+             runs_path = get_runs_path()
+             import time
+             for record in records:
+                 run_file = runs_path / f"run_{int(time.time() * 1000)}.json"
+                 with open(run_file, 'w') as f:
+                     json.dump(record, f, indent=2)
+
+             num_records = len(records)
+
+         else:
+             return None, f"Unsupported file type: {file_path.suffix}. Please upload a .csv or .json file."
+
+         # Aggregate runs into results.csv
+         aggregate_runs_to_csv()
+
+         return f"✅ Successfully uploaded {num_records} record(s). Leaderboard updated!", None
+
+     except json.JSONDecodeError as e:
+         return None, f"Invalid JSON file: {str(e)}"
+     except pd.errors.EmptyDataError:
+         return None, "CSV file is empty"
+     except Exception as e:
+         return None, f"Error processing file: {str(e)}"
+
+ def aggregate_runs_to_csv():
+     """
+     Aggregate all JSON files in runs/ directory into results.csv.
+     This consolidates all uploaded evaluation results into a single CSV file.
+     """
+     runs_path = get_runs_path()
+     results_path = get_results_path()
+
+     # Gather all JSON files
+     records = []
+     for path in runs_path.glob("*.json"):
+         try:
+             records.append(json.loads(path.read_text()))
+         except Exception as e:
+             print(f"Warning: Skipping invalid JSON file {path}: {e}")
+
+     if not records:
+         # Create empty CSV with headers
+         fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
+         fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
+         with results_path.open("w", newline="") as f:
+             w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
+             w.writeheader()
+         return
+
+     # Handle legacy column names and infer Type
+     legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
+     for record in records:
+         for old_key, new_key in legacy_map.items():
+             if old_key in record and new_key not in record:
+                 record[new_key] = record.pop(old_key)
+
+         # Infer Type if not present
+         if "Type" not in record:
+             record["Type"] = infer_type(record)
+
+     # Build header: metadata → metrics → puzzle types
+     fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
+     fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
+     puzzle_types = sorted({k for r in records for k in r.keys()
+                            if k not in fixed_metadata + fixed_metrics})
+     header = fixed_metadata + fixed_metrics + puzzle_types
+
+     # Write CSV
+     results_path.parent.mkdir(parents=True, exist_ok=True)
+     with results_path.open("w", newline="") as f:
+         w = csv.DictWriter(f, fieldnames=header)
+         w.writeheader()
+         for r in records:
+             w.writerow(r)
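+
+ # Example (illustrative): if runs/ holds two records whose puzzle-type keys are
+ # "Dice_Count" and "Mirror", the resulting results.csv header would be
+ #   Model,Provider,Agent Framework,Type,Overall Pass Rate,Avg Duration (s),Avg Cost ($),Dice_Count,Mirror
+ # and a record that lacks one of the puzzle types simply leaves that cell empty.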
+
+ def render(category, sort_column, sort_direction, model_filter="Models Avg"):
+     df_full = load_df()  # Keep full dataset for perf_by_type
+     df = df_full.copy()
+
+     df = compute_score(df, category)
+
+     # Determine sort column and direction
+     ascending = (sort_direction == "Low→High")
+
+     # Map sort column names to actual column names (only numeric/metric columns)
+     sort_column_map = {
+         "Pass Rate": "Category Pass Rate",
+         "Avg Duration (s)": "Avg Duration (s)",
+         "Avg Cost ($)": "Avg Cost ($)"
+     }
+
+     actual_sort_column = sort_column_map.get(sort_column, "Category Pass Rate")
+
+     # Check if column exists
+     if actual_sort_column not in df.columns:
+         actual_sort_column = "Category Pass Rate"
+
+     # Handle NaN values for numeric sorting
+     df = df.copy()
+     df['_sort_helper'] = df[actual_sort_column].fillna(float('inf') if ascending else float('-inf'))
+     df = df.sort_values('_sort_helper', ascending=ascending).drop(columns=['_sort_helper'])
+     df = df.reset_index(drop=True)
+
+     # perf_by_type uses full dataset to show all puzzle types, with optional model filter
+     # cost_effectiveness_plot needs df with Category Pass Rate computed
+     return table_html(df), perf_bar(df), perf_by_type(df_full, model_filter), cost_effectiveness_plot(df)
+
+ def app():
+     df = load_df()
+
+     cats = ["Overall"]
+     if len(df) > 0:
+         # Get all puzzle type columns (exclude metadata and metric columns)
+         exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
+         puzzle_cols = [c for c in df.columns if c not in exclude_cols]
+         cats = ["Overall"] + puzzle_cols
+
+     with gr.Blocks(title="CAPTCHAv2 Leaderboard", theme=gr.themes.Soft(primary_hue="indigo")) as demo:
+         gr.Markdown("""
+         <div style="text-align: center; padding: 30px 0;">
+             <h1 style="font-size: 42px; font-weight: 700; margin: 0; background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;">
+                 CAPTCHAv2 Leaderboard
+             </h1>
+             <p style="font-size: 16px; color: #64748b; margin-top: 10px;">
+                 Compare model performance across different CAPTCHA types
+             </p>
+         </div>
+         """)
+
+         # Upload section
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📤 Upload Results")
+
+                 # Main accordion for the entire guide
+                 with gr.Accordion("📖 Step-by-Step Guide to Submit Results", open=False):
+                     # Step 1: Run Evaluation Protocol
+                     with gr.Accordion("Step 1: Run the Evaluation Protocol", open=False):
+                         gr.Markdown("""
+                         **Option A: Using browser-use Agent Framework**
+
+                         1. Start the CAPTCHA server:
+                         ```bash
+                         python app.py
+                         ```
+                         The server will run on `http://127.0.0.1:7860`
+
+                         2. Run the browser-use agent evaluation (the default is their in-house model BU1.0):
+                         ```bash
+                         python -m agent_frameworks.browseruse_cli \\
+                             --url http://127.0.0.1:7860 \\
+                             --llm browser-use
+                         ```
+                         Or with a different LLM:
+                         ```bash
+                         python -m agent_frameworks.browseruse_cli \\
+                             --url http://127.0.0.1:7860 \\
+                             --llm openai \\
+                             --model gpt-4o
+                         ```
+
+                         3. The evaluation will automatically save results to `benchmark_results.json` in the project root.
+                            Each puzzle attempt is logged as a JSON object with fields:
+                            - `puzzle_type`, `puzzle_id`, `user_answer`, `correct_answer`, `correct`
+                            - `elapsed_time`, `timestamp`
+                            - `model`, `provider`, `agent_framework`
+
+                         **Option B: Using Other Agent Frameworks**
+
+                         Follow your framework's evaluation protocol. Ensure results are saved in `benchmark_results.json` format
+                         (JSONL: one JSON object per line) with the same field structure.
+                         """)
+
+                     # Step 2: Convert Results
+                     with gr.Accordion("Step 2: Convert Results to CSV Format", open=False):
+                         gr.Markdown("""
+                         **Method 1: Convert to CSV Format (Recommended)**
+
+                         Use the provided conversion script (`convert_benchmark_to_csv.py` in the project root):
+                         ```bash
+                         python convert_benchmark_to_csv.py benchmark_results.json leaderboard/results.csv
+                         ```
+
+                         **Method 2: Directly Upload to Leaderboard (Auto-conversion)**
+
+                         You can upload `benchmark_results.json` directly here; the system will detect the format and convert it automatically.
+
+                         Optionally provide metadata below if auto-detection fails:
+                         - Model Name (e.g., "gpt-4", "claude-3-sonnet", "bu-1-0")
+                         - Provider (e.g., "OpenAI", "Anthropic", "browser-use")
+                         - Agent Framework (e.g., "browser-use", "crewai")
+                         """)
+
+                     # Step 3: Upload Results
+                     with gr.Accordion("Step 3: Upload Results", open=False):
+                         gr.Markdown("""
+                         **Supported file formats:**
+                         - ✅ `benchmark_results.json` - Per-puzzle results (JSONL format)
+                         - ✅ `results.csv` - Aggregated results **Recommended**
+                         - ✅ JSON files - Single object or array of aggregated results
+
+                         **File format requirements:**
+
+                         For `benchmark_results.json` (per-puzzle format):
+                         ```json
+                         {"puzzle_type": "Dice_Count", "puzzle_id": "dice1.png", "user_answer": "24", "correct_answer": 24, "correct": true, "elapsed_time": "12.5", "timestamp": "2025-01-01T00:00:00Z", "model": "bu-1-0", "provider": "browser-use", "agent_framework": "browser-use"}
+                         ```
+
+                         For CSV (aggregated format):
+                         - Required columns: `Model`, `Provider`, `Agent Framework`, `Type`, `Overall Pass Rate`, `Avg Duration (s)`, `Avg Cost ($)`, and puzzle type columns (e.g., `Dice_Count`, `Mirror`, etc.)
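+
+                         For example (illustrative values), one aggregated row per model:
+                         ```csv
+                         Model,Provider,Agent Framework,Type,Overall Pass Rate,Avg Duration (s),Avg Cost ($),Dice_Count,Mirror
+                         gpt-4o,OpenAI,browser-use,Proprietary,0.62,14.3,0.0125,0.70,0.55
+                         ```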
+                         """)
+
+                 file_upload = gr.File(
+                     label="Upload Results File",
+                     file_types=[".csv", ".json"],
+                     type="filepath"
+                 )
+                 with gr.Row():
+                     model_name_input = gr.Textbox(
+                         label="Model Name (optional, for benchmark_results.json)",
+                         placeholder="e.g., gpt-4, claude-3-sonnet",
+                         container=True
+                     )
+                     provider_input = gr.Textbox(
+                         label="Provider (optional, for benchmark_results.json)",
+                         placeholder="e.g., OpenAI, Anthropic, Google",
+                         container=True
+                     )
+                     agent_framework_input = gr.Textbox(
+                         label="Agent Framework (optional, for benchmark_results.json)",
+                         placeholder="e.g., browser-use, crewai",
+                         value="browser-use",
+                         container=True
+                     )
+                 upload_btn = gr.Button("Upload & Update Leaderboard", variant="primary")
+                 upload_status = gr.Markdown("")
+
+         gr.Markdown("---")
+
+         with gr.Row():
+             cat = gr.Dropdown(choices=cats, value="Overall", label="Category/Type", container=True)
+             sort_col = gr.Dropdown(
+                 choices=["Pass Rate", "Avg Duration (s)", "Avg Cost ($)"],
+                 value="Pass Rate",
+                 label="Sort by",
+                 container=True
+             )
+             sort_dir = gr.Radio(
+                 choices=["High→Low", "Low→High"],
+                 value="High→Low",
+                 label="Sort Direction",
+                 container=True
+             )
+
+         # Model filter for Performance by Type plot
+         model_choices = ["Models Avg"]
+         if len(df) > 0 and "Model" in df.columns:
+             model_choices.extend(sorted(df["Model"].unique().tolist()))
+
+         with gr.Row():
+             model_filter = gr.Dropdown(
+                 choices=model_choices,
+                 value="Models Avg",
+                 label="Model Filter (for Performance by Type plot)",
+                 container=True
+             )
+
+         out = gr.HTML(elem_classes="leaderboard-table")
+         bar = gr.Plot(label="Performance Comparison")
+         pertype_plot = gr.Plot(label="Performance by Type")
+         cost_eff_plot = gr.Plot(label="Cost-Effectiveness Analysis")
+
+         def handle_upload(file, model_filter_val, model_name_input_val, provider_input_val, agent_framework_input_val):
+             if file is None:
+                 # Return current state if no file
+                 table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
+                 return "Please select a file to upload.", table, bar_fig, pertype_fig, cost_fig
+
+             # Use provided metadata or None (which will trigger auto-detection)
+             model_name_val = model_name_input_val.strip() if model_name_input_val else None
+             provider_val = provider_input_val.strip() if provider_input_val else None
+             agent_framework_val = agent_framework_input_val.strip() if agent_framework_input_val else None
+
+             success_msg, error_msg = process_uploaded_file(
+                 file,
+                 model_name=model_name_val,
+                 provider=provider_val,
+                 agent_framework=agent_framework_val
+             )
+             if error_msg:
+                 # Return current state with error message
+                 table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
+                 return f"❌ {error_msg}", table, bar_fig, pertype_fig, cost_fig
+
+             # Reload and render after successful upload
+             # Re-render with current settings (use Overall as default since we can't access component values directly)
+             table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
+             return success_msg, table, bar_fig, pertype_fig, cost_fig
+
+         upload_btn.click(
+             handle_upload,
+             inputs=[file_upload, model_filter, model_name_input, provider_input, agent_framework_input],
+             outputs=[upload_status, out, bar, pertype_plot, cost_eff_plot]
+         )
+
+         demo.load(lambda: render("Overall", "Pass Rate", "High→Low", "Models Avg"), outputs=[out, bar, pertype_plot, cost_eff_plot])
+         for comp in (cat, sort_col, sort_dir, model_filter):
+             comp.change(render, inputs=[cat, sort_col, sort_dir, model_filter], outputs=[out, bar, pertype_plot, cost_eff_plot])
+     return demo
+
+ if __name__ == "__main__":
+     app().launch()