deepmage121 committed on
Commit
a92080e
·
1 Parent(s): 149ce10

moving to EEE hf org

Browse files
Files changed (7) hide show
  1. .gitignore +9 -0
  2. app.py +499 -0
  3. data_loader.py +386 -0
  4. eval.schema.json +282 -0
  5. hf_operations.py +202 -0
  6. pyproject.toml +10 -0
  7. ui_components.py +1374 -0
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ .secrets
3
+ .actrc
4
+ __pycache__/
5
+ *.pyc
6
+ parquet_output/
7
+ *.venv*
8
+ *.md
9
+ *.ipynb_checkpoints
app.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation Leaderboard - Gradio Interface
3
+ Displays model evaluation results from HuggingFace datasets.
4
+ """
5
+ import gradio as gr
6
+ import pandas as pd
7
+ from pathlib import Path
8
+
9
+ from data_loader import (
10
+ load_hf_dataset_on_startup,
11
+ get_available_leaderboards,
12
+ get_eval_metadata,
13
+ build_leaderboard_table,
14
+ clear_cache,
15
+ search_model_across_leaderboards,
16
+ get_all_model_names,
17
+ DATA_DIR
18
+ )
19
+ from ui_components import (
20
+ get_theme,
21
+ get_custom_css,
22
+ format_leaderboard_header,
23
+ format_metric_details,
24
+ format_model_card,
25
+ format_model_comparison,
26
+ )
27
+
28
+ PAGE_SIZE = 50
29
+
30
+
31
def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1, sort_column=None, selected_columns=None, progress=gr.Progress()):
    """Loads and aggregates data for the selected leaderboard.

    Returns a 9-tuple consumed positionally by the Gradio event wiring:
    (paginated DataFrame, header HTML, metric-details HTML, page-dropdown
    update, prev-button update, next-button update, sort-dropdown update,
    "page / total" string, column-selector update). The output order must
    match the `outputs=[...]` lists registered on the events.

    NOTE(review): `progress=gr.Progress()` as a default is the documented
    Gradio pattern for progress tracking, not a mutable-default bug.
    """
    # No leaderboard chosen: emit a fully blank/disabled UI state.
    if not selected_leaderboard:
        return (
            pd.DataFrame(),
            format_leaderboard_header(None, {}),
            format_metric_details(None, {}),
            gr.update(choices=[], value=None),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(choices=[], value=None),
            "0 / 0",
            gr.update(choices=[], value=[]),
        )

    metadata = get_eval_metadata(selected_leaderboard)

    # Adapter: gr.Progress is called with desc as a keyword.
    def progress_callback(value, desc):
        progress(value, desc=desc)

    # Search is applied locally below, so the builder gets an empty query
    # (its search_query parameter is unused anyway — see data_loader).
    df = build_leaderboard_table(selected_leaderboard, "", progress_callback)

    # Get all available columns BEFORE filtering (for column selector)
    all_available_columns = list(df.columns) if not df.empty else []

    # Filter columns if selected (if None or empty, show all columns)
    if selected_columns is not None and len(selected_columns) > 0:
        # Ensure Model column is always included
        base_cols = ["Model"]
        available_cols = list(df.columns)
        cols_to_show = [col for col in base_cols if col in available_cols]
        # Add Developer and other selected columns
        cols_to_show.extend([col for col in selected_columns if col in available_cols and col not in cols_to_show])
        if cols_to_show:
            df = df[cols_to_show]

    # Row filter: a row matches when ANY cell contains the query,
    # case-insensitively. Note this runs AFTER column filtering, so
    # hidden columns are not searched.
    if search_query and not df.empty:
        mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
        df = df[mask]

    filtered_count = len(df)

    # Sort is always descending; NaNs sink to the bottom. The column may
    # have been hidden by the selector, in which case sorting is skipped.
    if sort_column and sort_column in df.columns and not df.empty:
        df = df.sort_values(by=sort_column, ascending=False, na_position='last')

    # Ceiling division for page count; clamp the requested page into range.
    total_pages = max(1, (filtered_count + PAGE_SIZE - 1) // PAGE_SIZE) if filtered_count > 0 else 1
    current_page = max(1, min(current_page, total_pages))

    start_idx = (current_page - 1) * PAGE_SIZE
    end_idx = start_idx + PAGE_SIZE
    df_paginated = df.iloc[start_idx:end_idx] if not df.empty else df

    # Pagination widgets: the hidden dropdown holds string page numbers.
    page_choices = [str(i) for i in range(1, total_pages + 1)]
    page_dropdown = gr.update(choices=page_choices, value=str(current_page))
    prev_btn = gr.update(interactive=(current_page > 1))
    next_btn = gr.update(interactive=(current_page < total_pages))
    page_info = f"{current_page} / {total_pages}"

    # Sort choices track the *visible* columns; prefer the current sort,
    # then "Average", then the first column.
    sort_choices = list(df.columns) if not df.empty else []
    default_sort = sort_column if sort_column and sort_column in sort_choices else ("Average" if "Average" in sort_choices else (sort_choices[0] if sort_choices else None))
    sort_column_update = gr.update(choices=sort_choices, value=default_sort)

    # Get all available columns for column selector (use full list, not filtered)
    # Include all columns except Model in the selector (Model is always shown)
    column_choices = [col for col in all_available_columns if col != "Model"]
    # Preserve current selection, or default to all columns if None or empty
    if selected_columns is None or len(selected_columns) == 0:
        column_value = column_choices
    else:
        # Preserve user's selection, filtering out any invalid choices
        column_value = [col for col in selected_columns if col in column_choices]
    column_selector_update = gr.update(choices=column_choices, value=column_value)

    return (
        df_paginated,
        format_leaderboard_header(selected_leaderboard, metadata),
        format_metric_details(selected_leaderboard, metadata),
        page_dropdown,
        prev_btn,
        next_btn,
        sort_column_update,
        page_info,
        column_selector_update,
    )
115
+
116
+
117
def search_model(model_query):
    """Render an HTML card for the first model matching *model_query*.

    Queries shorter than two characters return a prompt placeholder;
    unmatched queries return a "no results" notice.
    """
    # Too short to search meaningfully — show the instructional placeholder.
    if not model_query or len(model_query) < 2:
        return """
        <div class="no-results">
            <h3>Search for a model</h3>
            <p>Enter a model name to see its benchmarks across all leaderboards</p>
        </div>
        """

    matched, _ = search_model_across_leaderboards(model_query)

    if not matched:
        return f"""
        <div class="no-results">
            <h3>No results for "{model_query}"</h3>
            <p>Try a different model name or check the spelling</p>
        </div>
        """

    # The search helper puts exact matches first; render the top hit.
    top_name = next(iter(matched))
    return format_model_card(top_name, matched[top_name])
142
+
143
+
144
def compare_models(selected_models):
    """Render HTML comparing the given models across all leaderboards.

    One resolved model yields a single card; several yield a side-by-side
    comparison; none yields a placeholder message.
    """
    if not selected_models:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """

    # Resolve each requested name to its best (first) match.
    resolved = {}
    for requested in selected_models:
        found, _ = search_model_across_leaderboards(requested)
        if found:
            best = next(iter(found))
            resolved[best] = found[best]

    if not resolved:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try selecting different models</p>
        </div>
        """

    if len(resolved) == 1:
        # Exactly one model resolved — a card is clearer than a 1-column table.
        only_name = next(iter(resolved))
        return format_model_card(only_name, resolved[only_name])

    return format_model_comparison(list(resolved.keys()), resolved)
177
+
178
+
179
def get_model_suggestions(query):
    """Autocomplete helper: propose up to 15 model names matching *query*."""
    # Queries under two characters are too noisy — clear the suggestion list.
    if query and len(query) >= 2:
        _, suggestions = search_model_across_leaderboards(query)
        return gr.update(choices=suggestions[:15])
    return gr.update(choices=[])
186
+
187
+
188
+ # Load data at startup
189
+ load_hf_dataset_on_startup()
190
+
191
+ # Build interface
192
+ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
193
+
194
+ # Header
195
+ gr.HTML("""
196
+ <div class="app-header">
197
+ <div class="logo-mark">EΒ³</div>
198
+ <div class="brand">
199
+ <h1>Every Eval Ever</h1>
200
+ <span class="tagline">Browse and compare model benchmarks</span>
201
+ </div>
202
+ <div class="header-right">
203
+ <span class="version-badge">beta</span>
204
+ </div>
205
+ </div>
206
+ """)
207
+
208
+ with gr.Tabs():
209
+ # === TAB 1: Leaderboard View ===
210
+ with gr.TabItem("πŸ“Š Leaderboards"):
211
+ with gr.Row(elem_classes="controls-bar"):
212
+ initial_choices = get_available_leaderboards()
213
+ initial_value = initial_choices[0] if initial_choices else None
214
+
215
+ with gr.Column(scale=2, min_width=200):
216
+ leaderboard_selector = gr.Dropdown(
217
+ choices=initial_choices,
218
+ value=initial_value,
219
+ label="Leaderboard",
220
+ interactive=True
221
+ )
222
+ with gr.Column(scale=3, min_width=250):
223
+ search_box = gr.Textbox(
224
+ label="Filter",
225
+ placeholder="Filter models...",
226
+ show_label=True
227
+ )
228
+ with gr.Column(scale=1, min_width=100):
229
+ refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")
230
+
231
+ init_df, init_header, init_metrics, init_page_dropdown, init_prev, init_next, init_sort_cols, init_page_info, init_column_selector = update_leaderboard_table(initial_value, "", 1, "Average", None)
232
+
233
+ header_view = gr.HTML(value=init_header)
234
+
235
+ # Hidden sort state (default to Average)
236
+ sort_column_dropdown = gr.Dropdown(
237
+ choices=init_sort_cols.get("choices", []) if hasattr(init_sort_cols, 'get') else [],
238
+ value=init_sort_cols.get("value") if hasattr(init_sort_cols, 'get') else None,
239
+ visible=False,
240
+ )
241
+
242
+ # Column selector
243
+ with gr.Row(elem_classes="controls-bar"):
244
+ column_selector = gr.CheckboxGroup(
245
+ choices=init_column_selector.get("choices", []) if isinstance(init_column_selector, dict) else [],
246
+ value=init_column_selector.get("value", []) if isinstance(init_column_selector, dict) else [],
247
+ label="Columns to Display",
248
+ interactive=True,
249
+ show_label=True,
250
+ )
251
+
252
+ leaderboard_table = gr.Dataframe(
253
+ value=init_df,
254
+ label=None,
255
+ interactive=False,
256
+ wrap=False,
257
+ elem_classes="dataframe",
258
+ )
259
+
260
+ # Pagination below table - centered
261
+ with gr.Row(elem_classes="pagination-bar"):
262
+ prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
263
+ page_info = gr.Markdown(value=init_page_info, elem_classes="page-info")
264
+ next_btn = gr.Button("β†’", variant="secondary", size="sm", min_width=60)
265
+ # Extract choices and value from gr.update() dict, ensuring value is in choices
266
+ if isinstance(init_page_dropdown, dict):
267
+ page_choices = init_page_dropdown.get("choices", ["1"])
268
+ page_value = str(init_page_dropdown.get("value", "1")) if init_page_dropdown.get("value") is not None else "1"
269
+ # Ensure value exists in choices
270
+ if page_value not in page_choices:
271
+ page_value = page_choices[0] if page_choices else "1"
272
+ if not page_choices:
273
+ page_choices = ["1"]
274
+ else:
275
+ page_choices = ["1"]
276
+ page_value = "1"
277
+ page_dropdown = gr.Dropdown(
278
+ choices=page_choices,
279
+ value=page_value,
280
+ visible=False,
281
+ )
282
+
283
+ metrics_view = gr.HTML(value=init_metrics)
284
+
285
+ # === TAB 2: Model View ===
286
+ with gr.TabItem("πŸ” Model Lookup"):
287
+ gr.Markdown("### Find and compare models across all leaderboards")
288
+
289
+ selected_models_state = gr.State(value=[])
290
+ default_compare_html = """
291
+ <div class="no-results">
292
+ <h3>Search for models to compare</h3>
293
+ <p>Type in the dropdown above, then click a model to add it</p>
294
+ </div>
295
+ """
296
+
297
+ with gr.Row(elem_classes="controls-bar"):
298
+ with gr.Column(scale=4):
299
+ all_models = get_all_model_names()
300
+ model_dropdown = gr.Dropdown(
301
+ choices=all_models,
302
+ label="Search models to add",
303
+ interactive=True,
304
+ allow_custom_value=False,
305
+ filterable=True,
306
+ )
307
+ with gr.Column(scale=1, min_width=100):
308
+ clear_models_btn = gr.Button("Clear All", variant="secondary", size="sm")
309
+
310
+ selected_models_group = gr.CheckboxGroup(
311
+ choices=[],
312
+ value=[],
313
+ label="Selected Models (click to remove)",
314
+ interactive=True,
315
+ elem_classes="selected-models-group"
316
+ )
317
+
318
+ model_card_view = gr.HTML(value=default_compare_html)
319
+
320
+ # Submission guide
321
+ with gr.Accordion("πŸ“€ How to Submit Data", open=False):
322
+ gr.Markdown("""
323
+ **Submit via GitHub Pull Request:**
324
+
325
+ 1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
326
+ 2. Add JSON files to `data/<leaderboard>/<developer>/<model>/`
327
+ 3. Open a PR β€” automated validation runs on submission
328
+ 4. After merge, data syncs to HuggingFace automatically
329
+
330
+ [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) Β· [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
331
+ """)
332
+
333
+ # === State ===
334
+ current_page_state = gr.State(value=1)
335
+ sort_column_state = gr.State(value="Average")
336
+
337
+ def go_prev(current):
338
+ return max(1, current - 1)
339
+
340
+ def go_next(current):
341
+ return current + 1
342
+
343
+ def reset_page():
344
+ return 1
345
+
346
+ def update_table_only(selected_leaderboard, search_query, current_page, sort_column, selected_columns):
347
+ """Update table without modifying column selector (for column changes)."""
348
+ result = update_leaderboard_table(selected_leaderboard, search_query, current_page, sort_column, selected_columns)
349
+ # Return all outputs except the last one (column_selector)
350
+ return result[:-1]
351
+
352
+ # === Leaderboard Events ===
353
+ leaderboard_selector.change(
354
+ fn=reset_page, outputs=[current_page_state]
355
+ ).then(
356
+ fn=lambda: "Average", outputs=[sort_column_state]
357
+ ).then(
358
+ fn=lambda: None, outputs=[column_selector]
359
+ ).then(
360
+ fn=update_leaderboard_table,
361
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
362
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector]
363
+ )
364
+
365
+ search_box.input(
366
+ fn=reset_page, outputs=[current_page_state]
367
+ ).then(
368
+ fn=update_table_only,
369
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
370
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
371
+ )
372
+
373
+ sort_column_dropdown.change(
374
+ fn=lambda col: col,
375
+ inputs=[sort_column_dropdown],
376
+ outputs=[sort_column_state]
377
+ ).then(
378
+ fn=reset_page, outputs=[current_page_state]
379
+ ).then(
380
+ fn=update_table_only,
381
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
382
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
383
+ )
384
+
385
+ column_selector.change(
386
+ fn=reset_page, outputs=[current_page_state]
387
+ ).then(
388
+ fn=update_table_only,
389
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
390
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
391
+ )
392
+
393
+ page_dropdown.change(
394
+ fn=lambda p: int(p) if p else 1,
395
+ inputs=[page_dropdown],
396
+ outputs=[current_page_state]
397
+ ).then(
398
+ fn=update_table_only,
399
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
400
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
401
+ )
402
+
403
+ prev_btn.click(
404
+ fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
405
+ ).then(
406
+ fn=update_table_only,
407
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
408
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
409
+ )
410
+
411
+ next_btn.click(
412
+ fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
413
+ ).then(
414
+ fn=update_table_only,
415
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
416
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
417
+ )
418
+
419
+ refresh_btn.click(
420
+ fn=lambda: gr.Dropdown(choices=get_available_leaderboards()),
421
+ outputs=[leaderboard_selector]
422
+ ).then(
423
+ fn=lambda: clear_cache()
424
+ ).then(
425
+ fn=reset_page, outputs=[current_page_state]
426
+ ).then(
427
+ fn=lambda: "Average", outputs=[sort_column_state]
428
+ ).then(
429
+ fn=lambda: None, outputs=[column_selector]
430
+ ).then(
431
+ fn=update_leaderboard_table,
432
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
433
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector]
434
+ )
435
+
436
+ # === Model Search Events ===
437
+ def add_model_and_compare(selected_model, current_selected):
438
+ """Add a model and auto-compare."""
439
+ if not selected_model:
440
+ comparison_html = compare_models(current_selected) if current_selected else default_compare_html
441
+ return (
442
+ current_selected,
443
+ gr.update(value=None),
444
+ gr.update(choices=current_selected, value=current_selected),
445
+ comparison_html
446
+ )
447
+
448
+ if current_selected is None:
449
+ current_selected = []
450
+
451
+ if selected_model not in current_selected:
452
+ current_selected = current_selected + [selected_model]
453
+
454
+ comparison_html = compare_models(current_selected)
455
+
456
+ return (
457
+ current_selected,
458
+ gr.update(value=None),
459
+ gr.update(choices=current_selected, value=current_selected),
460
+ comparison_html
461
+ )
462
+
463
+ def update_selection(selected_list):
464
+ """Update selection from checkbox changes."""
465
+ selected_list = selected_list or []
466
+ comparison_html = compare_models(selected_list) if selected_list else default_compare_html
467
+ return selected_list, comparison_html
468
+
469
+ def clear_all_models():
470
+ """Clear all selected models."""
471
+ return (
472
+ [],
473
+ gr.update(value=None),
474
+ gr.update(choices=[], value=[]),
475
+ default_compare_html
476
+ )
477
+
478
+ # Select from dropdown adds model and auto-compares
479
+ model_dropdown.select(
480
+ fn=add_model_and_compare,
481
+ inputs=[model_dropdown, selected_models_state],
482
+ outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
483
+ )
484
+
485
+ selected_models_group.change(
486
+ fn=update_selection,
487
+ inputs=[selected_models_group],
488
+ outputs=[selected_models_state, model_card_view]
489
+ )
490
+
491
+ clear_models_btn.click(
492
+ fn=clear_all_models,
493
+ outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
494
+ )
495
+
496
+ DATA_DIR.mkdir(exist_ok=True)
497
+
498
+ if __name__ == "__main__":
499
+ demo.launch()
data_loader.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Loader: Load from HuggingFace, parse JSON files, and build tables.
3
+ """
4
+ import json
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ from datasets import load_dataset
8
+
9
+
10
+ # Global caches
11
+ HF_DATASET_CACHE = {}
12
+ LEADERBOARD_CACHE = {}
13
+ DATA_DIR = Path("leaderboard_data")
14
+
15
+
16
def load_hf_dataset_on_startup():
    """Load all splits from HuggingFace dataset at startup.

    Downloads "evaleval/every_eval_ever", parses every row of every split
    into the in-memory record format used throughout the app, and stores
    the result in HF_DATASET_CACHE keyed by split name (one split per
    leaderboard). Returns True on success, False on any failure — callers
    then fall back to the local file system.

    NOTE(review): several string columns ('evaluation_results',
    'additional_details', 'source_data') are assumed to hold JSON-encoded
    text — confirm against the dataset schema.
    """
    print("Loading dataset from HuggingFace...")
    try:
        dataset = load_dataset("evaleval/every_eval_ever")

        for split_name, split_data in dataset.items():
            print(f"Loading split: {split_name} ({len(split_data)} rows)")

            df = split_data.to_pandas()
            parsed_items = []

            for _, row in df.iterrows():
                # Per-row JSON payload with the individual metric scores.
                evaluation_results = json.loads(row['evaluation_results'])

                # Flatten to {metric name: score}, skipping unnamed or
                # score-less entries.
                results = {}
                for eval_result in evaluation_results:
                    eval_name = eval_result.get("evaluation_name")
                    score = eval_result.get("score_details", {}).get("score")
                    if eval_name and score is not None:
                        results[eval_name] = score

                # Optional JSON column; NaN/None means "no extra details".
                additional_details = {}
                if pd.notna(row.get('additional_details')):
                    additional_details = json.loads(row['additional_details'])

                # "raw_data" reconstructs the original eval.schema.json shape
                # so downstream code (get_eval_metadata, model cards) can
                # treat HF rows and local JSON files identically.
                parsed_item = {
                    "leaderboard": row['_leaderboard'],
                    "provider": row['source_organization_name'],
                    "model": row['model_id'],
                    "developer": row['model_developer'],
                    "params": additional_details.get('params_billions'),
                    "architecture": additional_details.get('architecture', 'Unknown'),
                    "precision": additional_details.get('precision', 'Unknown'),
                    "results": results,
                    "raw_data": {
                        "schema_version": row['schema_version'],
                        "evaluation_id": row['evaluation_id'],
                        "retrieved_timestamp": row['retrieved_timestamp'],
                        "source_data": json.loads(row['source_data']),
                        "evaluation_source": {
                            "evaluation_source_name": row['evaluation_source_name'],
                            "evaluation_source_type": row['evaluation_source_type']
                        },
                        "source_metadata": {
                            "source_organization_name": row['source_organization_name'],
                            "evaluator_relationship": row['evaluator_relationship'],
                        },
                        "model_info": {
                            "name": row['model_name'],
                            "id": row['model_id'],
                            "developer": row['model_developer'],
                        },
                        "evaluation_results": evaluation_results,
                        "additional_details": additional_details
                    }
                }

                # Optional columns: only attach when present (non-NaN).
                if pd.notna(row.get('source_organization_url')):
                    parsed_item["raw_data"]["source_metadata"]["source_organization_url"] = row['source_organization_url']
                if pd.notna(row.get('source_organization_logo_url')):
                    parsed_item["raw_data"]["source_metadata"]["source_organization_logo_url"] = row['source_organization_logo_url']
                if pd.notna(row.get('model_inference_platform')):
                    parsed_item["raw_data"]["model_info"]["inference_platform"] = row['model_inference_platform']

                parsed_items.append(parsed_item)

            HF_DATASET_CACHE[split_name] = parsed_items

        print(f"Loaded {len(HF_DATASET_CACHE)} leaderboard(s) from HuggingFace")
        return True
    except Exception as e:
        # Deliberately broad: any network/schema/parse failure degrades to
        # the local-file path instead of crashing app startup.
        print(f"Warning: Could not load HuggingFace dataset: {e}")
        print("Falling back to local file system...")
        return False
91
+
92
+
93
def parse_eval_json(file_path):
    """Parses a single JSON file to extract model, provider, and results.

    Args:
        file_path: Path (str or pathlib.Path) to an eval.schema.json-shaped file.

    Returns:
        A dict with keys leaderboard/provider/model/developer/params/
        architecture/precision/results/raw_data, or None when the file
        cannot be read or parsed (the error is printed, not raised, so a
        single bad file never aborts a directory scan).
    """
    try:
        # Explicit encoding: eval files are UTF-8; relying on the locale
        # default breaks on non-UTF-8 systems (e.g. Windows cp1252).
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        leaderboard_name = data.get("evaluation_source", {}).get("evaluation_source_name", "Unknown Leaderboard")
        provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
        model_id = data.get("model_info", {}).get("id", "Unknown Model")
        developer_name = data.get("model_info", {}).get("developer", "Unknown Developer")

        params = data.get("model_info", {}).get("params_billions", None)
        architecture = data.get("model_info", {}).get("architecture", "Unknown")
        # Precision may live in additional_details (newer files) or
        # model_info (older files); prefer the former.
        precision = data.get("additional_details", {}).get("precision", "Unknown")
        if precision == "Unknown":
            precision = data.get("model_info", {}).get("precision", "Unknown")

        # Flatten scored metrics to {metric name: score}.
        results = {}
        if "evaluation_results" in data:
            for res in data["evaluation_results"]:
                eval_name = res.get("evaluation_name", "Unknown Metric")
                score = res.get("score_details", {}).get("score", None)
                if score is not None:
                    results[eval_name] = score

        return {
            "leaderboard": leaderboard_name,
            "provider": provider_name,
            "model": model_id,
            "developer": developer_name,
            "params": params,
            "architecture": architecture,
            "precision": precision,
            "results": results,
            "raw_data": data
        }
    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return None
132
+
133
+
134
def get_available_leaderboards():
    """List leaderboard names, preferring the in-memory HF cache over disk."""
    # The startup loader populates HF_DATASET_CACHE (one key per split).
    if HF_DATASET_CACHE:
        return list(HF_DATASET_CACHE)

    # Fallback: every sub-directory of DATA_DIR counts as a leaderboard.
    if DATA_DIR.exists():
        return [entry.name for entry in DATA_DIR.iterdir() if entry.is_dir()]
    return []
142
+
143
+
144
def walk_eval_files(leaderboard_name):
    """Yield every *.json file under the leaderboard's directory (recursive)."""
    root = DATA_DIR / leaderboard_name
    # A missing directory simply yields nothing.
    if root.exists():
        yield from root.rglob("*.json")
150
+
151
+
152
+ def get_eval_metadata(selected_leaderboard):
153
+ """Extracts evaluation metadata from the leaderboard data."""
154
+ if not selected_leaderboard:
155
+ return {}
156
+
157
+ eval_metadata = {"evals": {}, "source_info": {}}
158
+
159
+ if selected_leaderboard in HF_DATASET_CACHE:
160
+ parsed_items = HF_DATASET_CACHE[selected_leaderboard]
161
+ if parsed_items:
162
+ parsed = parsed_items[0]
163
+
164
+ source_meta = parsed["raw_data"].get("source_metadata", {})
165
+ source_data_list = parsed["raw_data"].get("source_data", [])
166
+ url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
167
+
168
+ eval_metadata["source_info"] = {
169
+ "organization": source_meta.get("source_organization_name", "Unknown"),
170
+ "relationship": source_meta.get("evaluator_relationship", "Unknown"),
171
+ "url": url
172
+ }
173
+
174
+ if "evaluation_results" in parsed["raw_data"]:
175
+ for res in parsed["raw_data"]["evaluation_results"]:
176
+ eval_name = res.get("evaluation_name", "Unknown Metric")
177
+ if eval_name not in eval_metadata["evals"]:
178
+ metric_config = res.get("metric_config", {})
179
+ eval_metadata["evals"][eval_name] = {
180
+ "description": metric_config.get("evaluation_description", "No description available"),
181
+ "score_type": metric_config.get("score_type", "unknown"),
182
+ "lower_is_better": metric_config.get("lower_is_better", False),
183
+ "min_score": metric_config.get("min_score"),
184
+ "max_score": metric_config.get("max_score"),
185
+ "level_names": metric_config.get("level_names", []),
186
+ "level_metadata": metric_config.get("level_metadata", []),
187
+ "has_unknown_level": metric_config.get("has_unknown_level", False)
188
+ }
189
+ return eval_metadata
190
+
191
+ # Fall back to file system
192
+ for json_file in walk_eval_files(selected_leaderboard):
193
+ parsed = parse_eval_json(json_file)
194
+ if parsed:
195
+ if not eval_metadata["source_info"]:
196
+ source_meta = parsed["raw_data"].get("source_metadata", {})
197
+ source_data_list = parsed["raw_data"].get("source_data", [])
198
+ url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
199
+
200
+ eval_metadata["source_info"] = {
201
+ "organization": source_meta.get("source_organization_name", "Unknown"),
202
+ "relationship": source_meta.get("evaluator_relationship", "Unknown"),
203
+ "url": url
204
+ }
205
+
206
+ if "evaluation_results" in parsed["raw_data"]:
207
+ for res in parsed["raw_data"]["evaluation_results"]:
208
+ eval_name = res.get("evaluation_name", "Unknown Metric")
209
+ if eval_name not in eval_metadata["evals"]:
210
+ metric_config = res.get("metric_config", {})
211
+ eval_metadata["evals"][eval_name] = {
212
+ "description": metric_config.get("evaluation_description", "No description available"),
213
+ "score_type": metric_config.get("score_type", "unknown"),
214
+ "lower_is_better": metric_config.get("lower_is_better", False),
215
+ "min_score": metric_config.get("min_score"),
216
+ "max_score": metric_config.get("max_score"),
217
+ "level_names": metric_config.get("level_names", []),
218
+ "level_metadata": metric_config.get("level_metadata", []),
219
+ "has_unknown_level": metric_config.get("has_unknown_level", False)
220
+ }
221
+ break
222
+
223
+ return eval_metadata
224
+
225
+
226
+ def build_leaderboard_table(selected_leaderboard, search_query="", progress_callback=None):
227
+ """Builds the leaderboard DataFrame from cache or files."""
228
+ if not selected_leaderboard:
229
+ return pd.DataFrame()
230
+
231
+ if selected_leaderboard in LEADERBOARD_CACHE:
232
+ df, _ = LEADERBOARD_CACHE[selected_leaderboard]
233
+ else:
234
+ rows = []
235
+
236
+ if selected_leaderboard in HF_DATASET_CACHE:
237
+ if progress_callback:
238
+ progress_callback(0, desc=f"Loading {selected_leaderboard} from cache...")
239
+
240
+ parsed_items = HF_DATASET_CACHE[selected_leaderboard]
241
+
242
+ for i, parsed in enumerate(parsed_items):
243
+ if i % 100 == 0 and progress_callback:
244
+ progress_callback((i / len(parsed_items)), desc=f"Processing {selected_leaderboard}...")
245
+
246
+ row = {
247
+ "Model": parsed["model"],
248
+ "Developer": parsed["developer"],
249
+ "Params (B)": parsed["params"],
250
+ "Arch": parsed["architecture"],
251
+ "Precision": parsed["precision"]
252
+ }
253
+ row.update(parsed["results"])
254
+ rows.append(row)
255
+ else:
256
+ # Fall back to file system
257
+ if progress_callback:
258
+ progress_callback(0, desc=f"Scanning {selected_leaderboard}...")
259
+
260
+ all_files = list(walk_eval_files(selected_leaderboard))
261
+ total_files = len(all_files)
262
+
263
+ for i, json_file in enumerate(all_files):
264
+ if i % 100 == 0 and progress_callback:
265
+ progress_callback((i / total_files), desc=f"Loading {selected_leaderboard}...")
266
+
267
+ parsed = parse_eval_json(json_file)
268
+ if parsed:
269
+ row = {
270
+ "Model": parsed["model"],
271
+ "Developer": parsed["developer"],
272
+ "Params (B)": parsed["params"],
273
+ "Arch": parsed["architecture"],
274
+ "Precision": parsed["precision"]
275
+ }
276
+ row.update(parsed["results"])
277
+ rows.append(row)
278
+
279
+ if not rows:
280
+ df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision"])
281
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
282
+ return df
283
+
284
+ df = pd.DataFrame(rows)
285
+ df = df.dropna(axis=1, how='all')
286
+
287
+ if df.empty:
288
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
289
+ return df
290
+
291
+ numeric_cols = df.select_dtypes(include=['float', 'int']).columns
292
+ df[numeric_cols] = df[numeric_cols].round(2)
293
+
294
+ # Add Average Score
295
+ eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
296
+ if len(eval_only_cols) > 0:
297
+ df["Average"] = df[eval_only_cols].mean(axis=1).round(2)
298
+
299
+ # Base columns: Model, Developer, Params, Average
300
+ # Eval columns: all evaluation scores
301
+ # Model detail columns: Arch, Precision (moved to end)
302
+ base_cols = ["Model", "Developer", "Params (B)", "Average"]
303
+ model_detail_cols = ["Arch", "Precision"]
304
+ eval_cols = [c for c in df.columns if c not in base_cols and c not in model_detail_cols]
305
+ base_cols = [c for c in base_cols if c in df.columns]
306
+ model_detail_cols = [c for c in model_detail_cols if c in df.columns]
307
+
308
+ final_cols = base_cols + sorted(eval_cols) + model_detail_cols
309
+ df = df[final_cols]
310
+
311
+ if "Average" in df.columns:
312
+ df = df.sort_values("Average", ascending=False)
313
+
314
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
315
+
316
+ return df
317
+
318
+
319
def clear_cache():
    """Clear the derived leaderboard-table cache.

    NOTE(review): despite the original "all caches" wording, only
    LEADERBOARD_CACHE is cleared here; HF_DATASET_CACHE (the raw records
    loaded at startup) is kept, so a UI refresh rebuilds tables without
    re-downloading the dataset — confirm this is intended.
    """
    LEADERBOARD_CACHE.clear()
322
+
323
+
324
+ def search_model_across_leaderboards(model_query):
325
+ """Search for a model across all leaderboards and return aggregated results."""
326
+ if not model_query or not HF_DATASET_CACHE:
327
+ return {}, []
328
+
329
+ model_query_lower = model_query.lower().strip()
330
+ results = {}
331
+ all_matches = []
332
+
333
+ for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
334
+ for item in parsed_items:
335
+ model_id = item.get("model", "")
336
+ # Check if query matches model name (case insensitive, partial match)
337
+ if model_query_lower in model_id.lower():
338
+ all_matches.append(model_id)
339
+
340
+ # Exact match gets priority
341
+ if model_id.lower() == model_query_lower or model_id == model_query:
342
+ if model_id not in results:
343
+ results[model_id] = {}
344
+ results[model_id][leaderboard_name] = {
345
+ "developer": item.get("developer"),
346
+ "params": item.get("params"),
347
+ "architecture": item.get("architecture"),
348
+ "precision": item.get("precision"),
349
+ "results": item.get("results", {})
350
+ }
351
+
352
+ # If no exact match, use partial matches
353
+ if not results and all_matches:
354
+ # Get the first partial match
355
+ for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
356
+ for item in parsed_items:
357
+ model_id = item.get("model", "")
358
+ if model_query_lower in model_id.lower():
359
+ if model_id not in results:
360
+ results[model_id] = {}
361
+ results[model_id][leaderboard_name] = {
362
+ "developer": item.get("developer"),
363
+ "params": item.get("params"),
364
+ "architecture": item.get("architecture"),
365
+ "precision": item.get("precision"),
366
+ "results": item.get("results", {})
367
+ }
368
+
369
+ # Return unique matches for autocomplete
370
+ unique_matches = sorted(set(all_matches))[:20] # Limit to 20 suggestions
371
+
372
+ return results, unique_matches
373
+
374
+
375
def get_all_model_names():
    """Get all unique model names across all leaderboards.

    Returns a sorted list of model ids from the in-memory HF cache, or []
    when the cache is empty. Records with a missing or empty model id are
    skipped (the original implementation could emit an empty-string
    "model" into the dropdown choices).
    """
    if not HF_DATASET_CACHE:
        return []

    models = set()
    for parsed_items in HF_DATASET_CACHE.values():
        for item in parsed_items:
            name = item.get("model")
            if name:  # skip records without a usable model id
                models.add(name)

    return sorted(models)
386
+
eval.schema.json ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "version": "0.0.1",
4
+ "type": "object",
5
+ "description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
6
+ "required": [
7
+ "schema_version",
8
+ "evaluation_id",
9
+ "evaluation_source",
10
+ "retrieved_timestamp",
11
+ "source_data",
12
+ "source_metadata",
13
+ "model_info",
14
+ "evaluation_results"
15
+ ],
16
+ "properties": {
17
+ "schema_version": {
18
+ "type": "string",
19
+ "description": "Version of the schema used for this evaluation data"
20
+ },
21
+ "evaluation_id": {
22
+ "type": "string",
23
+ "description": "Unique identifier for this specific evaluation run. Use org_name/eval_name/retrieved_timestamp format"
24
+ },
25
+ "retrieved_timestamp": {
26
+ "type": "string",
27
+ "description": "Timestamp for when this record was created"
28
+ },
29
+ "source_data": {
30
+ "type": "array",
31
+ "description": "URLs for the source of the evaluation data",
32
+ "items": {
33
+ "type": "string"
34
+ }
35
+ },
36
+ "evaluation_source": {
37
+ "type": "object",
38
+ "description": "Details about evaluation origin. There are options that evaluations come from leaderboards (e.g. Live Code Bench Pro) or evaluation platforms (e.g. lm-eval, inspect ai, HELM...).",
39
+ "required": [
40
+ "evaluation_source_name",
41
+ "evaluation_source_type"
42
+ ],
43
+ "properties": {
44
+ "evaluation_source_name": {
45
+ "type": "string",
46
+ "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
47
+ },
48
+ "evaluation_source_type": {
49
+ "type": "string",
50
+ "enum": [
51
+ "leaderboard",
52
+ "evaluation_platform"
53
+ ],
54
+ "description": "Type of evaluation source, e.g., leaderboard or evaluation platform"
55
+ }
56
+ }
57
+ },
58
+ "source_metadata": {
59
+ "type": "object",
60
+ "description": "Metadata about the source of the leaderboard data",
61
+ "required": [
62
+ "source_organization_name",
63
+ "evaluator_relationship"
64
+ ],
65
+ "properties": {
66
+ "source_organization_name": {
67
+ "type": "string",
68
+ "description": "Name of the organization that provides the data"
69
+ },
70
+ "source_organization_url": {
71
+ "type": "string",
72
+ "description": "URL for the organization that provides the data"
73
+ },
74
+ "source_organization_logo_url": {
75
+ "type": "string",
76
+ "description": "URL for the logo of the organization that provides the data"
77
+ },
78
+ "evaluator_relationship": {
79
+ "type": "string",
80
+ "description": "Relationship between the evaluator and the model",
81
+ "enum": [
82
+ "first_party",
83
+ "third_party",
84
+ "collaborative",
85
+ "other"
86
+ ]
87
+ }
88
+ }
89
+ },
90
+ "model_info": {
91
+ "type": "object",
92
+ "description": "Complete model specification including basic information, technical configuration and inference settings",
93
+ "required": [
94
+ "name",
95
+ "id"
96
+ ],
97
+ "properties": {
98
+ "name": {
99
+ "type": "string",
100
+ "description": "Model name provided by evaluation source"
101
+ },
102
+ "id": {
103
+ "type": "string",
104
+ "description": "Model name standardized to HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
105
+ },
106
+ "developer": {
107
+ "type": "string",
108
+ "description": "Name of organization that provides the model (e.g. 'OpenAI')"
109
+ },
110
+ "inference_platform": {
111
+ "type": "string",
112
+ "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
113
+ }
114
+ }
115
+ },
116
+ "evaluation_results": {
117
+ "type": "array",
118
+ "description": "Array of evaluation results",
119
+ "items": {
120
+ "type": "object",
121
+ "required": [
122
+ "evaluation_name",
123
+ "metric_config",
124
+ "score_details"
125
+ ],
126
+ "properties": {
127
+ "evaluation_name": {
128
+ "type": "string",
129
+ "description": "Name of the evaluation"
130
+ },
131
+ "evaluation_timestamp": {
132
+ "type": "string",
133
+ "description": "Timestamp for when the evaluations were run"
134
+ },
135
+ "metric_config": {
136
+ "type": "object",
137
+ "description": "Details about the metric",
138
+ "required": [
139
+ "lower_is_better"
140
+ ],
141
+ "properties": {
142
+ "evaluation_description": {
143
+ "type": "string",
144
+ "description": "Description of the evaluation"
145
+ },
146
+ "lower_is_better": {
147
+ "type": "boolean",
148
+ "description": "Whether a lower score is better"
149
+ },
150
+ "score_type": {
151
+ "type": "string",
152
+ "description": "Type of score",
153
+ "enum": [
154
+ "binary",
155
+ "continuous",
156
+ "levels"
157
+ ]
158
+ },
159
+ "level_names": {
160
+ "type": "array",
161
+ "description": "Names of the score levels",
162
+ "items": {
163
+ "type": "string"
164
+ }
165
+ },
166
+ "level_metadata": {
167
+ "type": "array",
168
+ "description": "Additional Description for each Score Level",
169
+ "items": {
170
+ "type": "string"
171
+ }
172
+ },
173
+ "has_unknown_level": {
174
+ "type": "boolean",
175
+ "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
176
+ },
177
+ "min_score": {
178
+ "type": "number",
179
+ "description": "Minimum possible score for continuous metric"
180
+ },
181
+ "max_score": {
182
+ "type": "number",
183
+ "description": "Maximum possible score for continuous metric"
184
+ }
185
+ },
186
+ "if": {
187
+ "properties": {
188
+ "score_type": {
189
+ "const": "levels"
190
+ }
191
+ }
192
+ },
193
+ "then": {
194
+ "required": [
195
+ "level_names",
196
+ "has_unknown_level"
197
+ ]
198
+ },
199
+ "else": {
200
+ "if": {
201
+ "properties": {
202
+ "score_type": {
203
+ "const": "continuous"
204
+ }
205
+ }
206
+ },
207
+ "then": {
208
+ "required": [
209
+ "min_score",
210
+ "max_score"
211
+ ]
212
+ }
213
+ }
214
+ },
215
+ "score_details": {
216
+ "type": "object",
217
+ "description": "The score for the evaluation and related details",
218
+ "required": [
219
+ "score"
220
+ ],
221
+ "properties": {
222
+ "score": {
223
+ "type": "number",
224
+ "description": "The score for the evaluation"
225
+ },
226
+ "details": {
227
+ "type": "object",
228
+ "description": "Any additional details about the score",
229
+ "additionalProperties": true
230
+ }
231
+ }
232
+ },
233
+ "detailed_evaluation_results_url": {
234
+ "type": "string",
235
+ "description": "Link to detailed evaluation data"
236
+ },
237
+ "generation_config": {
238
+ "type": "object",
239
+ "generation_args": {
240
+ "type": "object",
241
+ "description": "Parameters used to generate results - properties may vary by model type",
242
+ "properties": {
243
+ "temperature": {
244
+ "type": [
245
+ "null",
246
+ "number"
247
+ ],
248
+ "description": "Sampling temperature"
249
+ },
250
+ "top_p": {
251
+ "type": [
252
+ "null",
253
+ "number"
254
+ ],
255
+ "description": "Nucleus sampling parameter"
256
+ },
257
+ "top_k": {
258
+ "type": [
259
+ "null",
260
+ "number"
261
+ ],
262
+ "description": "Top-k sampling parameter"
263
+ },
264
+ "max_tokens": {
265
+ "type": "integer",
266
+ "minimum": 1,
267
+ "description": "Maximum number of tokens to generate"
268
+ }
269
+ },
270
+ "additionalProperties": true
271
+ },
272
+ "additional_details": {
273
+ "type": "string",
274
+ "description": "Additional details about how the results for this metric were generated."
275
+ }
276
+ }
277
+ }
278
+ }
279
+
280
+ }
281
+ }
282
+ }
hf_operations.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Operations: Upload data, create PRs, validate schemas.
3
+ """
4
+ from huggingface_hub import HfApi, login
5
+ import pandas as pd
6
+ import json
7
+ from pathlib import Path
8
+ from jsonschema import validate, ValidationError, Draft7Validator
9
+
10
+
11
# Load the evaluation schema once at import time; reused by every validator
# in this module. Explicit UTF-8 avoids depending on the locale's default
# encoding (the previous open() used the platform default).
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
with open(SCHEMA_PATH, 'r', encoding='utf-8') as f:
    EVAL_SCHEMA = json.load(f)
15
+
16
+
17
def validate_json_against_schema(json_data):
    """
    Check a single evaluation record against the module-level eval schema.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, human-readable message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
    except ValidationError as e:
        # Build a readable path down to the offending element (or "root"
        # when the failure is at the top level of the document).
        where = " β†’ ".join(str(part) for part in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{where}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"
    return True, "Schema validation passed"
36
+
37
+
38
def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    NOTE: currently a no-op placeholder — see the TODO below.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # TODO: Implement upload logic
    pass
49
+
50
+
51
def check_hf_authentication():
    """
    Determine whether the current environment is logged in to HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        whoami_info = HfApi().whoami()
    except Exception:
        # Any failure (no token, network error, ...) is treated as
        # "not authenticated" with a hint on how to log in.
        return False, "Not authenticated. Run: huggingface-cli login"
    return True, whoami_info['name']
64
+
65
+
66
def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Look for an already-open PR that adds this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        open_discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")

        # A duplicate is an OPEN pull request whose title mentions this
        # leaderboard (case-insensitive match).
        wanted_title = f"add new leaderboard: {leaderboard_name.lower()}"
        for disc in open_discussions:
            if not (disc.is_pull_request and disc.status == "open"):
                continue
            if wanted_title in disc.title.lower():
                return True, f"https://huggingface.co/datasets/{repo_id}/discussions/{disc.num}"

        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None
94
+
95
+
96
def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # Step 1: must be logged in before anything else.
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # Step 2: avoid opening a second PR for the same leaderboard.
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # Step 3: sanity-check the payload before touching the Hub.
    if not Path(parquet_file).exists():
        return False, "❌ Parquet file not found"

    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # Step 4: upload the file and open the PR in one call.
    try:
        commit_info = HfApi().upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Add new leaderboard: {leaderboard_name}",
            create_pr=True,
        )
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"

    # Older hub versions may not expose pr_url on the commit info; fall back
    # to the repo's discussions page.
    pr_url = getattr(
        commit_info,
        'pr_url',
        f"https://huggingface.co/datasets/{repo_id}/discussions",
    )
    return True, f"PR created ({len(df)} rows): {pr_url}"
151
+
152
+
153
def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected flat leaderboard schema:
    all required columns present and every column stored as a string.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    required_cols = [
        '_leaderboard', '_developer', '_model', '_uuid',
        'schema_version', 'evaluation_id', 'retrieved_timestamp',
        'source_data', 'evaluation_source_name', 'evaluation_source_type',
        'source_organization_name', 'evaluator_relationship',
        'model_name', 'model_id', 'model_developer',
        'evaluation_results'
    ]

    try:
        df = pd.read_parquet(parquet_file)

        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Every column must be a pandas object/string dtype — the dataset
        # stores all values (including nested JSON) as serialized strings.
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"

    except Exception as e:
        return False, f"Validation error: {str(e)}"
189
+
190
+
191
def export_to_json(parquet_file, output_dir):
    """
    Re-materialize JSON files from a parquet export.

    Delegates to ``parquet_to_folder`` from json_to_parquet.py.

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    # Imported lazily so this module can be loaded without json_to_parquet
    # being importable (only needed for the export path).
    from json_to_parquet import parquet_to_folder

    parquet_to_folder(parquet_file, output_dir)
202
+
pyproject.toml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "eee-test"
3
+ version = "0.1.0"
4
+ description = "Gradio app for browsing and comparing LLM evaluation leaderboards"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "gradio>=5.49.1",
9
+ "pandas>=2.3.2",
10
+ ]
ui_components.py ADDED
@@ -0,0 +1,1374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UI Components: Themes, CSS, and HTML formatters for the Gradio interface.
3
+ Nord color theme with balanced contrast.
4
+ """
5
+ import gradio as gr
6
+
7
+
8
def get_theme():
    """Returns the Nord-themed Gradio theme, locked to dark mode."""
    # Base typography and hues; colors are then overridden wholesale with
    # the Nord palette (Polar Night / Snow Storm / Frost).
    base = gr.themes.Base(
        primary_hue="blue",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("DM Sans"), "system-ui", "sans-serif"],
        font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
    )
    nord_overrides = {
        "body_background_fill": "#2E3440",
        "body_background_fill_dark": "#2E3440",
        "body_text_color": "#ECEFF4",
        "body_text_color_dark": "#ECEFF4",
        "body_text_color_subdued": "#4C566A",
        "body_text_color_subdued_dark": "#4C566A",
        "block_background_fill": "#3B4252",
        "block_background_fill_dark": "#3B4252",
        "block_border_width": "1px",
        "block_border_color": "#434C5E",
        "block_border_color_dark": "#434C5E",
        "block_label_text_color": "#D8DEE9",
        "block_label_text_color_dark": "#D8DEE9",
        "block_title_text_color": "#ECEFF4",
        "block_title_text_color_dark": "#ECEFF4",
        "input_background_fill": "#2E3440",
        "input_background_fill_dark": "#2E3440",
        "input_border_color": "#4C566A",
        "input_border_color_dark": "#4C566A",
        "button_primary_background_fill": "#88C0D0",
        "button_primary_background_fill_dark": "#88C0D0",
        "button_primary_text_color": "#2E3440",
        "button_primary_text_color_dark": "#2E3440",
        "button_secondary_background_fill": "#434C5E",
        "button_secondary_background_fill_dark": "#434C5E",
        "button_secondary_text_color": "#ECEFF4",
        "button_secondary_text_color_dark": "#ECEFF4",
    }
    return base.set(**nord_overrides)
44
+
45
+
46
+ def get_custom_css():
47
+ """Returns custom CSS with Nord colors."""
48
+ return """
49
+ /* === Nord Theme ===
50
+ Polar Night: #2E3440 (bg), #3B4252 (surface), #434C5E, #4C566A
51
+ Snow Storm: #D8DEE9, #E5E9F0, #ECEFF4
52
+ Frost: #8FBCBB, #88C0D0, #81A1C1, #5E81AC
53
+ Aurora: #BF616A, #D08770, #EBCB8B, #A3BE8C, #B48EAD
54
+ */
55
+
56
+ /* Lock the UI to dark Nord regardless of OS preference */
57
+ :root {
58
+ color-scheme: dark;
59
+ background-color: #2E3440;
60
+ }
61
+
62
+ body {
63
+ background: #2E3440 !important;
64
+ color: #ECEFF4 !important;
65
+ }
66
+
67
+ /* === Base === */
68
+ .gradio-container {
69
+ max-width: 100% !important;
70
+ margin: 0 !important;
71
+ padding: 1.25rem 2.5rem 2rem !important;
72
+ background: #2E3440 !important;
73
+ color: #ECEFF4 !important;
74
+ font-family: 'DM Sans', system-ui, sans-serif !important;
75
+ font-size: 16px !important;
76
+ }
77
+
78
+ /* === Header === */
79
+ .app-header {
80
+ display: flex;
81
+ align-items: center;
82
+ gap: 1rem;
83
+ margin-bottom: 1.5rem;
84
+ padding: 1.25rem 1.5rem;
85
+ background: #3B4252;
86
+ border: 1px solid #434C5E;
87
+ border-radius: 12px;
88
+ }
89
+
90
+ .app-header .logo-mark {
91
+ width: 48px;
92
+ height: 48px;
93
+ background: linear-gradient(135deg, #88C0D0 0%, #81A1C1 100%);
94
+ border-radius: 12px;
95
+ display: flex;
96
+ align-items: center;
97
+ justify-content: center;
98
+ font-weight: 800;
99
+ font-size: 1.1rem;
100
+ color: #2E3440;
101
+ }
102
+
103
+ .app-header .brand {
104
+ display: flex;
105
+ flex-direction: column;
106
+ gap: 0.125rem;
107
+ }
108
+
109
+ .app-header h1 {
110
+ margin: 0;
111
+ font-size: 1.5rem;
112
+ font-weight: 700;
113
+ color: #ECEFF4;
114
+ letter-spacing: -0.02em;
115
+ }
116
+
117
+ .app-header .tagline {
118
+ color: #D8DEE9;
119
+ font-size: 0.85rem;
120
+ }
121
+
122
+ .app-header .header-right {
123
+ margin-left: auto;
124
+ display: flex;
125
+ align-items: center;
126
+ gap: 0.75rem;
127
+ }
128
+
129
+ .app-header .version-badge {
130
+ background: rgba(136, 192, 208, 0.2);
131
+ border: 1px solid rgba(136, 192, 208, 0.4);
132
+ border-radius: 6px;
133
+ padding: 0.25rem 0.625rem;
134
+ font-size: 0.7rem;
135
+ font-family: 'JetBrains Mono', monospace;
136
+ color: #88C0D0;
137
+ }
138
+
139
+ /* === Tabs === */
140
+ .tabs {
141
+ border: none !important;
142
+ background: transparent !important;
143
+ }
144
+
145
+ .tab-nav {
146
+ background: #3B4252 !important;
147
+ border: 1px solid #434C5E !important;
148
+ border-radius: 10px !important;
149
+ padding: 0.25rem !important;
150
+ gap: 0.25rem !important;
151
+ margin-bottom: 1.25rem !important;
152
+ display: inline-flex !important;
153
+ }
154
+
155
+ .tab-nav button {
156
+ background: transparent !important;
157
+ border: none !important;
158
+ color: #D8DEE9 !important;
159
+ padding: 0.75rem 1.5rem !important;
160
+ font-size: 0.95rem !important;
161
+ font-weight: 500 !important;
162
+ border-radius: 8px !important;
163
+ transition: all 0.15s ease !important;
164
+ }
165
+
166
+ .tab-nav button.selected {
167
+ color: #2E3440 !important;
168
+ background: #88C0D0 !important;
169
+ }
170
+
171
+ .tab-nav button:hover:not(.selected) {
172
+ background: #434C5E !important;
173
+ color: #ECEFF4 !important;
174
+ }
175
+
176
+ .tabitem {
177
+ background: transparent !important;
178
+ border: none !important;
179
+ padding: 0 !important;
180
+ }
181
+
182
+ /* === Controls bar === */
183
+ .controls-bar {
184
+ background: #3B4252 !important;
185
+ border: 1px solid #434C5E !important;
186
+ border-radius: 10px !important;
187
+ padding: 0.75rem 1.25rem !important;
188
+ margin-bottom: 1rem !important;
189
+ gap: 0.75rem !important;
190
+ }
191
+
192
+ .controls-bar label {
193
+ font-size: 0.75rem !important;
194
+ text-transform: uppercase !important;
195
+ letter-spacing: 0.04em !important;
196
+ color: #D8DEE9 !important;
197
+ font-weight: 500 !important;
198
+ }
199
+
200
+ /* === Info banner === */
201
+ .info-banner {
202
+ background: #3B4252 !important;
203
+ border: 1px solid #434C5E !important;
204
+ border-left: 3px solid #88C0D0 !important;
205
+ border-radius: 0 10px 10px 0 !important;
206
+ padding: 0.75rem 1rem !important;
207
+ margin-bottom: 1rem !important;
208
+ }
209
+
210
+ .info-banner h3 {
211
+ margin: 0;
212
+ font-size: 1.1rem;
213
+ font-weight: 600;
214
+ color: #ECEFF4;
215
+ }
216
+
217
+ .info-banner .eval-tags {
218
+ display: flex;
219
+ flex-wrap: wrap;
220
+ gap: 0.375rem;
221
+ }
222
+
223
+ .info-banner .eval-tag {
224
+ background: rgba(143, 188, 187, 0.15);
225
+ border: 1px solid rgba(143, 188, 187, 0.3);
226
+ border-radius: 4px;
227
+ padding: 0.3rem 0.6rem;
228
+ font-size: 0.8rem;
229
+ font-family: 'JetBrains Mono', monospace;
230
+ color: #8FBCBB;
231
+ }
232
+
233
+ /* === Dataframe - seamless styling === */
234
+ .dataframe,
235
+ .dataframe > div,
236
+ .dataframe > div > div,
237
+ .dataframe .table-wrap,
238
+ .dataframe .svelte-1gfkn6j {
239
+ background: #2E3440 !important;
240
+ border: none !important;
241
+ box-shadow: none !important;
242
+ border-radius: 0 !important;
243
+ }
244
+
245
+ .dataframe table {
246
+ width: 100% !important;
247
+ border-collapse: collapse !important;
248
+ font-size: 0.95rem !important;
249
+ table-layout: auto !important;
250
+ background: #2E3440 !important;
251
+ }
252
+
253
+ .dataframe thead,
254
+ .dataframe thead tr {
255
+ background: #2E3440 !important;
256
+ position: sticky;
257
+ top: 0;
258
+ z-index: 10;
259
+ }
260
+
261
+ .dataframe thead th {
262
+ padding: 0.875rem 1rem !important;
263
+ font-weight: 600 !important;
264
+ font-size: 0.75rem !important;
265
+ text-transform: uppercase !important;
266
+ letter-spacing: 0.05em !important;
267
+ color: #81A1C1 !important;
268
+ border-bottom: 1px solid #434C5E !important;
269
+ border-top: none !important;
270
+ text-align: left !important;
271
+ background: #2E3440 !important;
272
+ }
273
+
274
+ .dataframe tbody,
275
+ .dataframe tbody tr {
276
+ background: #2E3440 !important;
277
+ }
278
+
279
+ .dataframe tbody tr {
280
+ border-bottom: 1px solid #3B4252 !important;
281
+ }
282
+
283
+ .dataframe tbody tr:hover {
284
+ background: rgba(136, 192, 208, 0.04) !important;
285
+ }
286
+
287
+ .dataframe tbody td {
288
+ padding: 0.75rem 1rem !important;
289
+ color: #E5E9F0 !important;
290
+ background: #2E3440 !important;
291
+ overflow: hidden !important;
292
+ text-overflow: ellipsis !important;
293
+ border: none !important;
294
+ }
295
+
296
+ /* === Pagination bar === */
297
+ .pagination-bar {
298
+ margin-top: 1rem !important;
299
+ padding: 1rem 0 !important;
300
+ border-top: 1px solid #3B4252 !important;
301
+ display: flex !important;
302
+ justify-content: center !important;
303
+ align-items: center !important;
304
+ gap: 1rem !important;
305
+ }
306
+
307
+ .page-info {
308
+ font-family: 'JetBrains Mono', monospace !important;
309
+ font-size: 1rem !important;
310
+ color: #D8DEE9 !important;
311
+ min-width: 80px !important;
312
+ text-align: center !important;
313
+ }
314
+
315
+ /* Model name - white, readable */
316
+ .dataframe tbody td:first-child {
317
+ font-weight: 500 !important;
318
+ color: #ECEFF4 !important;
319
+ white-space: nowrap !important;
320
+ }
321
+
322
+ /* All other columns - use monospace for numbers */
323
+ .dataframe tbody td:not(:first-child) {
324
+ font-family: 'JetBrains Mono', monospace !important;
325
+ color: #8FBCBB !important;
326
+ text-align: left !important;
327
+ }
328
+
329
+ .dataframe tbody td:nth-child(2) {
330
+ color: #88C0D0 !important;
331
+ white-space: nowrap !important;
332
+ }
333
+
334
+ .dataframe tbody td:nth-child(3) {
335
+ color: #D08770 !important;
336
+ }
337
+
338
+ .dataframe tbody td:nth-child(4) {
339
+ font-weight: 600 !important;
340
+ color: #A3BE8C !important;
341
+ }
342
+
343
+ .dataframe tbody td:nth-child(n+5) {
344
+ white-space: nowrap !important;
345
+ }
346
+
347
+ /* === Status text === */
348
+ .status-text {
349
+ font-size: 0.9rem !important;
350
+ color: #D8DEE9 !important;
351
+ padding: 0.5rem 0 !important;
352
+ font-family: 'JetBrains Mono', monospace !important;
353
+ }
354
+
355
+ /* === Model Card === */
356
+ .model-card-container {
357
+ display: flex;
358
+ flex-direction: column;
359
+ gap: 1.25rem;
360
+ }
361
+
362
+ .model-card-header {
363
+ background: #3B4252;
364
+ border: 1px solid #434C5E;
365
+ border-radius: 12px;
366
+ padding: 1.5rem 2rem;
367
+ }
368
+
369
+ .model-card-header h2 {
370
+ margin: 0 0 0.5rem 0;
371
+ font-size: 1.5rem;
372
+ font-weight: 600;
373
+ color: #ECEFF4;
374
+ }
375
+
376
+ .model-card-header .model-meta {
377
+ display: flex;
378
+ gap: 1.5rem;
379
+ color: #D8DEE9;
380
+ font-size: 0.95rem;
381
+ }
382
+
383
+ .model-card-header .model-meta strong {
384
+ color: #8FBCBB;
385
+ }
386
+
387
+ .leaderboard-section {
388
+ background: #3B4252;
389
+ border: 1px solid #434C5E;
390
+ border-radius: 10px;
391
+ overflow: hidden;
392
+ }
393
+
394
+ .leaderboard-section-header {
395
+ background: #434C5E;
396
+ padding: 1rem 1.25rem;
397
+ border-bottom: 1px solid #4C566A;
398
+ display: flex;
399
+ justify-content: space-between;
400
+ align-items: center;
401
+ }
402
+
403
+ .leaderboard-section-header h3 {
404
+ margin: 0;
405
+ font-size: 1rem;
406
+ font-weight: 600;
407
+ color: #88C0D0;
408
+ }
409
+
410
+ .leaderboard-section-header .lb-avg {
411
+ background: rgba(163, 190, 140, 0.15);
412
+ border: 1px solid rgba(163, 190, 140, 0.3);
413
+ border-radius: 8px;
414
+ padding: 0.5rem 1rem;
415
+ font-size: 0.85rem;
416
+ color: #D8DEE9;
417
+ }
418
+
419
+ .leaderboard-section-header .lb-avg strong {
420
+ color: #A3BE8C;
421
+ font-family: 'JetBrains Mono', monospace;
422
+ font-size: 1.1rem;
423
+ font-weight: 700;
424
+ }
425
+
426
+ .scores-grid {
427
+ display: grid;
428
+ grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
429
+ gap: 1px;
430
+ background: #434C5E;
431
+ }
432
+
433
+ .score-item {
434
+ background: #3B4252;
435
+ padding: 1rem 1.25rem;
436
+ }
437
+
438
+ .score-item .score-label {
439
+ font-size: 0.8rem;
440
+ text-transform: uppercase;
441
+ letter-spacing: 0.05em;
442
+ color: #D8DEE9;
443
+ margin-bottom: 0.375rem;
444
+ }
445
+
446
+ .score-item .score-value {
447
+ font-size: 1.5rem;
448
+ font-weight: 600;
449
+ font-family: 'JetBrains Mono', monospace;
450
+ color: #A3BE8C;
451
+ }
452
+
453
+ .score-item.highlight .score-value {
454
+ color: #88C0D0;
455
+ }
456
+
457
+ .no-results {
458
+ text-align: center;
459
+ padding: 3rem 1rem;
460
+ color: #D8DEE9;
461
+ }
462
+
463
+ .no-results h3 {
464
+ color: #ECEFF4;
465
+ margin-bottom: 0.5rem;
466
+ }
467
+
468
+
469
+ /* === New Comparison View === */
470
+ .comparison-container {
471
+ display: flex;
472
+ flex-direction: column;
473
+ gap: 1.5rem;
474
+ }
475
+
476
+ .comparison-summary {
477
+ background: #3B4252;
478
+ border: 1px solid #434C5E;
479
+ border-radius: 12px;
480
+ padding: 1.5rem;
481
+ }
482
+
483
+ .comparison-summary h2 {
484
+ margin: 0 0 1rem 0;
485
+ color: #ECEFF4;
486
+ font-size: 1.25rem;
487
+ }
488
+
489
+ .summary-cards {
490
+ display: flex;
491
+ gap: 1rem;
492
+ flex-wrap: wrap;
493
+ }
494
+
495
+ .summary-card {
496
+ flex: 1;
497
+ min-width: 200px;
498
+ background: #2E3440;
499
+ border-radius: 8px;
500
+ padding: 1rem;
501
+ }
502
+
503
+ .summary-card-header {
504
+ display: flex;
505
+ align-items: center;
506
+ gap: 0.5rem;
507
+ margin-bottom: 0.75rem;
508
+ }
509
+
510
+ .model-dot {
511
+ width: 10px;
512
+ height: 10px;
513
+ border-radius: 50%;
514
+ }
515
+
516
+ .model-name {
517
+ font-weight: 600;
518
+ color: #ECEFF4;
519
+ font-size: 0.9rem;
520
+ overflow: hidden;
521
+ text-overflow: ellipsis;
522
+ white-space: nowrap;
523
+ }
524
+
525
+ .summary-card-body {
526
+ display: flex;
527
+ flex-direction: column;
528
+ gap: 0.5rem;
529
+ }
530
+
531
+ .summary-stat {
532
+ display: flex;
533
+ justify-content: space-between;
534
+ align-items: center;
535
+ }
536
+
537
+ .summary-stat .stat-label {
538
+ font-size: 0.75rem;
539
+ color: #D8DEE9;
540
+ text-transform: uppercase;
541
+ letter-spacing: 0.05em;
542
+ }
543
+
544
+ .summary-stat .stat-value {
545
+ font-family: 'JetBrains Mono', monospace;
546
+ color: #8FBCBB;
547
+ }
548
+
549
+ .summary-stat.primary .stat-value.large {
550
+ font-size: 1.5rem;
551
+ font-weight: 700;
552
+ color: #A3BE8C;
553
+ }
554
+
555
+ .leaderboard-comparison-card {
556
+ background: #3B4252;
557
+ border: 1px solid #434C5E;
558
+ border-radius: 12px;
559
+ overflow: hidden;
560
+ }
561
+
562
+ .lb-card-header {
563
+ background: #434C5E;
564
+ padding: 0.875rem 1.25rem;
565
+ }
566
+
567
+ .lb-card-header h3 {
568
+ margin: 0;
569
+ color: #88C0D0;
570
+ font-size: 1rem;
571
+ font-weight: 600;
572
+ }
573
+
574
+ .lb-card-body {
575
+ padding: 1rem 1.25rem;
576
+ display: flex;
577
+ flex-direction: column;
578
+ gap: 0.75rem;
579
+ }
580
+
581
+ .metric-comparison {
582
+ display: flex;
583
+ flex-direction: column;
584
+ gap: 0.375rem;
585
+ }
586
+
587
+ .metric-name-row {
588
+ margin-bottom: 0.25rem;
589
+ }
590
+
591
+ .metric-title {
592
+ font-size: 0.85rem;
593
+ font-weight: 600;
594
+ color: #ECEFF4;
595
+ }
596
+
597
+ .metric-title.sub {
598
+ font-size: 0.75rem;
599
+ font-weight: 500;
600
+ color: #D8DEE9;
601
+ }
602
+
603
+ .model-score-row {
604
+ display: flex;
605
+ align-items: center;
606
+ gap: 0.5rem;
607
+ padding: 0.375rem 0;
608
+ }
609
+
610
+ .model-score-row.compact {
611
+ padding: 0.25rem 0;
612
+ }
613
+
614
+ .model-score-row.best-score {
615
+ background: rgba(163, 190, 140, 0.1);
616
+ border-radius: 4px;
617
+ padding-left: 0.5rem;
618
+ margin-left: -0.5rem;
619
+ }
620
+
621
+ .model-score-row.no-data {
622
+ opacity: 0.5;
623
+ }
624
+
625
+ .model-indicator {
626
+ width: 8px;
627
+ height: 8px;
628
+ border-radius: 2px;
629
+ flex-shrink: 0;
630
+ }
631
+
632
+ .model-indicator.small {
633
+ width: 6px;
634
+ height: 6px;
635
+ }
636
+
637
+ .score-bar-container {
638
+ flex: 1;
639
+ display: flex;
640
+ align-items: center;
641
+ gap: 0.75rem;
642
+ height: 24px;
643
+ background: #2E3440;
644
+ border-radius: 4px;
645
+ padding: 0 0.5rem;
646
+ position: relative;
647
+ }
648
+
649
+ .score-bar {
650
+ position: absolute;
651
+ left: 0;
652
+ top: 0;
653
+ bottom: 0;
654
+ border-radius: 4px;
655
+ opacity: 0.3;
656
+ }
657
+
658
+ .score-bar.thin {
659
+ opacity: 0.2;
660
+ }
661
+
662
+ .score-value {
663
+ position: relative;
664
+ font-family: 'JetBrains Mono', monospace;
665
+ font-size: 0.9rem;
666
+ font-weight: 600;
667
+ color: #ECEFF4;
668
+ z-index: 1;
669
+ }
670
+
671
+ .score-value.small {
672
+ font-size: 0.8rem;
673
+ font-weight: 500;
674
+ }
675
+
676
+ .score-value.dim {
677
+ color: #4C566A;
678
+ }
679
+
680
+ /* === Selected Models Chips === */
681
+ .selected-models-group label {
682
+ display: inline-flex !important;
683
+ align-items: center !important;
684
+ background: #434C5E;
685
+ border: 1px solid #4C566A;
686
+ border-radius: 16px;
687
+ padding: 0.35rem 0.85rem;
688
+ font-size: 0.85rem;
689
+ color: #ECEFF4;
690
+ gap: 0.4rem;
691
+ cursor: pointer;
692
+ margin: 0.15rem 0.3rem 0.15rem 0 !important;
693
+ }
694
+
695
+ .selected-models-group label span::before {
696
+ content: "Γ—";
697
+ font-size: 0.75rem;
698
+ color: #EBCB8B;
699
+ opacity: 0;
700
+ transition: opacity 0.15s ease;
701
+ }
702
+
703
+ .selected-models-group label:hover span::before {
704
+ opacity: 1;
705
+ }
706
+
707
+ .selected-models-group input[type="checkbox"] {
708
+ display: none;
709
+ }
710
+
711
+ /* === Heat Map Table === */
712
+ .heatmap-table-wrapper {
713
+ overflow-x: auto;
714
+ margin-top: 1rem;
715
+ }
716
+
717
+ .heatmap-table {
718
+ width: 100%;
719
+ border-collapse: collapse;
720
+ font-size: 0.85rem;
721
+ }
722
+
723
+ .heatmap-table thead {
724
+ position: sticky;
725
+ top: 0;
726
+ z-index: 10;
727
+ }
728
+
729
+ .heatmap-table th {
730
+ background: #434C5E;
731
+ padding: 0.625rem 0.75rem;
732
+ font-weight: 600;
733
+ font-size: 0.7rem;
734
+ text-transform: uppercase;
735
+ letter-spacing: 0.05em;
736
+ color: #81A1C1;
737
+ text-align: left;
738
+ border-bottom: 2px solid #4C566A;
739
+ white-space: nowrap;
740
+ }
741
+
742
+ .heatmap-table th.metric-header {
743
+ min-width: 120px;
744
+ }
745
+
746
+ .heatmap-table th.model-header {
747
+ text-align: center;
748
+ max-width: 150px;
749
+ overflow: hidden;
750
+ text-overflow: ellipsis;
751
+ }
752
+
753
+ .heatmap-table td {
754
+ padding: 0.5rem 0.75rem;
755
+ border-bottom: 1px solid #3B4252;
756
+ }
757
+
758
+ .heatmap-table td.metric-name {
759
+ font-weight: 500;
760
+ color: #D8DEE9;
761
+ background: #2E3440;
762
+ }
763
+
764
+ .heatmap-table td.score-cell {
765
+ text-align: center;
766
+ font-family: 'JetBrains Mono', monospace;
767
+ font-weight: 500;
768
+ transition: all 0.15s ease;
769
+ }
770
+
771
+ .heatmap-table td.score-cell.best {
772
+ background: rgba(163, 190, 140, 0.25);
773
+ color: #A3BE8C;
774
+ font-weight: 700;
775
+ }
776
+
777
+ .heatmap-table td.score-cell.good {
778
+ background: rgba(163, 190, 140, 0.12);
779
+ color: #A3BE8C;
780
+ }
781
+
782
+ .heatmap-table td.score-cell.mid {
783
+ background: rgba(235, 203, 139, 0.12);
784
+ color: #EBCB8B;
785
+ }
786
+
787
+ .heatmap-table td.score-cell.low {
788
+ background: rgba(208, 135, 112, 0.12);
789
+ color: #D08770;
790
+ }
791
+
792
+ .heatmap-table td.score-cell.worst {
793
+ background: rgba(191, 97, 106, 0.15);
794
+ color: #BF616A;
795
+ }
796
+
797
+ .heatmap-table td.score-cell.na {
798
+ color: #4C566A;
799
+ font-style: italic;
800
+ }
801
+
802
+ .heatmap-table tr.avg-row {
803
+ background: rgba(136, 192, 208, 0.08);
804
+ }
805
+
806
+ .heatmap-table tr.avg-row td.metric-name {
807
+ font-weight: 700;
808
+ color: #88C0D0;
809
+ background: rgba(136, 192, 208, 0.08);
810
+ }
811
+
812
+ /* === Buttons === */
813
+ button {
814
+ border-radius: 8px !important;
815
+ font-weight: 500 !important;
816
+ font-size: 0.95rem !important;
817
+ transition: all 0.15s ease !important;
818
+ }
819
+
820
+ button.primary {
821
+ background: #88C0D0 !important;
822
+ color: #2E3440 !important;
823
+ border: none !important;
824
+ }
825
+
826
+ button.primary:hover:not(:disabled) {
827
+ background: #8FBCBB !important;
828
+ }
829
+
830
+ button.secondary,
831
+ button[variant="secondary"] {
832
+ background: #434C5E !important;
833
+ color: #ECEFF4 !important;
834
+ border: 1px solid #4C566A !important;
835
+ }
836
+
837
+ button.secondary:hover:not(:disabled),
838
+ button[variant="secondary"]:hover:not(:disabled) {
839
+ background: #4C566A !important;
840
+ }
841
+
842
+ button:disabled {
843
+ opacity: 0.35 !important;
844
+ }
845
+
846
+ /* === Inputs === */
847
+ input[type="text"],
848
+ select {
849
+ background: #2E3440 !important;
850
+ border: 1px solid #4C566A !important;
851
+ border-radius: 8px !important;
852
+ color: #ECEFF4 !important;
853
+ font-size: 1rem !important;
854
+ }
855
+
856
+ input[type="text"]:focus,
857
+ select:focus {
858
+ border-color: #88C0D0 !important;
859
+ box-shadow: 0 0 0 3px rgba(136, 192, 208, 0.15) !important;
860
+ outline: none !important;
861
+ }
862
+
863
+ input::placeholder {
864
+ color: #4C566A !important;
865
+ }
866
+
867
+ /* === Accordion === */
868
+ .accordion {
869
+ background: #3B4252 !important;
870
+ border: 1px solid #434C5E !important;
871
+ border-radius: 10px !important;
872
+ margin-top: 1.5rem !important;
873
+ }
874
+
875
+ .accordion > .label-wrap {
876
+ background: transparent !important;
877
+ padding: 1rem 1.25rem !important;
878
+ color: #D8DEE9 !important;
879
+ font-size: 0.95rem !important;
880
+ }
881
+
882
+ .accordion > .wrap {
883
+ padding: 0.5rem 1.25rem 1.25rem !important;
884
+ color: #D8DEE9 !important;
885
+ font-size: 0.95rem !important;
886
+ line-height: 1.6 !important;
887
+ }
888
+
889
+ .accordion code {
890
+ background: #434C5E !important;
891
+ padding: 0.125rem 0.375rem !important;
892
+ border-radius: 4px !important;
893
+ font-family: 'JetBrains Mono', monospace !important;
894
+ font-size: 0.8rem !important;
895
+ color: #8FBCBB !important;
896
+ }
897
+
898
+ /* === Metrics section === */
899
+ .metrics-section {
900
+ margin-top: 1.5rem;
901
+ padding-top: 1.5rem;
902
+ border-top: 1px solid #434C5E;
903
+ }
904
+
905
+ .metrics-section h3 {
906
+ font-size: 0.85rem;
907
+ font-weight: 600;
908
+ color: #D8DEE9;
909
+ margin: 0 0 1rem 0;
910
+ text-transform: uppercase;
911
+ letter-spacing: 0.05em;
912
+ }
913
+
914
+ .metrics-grid {
915
+ display: grid;
916
+ grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
917
+ gap: 0.75rem;
918
+ }
919
+
920
+ .metric-card {
921
+ background: #3B4252;
922
+ border: 1px solid #434C5E;
923
+ border-radius: 8px;
924
+ overflow: hidden;
925
+ }
926
+
927
+ .metric-card-header {
928
+ display: flex;
929
+ justify-content: space-between;
930
+ align-items: center;
931
+ padding: 0.75rem 1rem;
932
+ cursor: pointer;
933
+ list-style: none;
934
+ }
935
+
936
+ .metric-card-header::-webkit-details-marker {
937
+ display: none;
938
+ }
939
+
940
+ .metric-card-name {
941
+ font-weight: 500;
942
+ font-size: 0.95rem;
943
+ color: #ECEFF4;
944
+ }
945
+
946
+ .metric-card-direction {
947
+ font-size: 0.8rem;
948
+ color: #D8DEE9;
949
+ }
950
+
951
+ .metric-card-direction .arrow {
952
+ color: #A3BE8C;
953
+ font-weight: 600;
954
+ }
955
+
956
+ .metric-card-body {
957
+ padding: 0.875rem 1.25rem;
958
+ border-top: 1px solid #434C5E;
959
+ font-size: 0.9rem;
960
+ color: #D8DEE9;
961
+ line-height: 1.5;
962
+ }
963
+
964
+ .metric-type-badge {
965
+ font-size: 0.65rem;
966
+ text-transform: uppercase;
967
+ letter-spacing: 0.05em;
968
+ padding: 0.15rem 0.4rem;
969
+ background: rgba(180, 142, 173, 0.2);
970
+ border: 1px solid rgba(180, 142, 173, 0.35);
971
+ border-radius: 4px;
972
+ color: #B48EAD;
973
+ font-family: 'JetBrains Mono', monospace;
974
+ }
975
+
976
+ /* === Scrollbar === */
977
+ ::-webkit-scrollbar {
978
+ width: 8px;
979
+ height: 8px;
980
+ }
981
+
982
+ ::-webkit-scrollbar-track {
983
+ background: #2E3440;
984
+ }
985
+
986
+ ::-webkit-scrollbar-thumb {
987
+ background: #4C566A;
988
+ border-radius: 4px;
989
+ }
990
+
991
+ ::-webkit-scrollbar-thumb:hover {
992
+ background: #5E81AC;
993
+ }
994
+
995
+ /* === Responsive === */
996
+ @media (max-width: 768px) {
997
+ .gradio-container {
998
+ padding: 1rem !important;
999
+ }
1000
+
1001
+ .scores-grid {
1002
+ grid-template-columns: repeat(2, 1fr);
1003
+ }
1004
+ }
1005
+
1006
+ /* === Overrides === */
1007
+ .gradio-container footer {
1008
+ display: none !important;
1009
+ }
1010
+
1011
+ .block {
1012
+ background: #3B4252 !important;
1013
+ }
1014
+
1015
+ .gradio-radio label {
1016
+ background: #434C5E !important;
1017
+ border: 1px solid #4C566A !important;
1018
+ color: #ECEFF4 !important;
1019
+ border-radius: 8px !important;
1020
+ font-size: 0.85rem !important;
1021
+ }
1022
+
1023
+ .gradio-radio label.selected {
1024
+ background: #88C0D0 !important;
1025
+ border-color: #88C0D0 !important;
1026
+ color: #2E3440 !important;
1027
+ }
1028
+ """
1029
+
1030
+
1031
def format_leaderboard_header(selected_leaderboard, metadata):
    """Render the info banner for the currently selected leaderboard."""
    # Nothing selected yet: show a neutral prompt instead of a banner.
    if not selected_leaderboard:
        return """
        <div style="text-align: center; padding: 2rem 1rem; color: #D8DEE9;">
            <div style="font-size: 1.1rem;">Select a leaderboard to explore</div>
        </div>
        """

    evals = (metadata or {}).get("evals")
    if not evals:
        # Metadata missing or empty: fall back to a bare title banner.
        return f"""
        <div class="info-banner">
            <h3>{selected_leaderboard}</h3>
        </div>
        """

    src = metadata.get("source_info", {})
    organization = src.get("organization", "Unknown")
    source_url = src.get("url", "#")

    # One small tag per eval available on this leaderboard.
    tag_markup = "".join(f'<span class="eval-tag">{name}</span>' for name in evals)

    return f"""
    <div class="info-banner">
        <div style="display: flex; justify-content: space-between; align-items: center; gap: 1rem;">
            <div style="display: flex; align-items: center; gap: 1rem; flex-wrap: wrap;">
                <h3 style="margin: 0;">{selected_leaderboard}</h3>
                <span style="color: #D8DEE9; font-size: 0.8rem;">by {organization}</span>
                <div class="eval-tags" style="margin: 0;">{tag_markup}</div>
            </div>
            <a href="{source_url}" target="_blank" style="
                font-size: 0.75rem;
                color: #88C0D0;
                text-decoration: none;
                padding: 0.375rem 0.75rem;
                border: 1px solid rgba(136, 192, 208, 0.4);
                border-radius: 6px;
                white-space: nowrap;
            ">Source →</a>
        </div>
    </div>
    """
1074
+
1075
+
1076
def format_metric_details(selected_leaderboard, metadata):
    """Render an expandable reference card for each metric of the leaderboard."""
    if not selected_leaderboard:
        return ""
    if not metadata or not metadata.get("evals"):
        return ""

    # Assemble the markup as a list of fragments and join once at the end.
    parts = ["""
    <div class="metrics-section">
        <h3>Metric Reference</h3>
        <div class="metrics-grid">
    """]

    for name, meta in metadata.get("evals", {}).items():
        if meta.get('score_type'):
            badge = meta['score_type'].upper()
        else:
            badge = "—"

        if meta.get('lower_is_better'):
            direction, arrow = "Lower is better", "↓"
        else:
            direction, arrow = "Higher is better", "↑"

        # Extra detail line: numeric range for continuous metrics,
        # level names for leveled ones, otherwise empty.
        range_note = ""
        kind = meta.get('score_type')
        if kind == "continuous" and meta.get('min_score') is not None:
            range_note = f"Range: [{meta['min_score']} – {meta['max_score']}]"
        elif kind == "levels" and meta.get('level_names'):
            range_note = f"Levels: {', '.join(str(l) for l in meta['level_names'])}"

        parts.append(f"""
        <details class="metric-card">
            <summary class="metric-card-header">
                <span class="metric-card-name">{name}</span>
                <span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>
            </summary>
            <div class="metric-card-body">
                <div>{meta.get('description', 'No description')}</div>
                <div style="display: flex; justify-content: space-between; align-items: center; margin-top: 0.5rem;">
                    <span style="font-size: 0.75rem; color: #D8DEE9;">{range_note}</span>
                    <span class="metric-type-badge">{badge}</span>
                </div>
            </div>
        </details>
        """)

    parts.append("</div></div>")
    return "".join(parts)
1118
+
1119
+
1120
def format_model_card(model_name, model_data):
    """Formats a model card showing all evals across leaderboards.

    Args:
        model_name: Display name of the model.
        model_data: Mapping of leaderboard name -> data dict carrying model
            metadata ("developer", "params", "architecture") plus a "results"
            mapping of metric name -> score (None when the eval is missing).

    Returns:
        An HTML fragment (str); a "no results" placeholder when model_data
        is empty.
    """
    if not model_data:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try searching for a different model name</p>
        </div>
        """

    # Model metadata is assumed identical across leaderboards, so read it
    # from the first entry. TODO confirm the loaders guarantee this.
    first = next(iter(model_data.values()))
    developer = first.get("developer", "Unknown")
    params = first.get("params")
    arch = first.get("architecture", "Unknown")

    params_str = f"{params}B" if params else "—"

    html = f"""
    <div class="model-card-container">
        <div class="model-card-header">
            <h2>{model_name}</h2>
            <div class="model-meta">
                <span><strong>Developer:</strong> {developer}</span>
                <span><strong>Parameters:</strong> {params_str}</span>
                <span><strong>Architecture:</strong> {arch}</span>
            </div>
        </div>
    """

    for leaderboard_name, data in model_data.items():
        results = data.get("results", {})
        if not results:
            continue

        scores = [v for v in results.values() if v is not None]
        avg = sum(scores) / len(scores) if scores else None
        # Fix: compare against None rather than truthiness so a legitimate
        # average of 0.0 renders as "0.00" instead of an em dash (matches
        # how format_model_comparison handles the same case).
        avg_str = f"{avg:.2f}" if avg is not None else "—"

        html += f"""
        <div class="leaderboard-section">
            <div class="leaderboard-section-header">
                <h3>{leaderboard_name}</h3>
                <span class="lb-avg">Avg: <strong>{avg_str}</strong></span>
            </div>
            <div class="scores-grid">
        """

        # Sort best-first; metrics with no score sink to the bottom (below
        # negative scores) instead of masquerading as a score of 0.
        sorted_results = sorted(
            results.items(),
            key=lambda item: item[1] if item[1] is not None else float("-inf"),
            reverse=True,
        )

        for i, (metric_name, score) in enumerate(sorted_results):
            score_display = f"{score:.2f}" if score is not None else "—"
            # The top-scoring metric gets a visual highlight.
            highlight_class = "highlight" if i == 0 else ""

            html += f"""
            <div class="score-item {highlight_class}">
                <div class="score-label">{metric_name}</div>
                <div class="score-value">{score_display}</div>
            </div>
            """

        html += "</div></div>"

    html += "</div>"
    return html
1184
+
1185
+
1186
def format_model_comparison(selected_models, all_results):
    """Formats a comparison view showing multiple models with visual indicators.

    Args:
        selected_models: Model names picked in the UI; their order drives both
            column order in the heat-map tables and color assignment.
        all_results: Mapping of model name -> {leaderboard name -> data dict},
            where each data dict holds model metadata plus a "results" mapping
            of metric name -> score (None when the eval is missing).

    Returns:
        An HTML fragment (str): per-model summary cards followed by one
        heat-map table per leaderboard; a placeholder when nothing matches.
    """
    if not selected_models or not all_results:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """

    # Get all unique leaderboards across selected models
    all_leaderboards = set()
    model_data_dict = {}

    for model_name in selected_models:
        if model_name in all_results:
            model_data_dict[model_name] = all_results[model_name]
            for leaderboard_name in all_results[model_name].keys():
                all_leaderboards.add(leaderboard_name)

    if not model_data_dict:
        return """
        <div class="no-results">
            <h3>No data found for selected models</h3>
            <p>Try selecting different models</p>
        </div>
        """

    all_leaderboards = sorted(all_leaderboards)
    # Nord palette; colors cycle when more than 8 models are selected.
    model_colors = ['#88C0D0', '#A3BE8C', '#EBCB8B', '#D08770', '#B48EAD', '#8FBCBB', '#81A1C1', '#BF616A']

    # Calculate overall averages for summary.
    # NOTE(review): this averages raw scores across every metric of every
    # leaderboard, mixing scales and ignoring any lower-is-better metrics —
    # confirm this blend is the intended headline number.
    overall_avgs = {}
    for model_name in selected_models:
        if model_name in model_data_dict:
            all_scores = []
            for lb_data in model_data_dict[model_name].values():
                all_scores.extend([v for v in lb_data.get("results", {}).values() if v is not None])
            overall_avgs[model_name] = sum(all_scores) / len(all_scores) if all_scores else None

    html = """
    <div class="comparison-container">
        <div class="comparison-summary">
            <h2>Model Comparison</h2>
            <div class="summary-cards">
    """

    # Summary cards for each model (selected models without data still get a
    # card, showing "Unknown" / an em dash).
    for i, model_name in enumerate(selected_models):
        color = model_colors[i % len(model_colors)]
        avg = overall_avgs.get(model_name)
        avg_str = f"{avg:.2f}" if avg is not None else "—"

        # Get model info; developer is read from the first leaderboard entry.
        model_info = list(model_data_dict.get(model_name, {}).values())
        developer = model_info[0].get("developer", "Unknown") if model_info else "Unknown"

        html += f"""
        <div class="summary-card" style="border-left: 4px solid {color};">
            <div class="summary-card-header">
                <span class="model-dot" style="background: {color};"></span>
                <span class="model-name">{model_name}</span>
            </div>
            <div class="summary-card-body">
                <div class="summary-stat">
                    <span class="stat-label">Developer</span>
                    <span class="stat-value">{developer}</span>
                </div>
                <div class="summary-stat primary">
                    <span class="stat-label">Overall Avg</span>
                    <span class="stat-value large">{avg_str}</span>
                </div>
            </div>
        </div>
        """

    html += """
            </div>
        </div>
    """

    # Leaderboard comparison cards: one heat-map table per leaderboard.
    for leaderboard_name in all_leaderboards:
        # Union of metric names any selected model has on this leaderboard.
        leaderboard_metrics = set()
        for model_data in model_data_dict.values():
            if leaderboard_name in model_data:
                results = model_data[leaderboard_name].get("results", {})
                leaderboard_metrics.update(results.keys())

        leaderboard_metrics = sorted(leaderboard_metrics)
        if not leaderboard_metrics:
            continue

        # Calculate per-model averages for the leaderboard's "Average" row.
        model_avgs = {}
        for model_name in selected_models:
            if model_name in model_data_dict and leaderboard_name in model_data_dict[model_name]:
                results = model_data_dict[model_name][leaderboard_name].get("results", {})
                scores = [v for v in results.values() if v is not None]
                model_avgs[model_name] = sum(scores) / len(scores) if scores else None

        html += f"""
        <div class="leaderboard-comparison-card">
            <div class="lb-card-header">
                <h3>{leaderboard_name}</h3>
            </div>
            <div class="lb-card-body">
        """

        # Compact heat-map table
        html += '<div class="heatmap-table-wrapper">'
        html += '<table class="heatmap-table">'

        # Header with model names (full name preserved in the title tooltip).
        html += '<thead><tr><th class="metric-header">Metric</th>'
        for i, model_name in enumerate(selected_models):
            # Truncate long names
            short_name = model_name if len(model_name) <= 20 else model_name[:18] + "…"
            html += f'<th class="model-header" title="{model_name}">{short_name}</th>'
        html += '</tr></thead>'

        html += '<tbody>'

        # Average row first; only highlight "best" when >1 model has data.
        html += '<tr class="avg-row"><td class="metric-name">Average</td>'
        valid_avgs_list = [model_avgs.get(m) for m in selected_models if model_avgs.get(m) is not None]
        max_avg_val = max(valid_avgs_list) if valid_avgs_list else None

        for model_name in selected_models:
            avg = model_avgs.get(model_name)
            if avg is not None:
                cell_class = "best" if avg == max_avg_val and len(valid_avgs_list) > 1 else ""
                html += f'<td class="score-cell {cell_class}">{avg:.2f}</td>'
            else:
                html += '<td class="score-cell na">—</td>'
        html += '</tr>'

        # Individual metric rows
        for metric_name in leaderboard_metrics:
            html += f'<tr><td class="metric-name">{metric_name}</td>'

            # Get all scores for this metric
            metric_scores = {}
            for model_name in selected_models:
                if model_name in model_data_dict and leaderboard_name in model_data_dict[model_name]:
                    results = model_data_dict[model_name][leaderboard_name].get("results", {})
                    metric_scores[model_name] = results.get(metric_name)

            # Normalization span for the color buckets; a degenerate range
            # (all scores equal) uses 1 so pct lands on the top bucket.
            valid_scores = [v for v in metric_scores.values() if v is not None]
            if valid_scores:
                max_score = max(valid_scores)
                min_score = min(valid_scores)
                score_range = max_score - min_score if max_score > min_score else 1
            else:
                max_score = min_score = score_range = None

            for model_name in selected_models:
                score = metric_scores.get(model_name)
                if score is not None and score_range is not None:
                    # Determine color class based on relative position.
                    # NOTE(review): buckets assume higher is better; the
                    # lower_is_better metadata is not consulted here — confirm.
                    # "best" relies on exact float equality with max_score,
                    # which holds because the value comes from the same list.
                    if len(valid_scores) > 1:
                        pct = (score - min_score) / score_range if score_range > 0 else 1
                        if score == max_score:
                            cell_class = "best"
                        elif pct >= 0.75:
                            cell_class = "good"
                        elif pct >= 0.5:
                            cell_class = "mid"
                        elif pct >= 0.25:
                            cell_class = "low"
                        else:
                            cell_class = "worst"
                    else:
                        cell_class = ""
                    html += f'<td class="score-cell {cell_class}">{score:.2f}</td>'
                else:
                    html += '<td class="score-cell na">—</td>'

            html += '</tr>'

        html += '</tbody></table></div>'

        html += """
            </div>
        </div>
        """

    html += "</div>"
    return html