Spaces:
Running
Running
Commit
Β·
a92080e
1
Parent(s):
149ce10
moving to EEE hf org
Browse files- .gitignore +9 -0
- app.py +499 -0
- data_loader.py +386 -0
- eval.schema.json +282 -0
- hf_operations.py +202 -0
- pyproject.toml +10 -0
- ui_components.py +1374 -0
.gitignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.DS_Store
|
| 2 |
+
.secrets
|
| 3 |
+
.actrc
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.pyc
|
| 6 |
+
parquet_output/
|
| 7 |
+
*.venv*
|
| 8 |
+
*.md
|
| 9 |
+
*.ipynb_checkpoints
|
app.py
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation Leaderboard - Gradio Interface
|
| 3 |
+
Displays model evaluation results from HuggingFace datasets.
|
| 4 |
+
"""
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
from data_loader import (
|
| 10 |
+
load_hf_dataset_on_startup,
|
| 11 |
+
get_available_leaderboards,
|
| 12 |
+
get_eval_metadata,
|
| 13 |
+
build_leaderboard_table,
|
| 14 |
+
clear_cache,
|
| 15 |
+
search_model_across_leaderboards,
|
| 16 |
+
get_all_model_names,
|
| 17 |
+
DATA_DIR
|
| 18 |
+
)
|
| 19 |
+
from ui_components import (
|
| 20 |
+
get_theme,
|
| 21 |
+
get_custom_css,
|
| 22 |
+
format_leaderboard_header,
|
| 23 |
+
format_metric_details,
|
| 24 |
+
format_model_card,
|
| 25 |
+
format_model_comparison,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
PAGE_SIZE = 50
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1, sort_column=None, selected_columns=None, progress=gr.Progress()):
|
| 32 |
+
"""Loads and aggregates data for the selected leaderboard."""
|
| 33 |
+
if not selected_leaderboard:
|
| 34 |
+
return (
|
| 35 |
+
pd.DataFrame(),
|
| 36 |
+
format_leaderboard_header(None, {}),
|
| 37 |
+
format_metric_details(None, {}),
|
| 38 |
+
gr.update(choices=[], value=None),
|
| 39 |
+
gr.update(interactive=False),
|
| 40 |
+
gr.update(interactive=False),
|
| 41 |
+
gr.update(choices=[], value=None),
|
| 42 |
+
"0 / 0",
|
| 43 |
+
gr.update(choices=[], value=[]),
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
metadata = get_eval_metadata(selected_leaderboard)
|
| 47 |
+
|
| 48 |
+
def progress_callback(value, desc):
|
| 49 |
+
progress(value, desc=desc)
|
| 50 |
+
|
| 51 |
+
df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
|
| 52 |
+
|
| 53 |
+
# Get all available columns BEFORE filtering (for column selector)
|
| 54 |
+
all_available_columns = list(df.columns) if not df.empty else []
|
| 55 |
+
|
| 56 |
+
# Filter columns if selected (if None or empty, show all columns)
|
| 57 |
+
if selected_columns is not None and len(selected_columns) > 0:
|
| 58 |
+
# Ensure Model column is always included
|
| 59 |
+
base_cols = ["Model"]
|
| 60 |
+
available_cols = list(df.columns)
|
| 61 |
+
cols_to_show = [col for col in base_cols if col in available_cols]
|
| 62 |
+
# Add Developer and other selected columns
|
| 63 |
+
cols_to_show.extend([col for col in selected_columns if col in available_cols and col not in cols_to_show])
|
| 64 |
+
if cols_to_show:
|
| 65 |
+
df = df[cols_to_show]
|
| 66 |
+
|
| 67 |
+
if search_query and not df.empty:
|
| 68 |
+
mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
|
| 69 |
+
df = df[mask]
|
| 70 |
+
|
| 71 |
+
filtered_count = len(df)
|
| 72 |
+
|
| 73 |
+
if sort_column and sort_column in df.columns and not df.empty:
|
| 74 |
+
df = df.sort_values(by=sort_column, ascending=False, na_position='last')
|
| 75 |
+
|
| 76 |
+
total_pages = max(1, (filtered_count + PAGE_SIZE - 1) // PAGE_SIZE) if filtered_count > 0 else 1
|
| 77 |
+
current_page = max(1, min(current_page, total_pages))
|
| 78 |
+
|
| 79 |
+
start_idx = (current_page - 1) * PAGE_SIZE
|
| 80 |
+
end_idx = start_idx + PAGE_SIZE
|
| 81 |
+
df_paginated = df.iloc[start_idx:end_idx] if not df.empty else df
|
| 82 |
+
|
| 83 |
+
page_choices = [str(i) for i in range(1, total_pages + 1)]
|
| 84 |
+
page_dropdown = gr.update(choices=page_choices, value=str(current_page))
|
| 85 |
+
prev_btn = gr.update(interactive=(current_page > 1))
|
| 86 |
+
next_btn = gr.update(interactive=(current_page < total_pages))
|
| 87 |
+
page_info = f"{current_page} / {total_pages}"
|
| 88 |
+
|
| 89 |
+
sort_choices = list(df.columns) if not df.empty else []
|
| 90 |
+
default_sort = sort_column if sort_column and sort_column in sort_choices else ("Average" if "Average" in sort_choices else (sort_choices[0] if sort_choices else None))
|
| 91 |
+
sort_column_update = gr.update(choices=sort_choices, value=default_sort)
|
| 92 |
+
|
| 93 |
+
# Get all available columns for column selector (use full list, not filtered)
|
| 94 |
+
# Include all columns except Model in the selector (Model is always shown)
|
| 95 |
+
column_choices = [col for col in all_available_columns if col != "Model"]
|
| 96 |
+
# Preserve current selection, or default to all columns if None or empty
|
| 97 |
+
if selected_columns is None or len(selected_columns) == 0:
|
| 98 |
+
column_value = column_choices
|
| 99 |
+
else:
|
| 100 |
+
# Preserve user's selection, filtering out any invalid choices
|
| 101 |
+
column_value = [col for col in selected_columns if col in column_choices]
|
| 102 |
+
column_selector_update = gr.update(choices=column_choices, value=column_value)
|
| 103 |
+
|
| 104 |
+
return (
|
| 105 |
+
df_paginated,
|
| 106 |
+
format_leaderboard_header(selected_leaderboard, metadata),
|
| 107 |
+
format_metric_details(selected_leaderboard, metadata),
|
| 108 |
+
page_dropdown,
|
| 109 |
+
prev_btn,
|
| 110 |
+
next_btn,
|
| 111 |
+
sort_column_update,
|
| 112 |
+
page_info,
|
| 113 |
+
column_selector_update,
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def search_model(model_query):
|
| 118 |
+
"""Search for a model and return formatted card."""
|
| 119 |
+
if not model_query or len(model_query) < 2:
|
| 120 |
+
return """
|
| 121 |
+
<div class="no-results">
|
| 122 |
+
<h3>Search for a model</h3>
|
| 123 |
+
<p>Enter a model name to see its benchmarks across all leaderboards</p>
|
| 124 |
+
</div>
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
results, _ = search_model_across_leaderboards(model_query)
|
| 128 |
+
|
| 129 |
+
if not results:
|
| 130 |
+
return f"""
|
| 131 |
+
<div class="no-results">
|
| 132 |
+
<h3>No results for "{model_query}"</h3>
|
| 133 |
+
<p>Try a different model name or check the spelling</p>
|
| 134 |
+
</div>
|
| 135 |
+
"""
|
| 136 |
+
|
| 137 |
+
# Use the first matching model
|
| 138 |
+
model_name = list(results.keys())[0]
|
| 139 |
+
model_data = results[model_name]
|
| 140 |
+
|
| 141 |
+
return format_model_card(model_name, model_data)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def compare_models(selected_models):
|
| 145 |
+
"""Compare multiple selected models."""
|
| 146 |
+
if not selected_models or len(selected_models) == 0:
|
| 147 |
+
return """
|
| 148 |
+
<div class="no-results">
|
| 149 |
+
<h3>Select models to compare</h3>
|
| 150 |
+
<p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
|
| 151 |
+
</div>
|
| 152 |
+
"""
|
| 153 |
+
|
| 154 |
+
# Get data for all selected models
|
| 155 |
+
all_results = {}
|
| 156 |
+
for model_name in selected_models:
|
| 157 |
+
results, _ = search_model_across_leaderboards(model_name)
|
| 158 |
+
if results:
|
| 159 |
+
# Use the first matching model (exact match preferred)
|
| 160 |
+
matched_model = list(results.keys())[0]
|
| 161 |
+
all_results[matched_model] = results[matched_model]
|
| 162 |
+
|
| 163 |
+
if len(all_results) == 1:
|
| 164 |
+
# Single model - show card view
|
| 165 |
+
model_name = list(all_results.keys())[0]
|
| 166 |
+
return format_model_card(model_name, all_results[model_name])
|
| 167 |
+
elif len(all_results) > 1:
|
| 168 |
+
# Multiple models - show comparison
|
| 169 |
+
return format_model_comparison(list(all_results.keys()), all_results)
|
| 170 |
+
else:
|
| 171 |
+
return """
|
| 172 |
+
<div class="no-results">
|
| 173 |
+
<h3>No results found</h3>
|
| 174 |
+
<p>Try selecting different models</p>
|
| 175 |
+
</div>
|
| 176 |
+
"""
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def get_model_suggestions(query):
|
| 180 |
+
"""Get model name suggestions for autocomplete."""
|
| 181 |
+
if not query or len(query) < 2:
|
| 182 |
+
return gr.update(choices=[])
|
| 183 |
+
|
| 184 |
+
_, matches = search_model_across_leaderboards(query)
|
| 185 |
+
return gr.update(choices=matches[:15])
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# Load data at startup
|
| 189 |
+
load_hf_dataset_on_startup()
|
| 190 |
+
|
| 191 |
+
# Build interface
|
| 192 |
+
with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
|
| 193 |
+
|
| 194 |
+
# Header
|
| 195 |
+
gr.HTML("""
|
| 196 |
+
<div class="app-header">
|
| 197 |
+
<div class="logo-mark">EΒ³</div>
|
| 198 |
+
<div class="brand">
|
| 199 |
+
<h1>Every Eval Ever</h1>
|
| 200 |
+
<span class="tagline">Browse and compare model benchmarks</span>
|
| 201 |
+
</div>
|
| 202 |
+
<div class="header-right">
|
| 203 |
+
<span class="version-badge">beta</span>
|
| 204 |
+
</div>
|
| 205 |
+
</div>
|
| 206 |
+
""")
|
| 207 |
+
|
| 208 |
+
with gr.Tabs():
|
| 209 |
+
# === TAB 1: Leaderboard View ===
|
| 210 |
+
with gr.TabItem("π Leaderboards"):
|
| 211 |
+
with gr.Row(elem_classes="controls-bar"):
|
| 212 |
+
initial_choices = get_available_leaderboards()
|
| 213 |
+
initial_value = initial_choices[0] if initial_choices else None
|
| 214 |
+
|
| 215 |
+
with gr.Column(scale=2, min_width=200):
|
| 216 |
+
leaderboard_selector = gr.Dropdown(
|
| 217 |
+
choices=initial_choices,
|
| 218 |
+
value=initial_value,
|
| 219 |
+
label="Leaderboard",
|
| 220 |
+
interactive=True
|
| 221 |
+
)
|
| 222 |
+
with gr.Column(scale=3, min_width=250):
|
| 223 |
+
search_box = gr.Textbox(
|
| 224 |
+
label="Filter",
|
| 225 |
+
placeholder="Filter models...",
|
| 226 |
+
show_label=True
|
| 227 |
+
)
|
| 228 |
+
with gr.Column(scale=1, min_width=100):
|
| 229 |
+
refresh_btn = gr.Button("β» Refresh", variant="secondary", size="sm")
|
| 230 |
+
|
| 231 |
+
init_df, init_header, init_metrics, init_page_dropdown, init_prev, init_next, init_sort_cols, init_page_info, init_column_selector = update_leaderboard_table(initial_value, "", 1, "Average", None)
|
| 232 |
+
|
| 233 |
+
header_view = gr.HTML(value=init_header)
|
| 234 |
+
|
| 235 |
+
# Hidden sort state (default to Average)
|
| 236 |
+
sort_column_dropdown = gr.Dropdown(
|
| 237 |
+
choices=init_sort_cols.get("choices", []) if hasattr(init_sort_cols, 'get') else [],
|
| 238 |
+
value=init_sort_cols.get("value") if hasattr(init_sort_cols, 'get') else None,
|
| 239 |
+
visible=False,
|
| 240 |
+
)
|
| 241 |
+
|
| 242 |
+
# Column selector
|
| 243 |
+
with gr.Row(elem_classes="controls-bar"):
|
| 244 |
+
column_selector = gr.CheckboxGroup(
|
| 245 |
+
choices=init_column_selector.get("choices", []) if isinstance(init_column_selector, dict) else [],
|
| 246 |
+
value=init_column_selector.get("value", []) if isinstance(init_column_selector, dict) else [],
|
| 247 |
+
label="Columns to Display",
|
| 248 |
+
interactive=True,
|
| 249 |
+
show_label=True,
|
| 250 |
+
)
|
| 251 |
+
|
| 252 |
+
leaderboard_table = gr.Dataframe(
|
| 253 |
+
value=init_df,
|
| 254 |
+
label=None,
|
| 255 |
+
interactive=False,
|
| 256 |
+
wrap=False,
|
| 257 |
+
elem_classes="dataframe",
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
# Pagination below table - centered
|
| 261 |
+
with gr.Row(elem_classes="pagination-bar"):
|
| 262 |
+
prev_btn = gr.Button("β", variant="secondary", size="sm", min_width=60)
|
| 263 |
+
page_info = gr.Markdown(value=init_page_info, elem_classes="page-info")
|
| 264 |
+
next_btn = gr.Button("β", variant="secondary", size="sm", min_width=60)
|
| 265 |
+
# Extract choices and value from gr.update() dict, ensuring value is in choices
|
| 266 |
+
if isinstance(init_page_dropdown, dict):
|
| 267 |
+
page_choices = init_page_dropdown.get("choices", ["1"])
|
| 268 |
+
page_value = str(init_page_dropdown.get("value", "1")) if init_page_dropdown.get("value") is not None else "1"
|
| 269 |
+
# Ensure value exists in choices
|
| 270 |
+
if page_value not in page_choices:
|
| 271 |
+
page_value = page_choices[0] if page_choices else "1"
|
| 272 |
+
if not page_choices:
|
| 273 |
+
page_choices = ["1"]
|
| 274 |
+
else:
|
| 275 |
+
page_choices = ["1"]
|
| 276 |
+
page_value = "1"
|
| 277 |
+
page_dropdown = gr.Dropdown(
|
| 278 |
+
choices=page_choices,
|
| 279 |
+
value=page_value,
|
| 280 |
+
visible=False,
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
metrics_view = gr.HTML(value=init_metrics)
|
| 284 |
+
|
| 285 |
+
# === TAB 2: Model View ===
|
| 286 |
+
with gr.TabItem("π Model Lookup"):
|
| 287 |
+
gr.Markdown("### Find and compare models across all leaderboards")
|
| 288 |
+
|
| 289 |
+
selected_models_state = gr.State(value=[])
|
| 290 |
+
default_compare_html = """
|
| 291 |
+
<div class="no-results">
|
| 292 |
+
<h3>Search for models to compare</h3>
|
| 293 |
+
<p>Type in the dropdown above, then click a model to add it</p>
|
| 294 |
+
</div>
|
| 295 |
+
"""
|
| 296 |
+
|
| 297 |
+
with gr.Row(elem_classes="controls-bar"):
|
| 298 |
+
with gr.Column(scale=4):
|
| 299 |
+
all_models = get_all_model_names()
|
| 300 |
+
model_dropdown = gr.Dropdown(
|
| 301 |
+
choices=all_models,
|
| 302 |
+
label="Search models to add",
|
| 303 |
+
interactive=True,
|
| 304 |
+
allow_custom_value=False,
|
| 305 |
+
filterable=True,
|
| 306 |
+
)
|
| 307 |
+
with gr.Column(scale=1, min_width=100):
|
| 308 |
+
clear_models_btn = gr.Button("Clear All", variant="secondary", size="sm")
|
| 309 |
+
|
| 310 |
+
selected_models_group = gr.CheckboxGroup(
|
| 311 |
+
choices=[],
|
| 312 |
+
value=[],
|
| 313 |
+
label="Selected Models (click to remove)",
|
| 314 |
+
interactive=True,
|
| 315 |
+
elem_classes="selected-models-group"
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
model_card_view = gr.HTML(value=default_compare_html)
|
| 319 |
+
|
| 320 |
+
# Submission guide
|
| 321 |
+
with gr.Accordion("π€ How to Submit Data", open=False):
|
| 322 |
+
gr.Markdown("""
|
| 323 |
+
**Submit via GitHub Pull Request:**
|
| 324 |
+
|
| 325 |
+
1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
|
| 326 |
+
2. Add JSON files to `data/<leaderboard>/<developer>/<model>/`
|
| 327 |
+
3. Open a PR β automated validation runs on submission
|
| 328 |
+
4. After merge, data syncs to HuggingFace automatically
|
| 329 |
+
|
| 330 |
+
[Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) Β· [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
|
| 331 |
+
""")
|
| 332 |
+
|
| 333 |
+
# === State ===
|
| 334 |
+
current_page_state = gr.State(value=1)
|
| 335 |
+
sort_column_state = gr.State(value="Average")
|
| 336 |
+
|
| 337 |
+
def go_prev(current):
|
| 338 |
+
return max(1, current - 1)
|
| 339 |
+
|
| 340 |
+
def go_next(current):
|
| 341 |
+
return current + 1
|
| 342 |
+
|
| 343 |
+
def reset_page():
|
| 344 |
+
return 1
|
| 345 |
+
|
| 346 |
+
def update_table_only(selected_leaderboard, search_query, current_page, sort_column, selected_columns):
|
| 347 |
+
"""Update table without modifying column selector (for column changes)."""
|
| 348 |
+
result = update_leaderboard_table(selected_leaderboard, search_query, current_page, sort_column, selected_columns)
|
| 349 |
+
# Return all outputs except the last one (column_selector)
|
| 350 |
+
return result[:-1]
|
| 351 |
+
|
| 352 |
+
# === Leaderboard Events ===
|
| 353 |
+
leaderboard_selector.change(
|
| 354 |
+
fn=reset_page, outputs=[current_page_state]
|
| 355 |
+
).then(
|
| 356 |
+
fn=lambda: "Average", outputs=[sort_column_state]
|
| 357 |
+
).then(
|
| 358 |
+
fn=lambda: None, outputs=[column_selector]
|
| 359 |
+
).then(
|
| 360 |
+
fn=update_leaderboard_table,
|
| 361 |
+
inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
|
| 362 |
+
outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector]
|
| 363 |
+
)
|
| 364 |
+
|
| 365 |
+
search_box.input(
|
| 366 |
+
fn=reset_page, outputs=[current_page_state]
|
| 367 |
+
).then(
|
| 368 |
+
fn=update_table_only,
|
| 369 |
+
inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
|
| 370 |
+
outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
|
| 371 |
+
)
|
| 372 |
+
|
| 373 |
+
sort_column_dropdown.change(
|
| 374 |
+
fn=lambda col: col,
|
| 375 |
+
inputs=[sort_column_dropdown],
|
| 376 |
+
outputs=[sort_column_state]
|
| 377 |
+
).then(
|
| 378 |
+
fn=reset_page, outputs=[current_page_state]
|
| 379 |
+
).then(
|
| 380 |
+
fn=update_table_only,
|
| 381 |
+
inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
|
| 382 |
+
outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
|
| 383 |
+
)
|
| 384 |
+
|
| 385 |
+
column_selector.change(
|
| 386 |
+
fn=reset_page, outputs=[current_page_state]
|
| 387 |
+
).then(
|
| 388 |
+
fn=update_table_only,
|
| 389 |
+
inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
|
| 390 |
+
outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
|
| 391 |
+
)
|
| 392 |
+
|
| 393 |
+
page_dropdown.change(
|
| 394 |
+
fn=lambda p: int(p) if p else 1,
|
| 395 |
+
inputs=[page_dropdown],
|
| 396 |
+
outputs=[current_page_state]
|
| 397 |
+
).then(
|
| 398 |
+
fn=update_table_only,
|
| 399 |
+
inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
|
| 400 |
+
outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
|
| 401 |
+
)
|
| 402 |
+
|
| 403 |
+
prev_btn.click(
|
| 404 |
+
fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
|
| 405 |
+
).then(
|
| 406 |
+
fn=update_table_only,
|
| 407 |
+
inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
|
| 408 |
+
outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
|
| 409 |
+
)
|
| 410 |
+
|
| 411 |
+
next_btn.click(
|
| 412 |
+
fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
|
| 413 |
+
).then(
|
| 414 |
+
fn=update_table_only,
|
| 415 |
+
inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
|
| 416 |
+
outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
refresh_btn.click(
|
| 420 |
+
fn=lambda: gr.Dropdown(choices=get_available_leaderboards()),
|
| 421 |
+
outputs=[leaderboard_selector]
|
| 422 |
+
).then(
|
| 423 |
+
fn=lambda: clear_cache()
|
| 424 |
+
).then(
|
| 425 |
+
fn=reset_page, outputs=[current_page_state]
|
| 426 |
+
).then(
|
| 427 |
+
fn=lambda: "Average", outputs=[sort_column_state]
|
| 428 |
+
).then(
|
| 429 |
+
fn=lambda: None, outputs=[column_selector]
|
| 430 |
+
).then(
|
| 431 |
+
fn=update_leaderboard_table,
|
| 432 |
+
inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
|
| 433 |
+
outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector]
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
# === Model Search Events ===
|
| 437 |
+
def add_model_and_compare(selected_model, current_selected):
|
| 438 |
+
"""Add a model and auto-compare."""
|
| 439 |
+
if not selected_model:
|
| 440 |
+
comparison_html = compare_models(current_selected) if current_selected else default_compare_html
|
| 441 |
+
return (
|
| 442 |
+
current_selected,
|
| 443 |
+
gr.update(value=None),
|
| 444 |
+
gr.update(choices=current_selected, value=current_selected),
|
| 445 |
+
comparison_html
|
| 446 |
+
)
|
| 447 |
+
|
| 448 |
+
if current_selected is None:
|
| 449 |
+
current_selected = []
|
| 450 |
+
|
| 451 |
+
if selected_model not in current_selected:
|
| 452 |
+
current_selected = current_selected + [selected_model]
|
| 453 |
+
|
| 454 |
+
comparison_html = compare_models(current_selected)
|
| 455 |
+
|
| 456 |
+
return (
|
| 457 |
+
current_selected,
|
| 458 |
+
gr.update(value=None),
|
| 459 |
+
gr.update(choices=current_selected, value=current_selected),
|
| 460 |
+
comparison_html
|
| 461 |
+
)
|
| 462 |
+
|
| 463 |
+
def update_selection(selected_list):
|
| 464 |
+
"""Update selection from checkbox changes."""
|
| 465 |
+
selected_list = selected_list or []
|
| 466 |
+
comparison_html = compare_models(selected_list) if selected_list else default_compare_html
|
| 467 |
+
return selected_list, comparison_html
|
| 468 |
+
|
| 469 |
+
def clear_all_models():
|
| 470 |
+
"""Clear all selected models."""
|
| 471 |
+
return (
|
| 472 |
+
[],
|
| 473 |
+
gr.update(value=None),
|
| 474 |
+
gr.update(choices=[], value=[]),
|
| 475 |
+
default_compare_html
|
| 476 |
+
)
|
| 477 |
+
|
| 478 |
+
# Select from dropdown adds model and auto-compares
|
| 479 |
+
model_dropdown.select(
|
| 480 |
+
fn=add_model_and_compare,
|
| 481 |
+
inputs=[model_dropdown, selected_models_state],
|
| 482 |
+
outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
|
| 483 |
+
)
|
| 484 |
+
|
| 485 |
+
selected_models_group.change(
|
| 486 |
+
fn=update_selection,
|
| 487 |
+
inputs=[selected_models_group],
|
| 488 |
+
outputs=[selected_models_state, model_card_view]
|
| 489 |
+
)
|
| 490 |
+
|
| 491 |
+
clear_models_btn.click(
|
| 492 |
+
fn=clear_all_models,
|
| 493 |
+
outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
DATA_DIR.mkdir(exist_ok=True)
|
| 497 |
+
|
| 498 |
+
if __name__ == "__main__":
|
| 499 |
+
demo.launch()
|
data_loader.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Loader: Load from HuggingFace, parse JSON files, and build tables.
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from datasets import load_dataset
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Global caches
|
| 11 |
+
HF_DATASET_CACHE = {}
|
| 12 |
+
LEADERBOARD_CACHE = {}
|
| 13 |
+
DATA_DIR = Path("leaderboard_data")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def load_hf_dataset_on_startup():
|
| 17 |
+
"""Load all splits from HuggingFace dataset at startup."""
|
| 18 |
+
print("Loading dataset from HuggingFace...")
|
| 19 |
+
try:
|
| 20 |
+
dataset = load_dataset("evaleval/every_eval_ever")
|
| 21 |
+
|
| 22 |
+
for split_name, split_data in dataset.items():
|
| 23 |
+
print(f"Loading split: {split_name} ({len(split_data)} rows)")
|
| 24 |
+
|
| 25 |
+
df = split_data.to_pandas()
|
| 26 |
+
parsed_items = []
|
| 27 |
+
|
| 28 |
+
for _, row in df.iterrows():
|
| 29 |
+
evaluation_results = json.loads(row['evaluation_results'])
|
| 30 |
+
|
| 31 |
+
results = {}
|
| 32 |
+
for eval_result in evaluation_results:
|
| 33 |
+
eval_name = eval_result.get("evaluation_name")
|
| 34 |
+
score = eval_result.get("score_details", {}).get("score")
|
| 35 |
+
if eval_name and score is not None:
|
| 36 |
+
results[eval_name] = score
|
| 37 |
+
|
| 38 |
+
additional_details = {}
|
| 39 |
+
if pd.notna(row.get('additional_details')):
|
| 40 |
+
additional_details = json.loads(row['additional_details'])
|
| 41 |
+
|
| 42 |
+
parsed_item = {
|
| 43 |
+
"leaderboard": row['_leaderboard'],
|
| 44 |
+
"provider": row['source_organization_name'],
|
| 45 |
+
"model": row['model_id'],
|
| 46 |
+
"developer": row['model_developer'],
|
| 47 |
+
"params": additional_details.get('params_billions'),
|
| 48 |
+
"architecture": additional_details.get('architecture', 'Unknown'),
|
| 49 |
+
"precision": additional_details.get('precision', 'Unknown'),
|
| 50 |
+
"results": results,
|
| 51 |
+
"raw_data": {
|
| 52 |
+
"schema_version": row['schema_version'],
|
| 53 |
+
"evaluation_id": row['evaluation_id'],
|
| 54 |
+
"retrieved_timestamp": row['retrieved_timestamp'],
|
| 55 |
+
"source_data": json.loads(row['source_data']),
|
| 56 |
+
"evaluation_source": {
|
| 57 |
+
"evaluation_source_name": row['evaluation_source_name'],
|
| 58 |
+
"evaluation_source_type": row['evaluation_source_type']
|
| 59 |
+
},
|
| 60 |
+
"source_metadata": {
|
| 61 |
+
"source_organization_name": row['source_organization_name'],
|
| 62 |
+
"evaluator_relationship": row['evaluator_relationship'],
|
| 63 |
+
},
|
| 64 |
+
"model_info": {
|
| 65 |
+
"name": row['model_name'],
|
| 66 |
+
"id": row['model_id'],
|
| 67 |
+
"developer": row['model_developer'],
|
| 68 |
+
},
|
| 69 |
+
"evaluation_results": evaluation_results,
|
| 70 |
+
"additional_details": additional_details
|
| 71 |
+
}
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
if pd.notna(row.get('source_organization_url')):
|
| 75 |
+
parsed_item["raw_data"]["source_metadata"]["source_organization_url"] = row['source_organization_url']
|
| 76 |
+
if pd.notna(row.get('source_organization_logo_url')):
|
| 77 |
+
parsed_item["raw_data"]["source_metadata"]["source_organization_logo_url"] = row['source_organization_logo_url']
|
| 78 |
+
if pd.notna(row.get('model_inference_platform')):
|
| 79 |
+
parsed_item["raw_data"]["model_info"]["inference_platform"] = row['model_inference_platform']
|
| 80 |
+
|
| 81 |
+
parsed_items.append(parsed_item)
|
| 82 |
+
|
| 83 |
+
HF_DATASET_CACHE[split_name] = parsed_items
|
| 84 |
+
|
| 85 |
+
print(f"Loaded {len(HF_DATASET_CACHE)} leaderboard(s) from HuggingFace")
|
| 86 |
+
return True
|
| 87 |
+
except Exception as e:
|
| 88 |
+
print(f"Warning: Could not load HuggingFace dataset: {e}")
|
| 89 |
+
print("Falling back to local file system...")
|
| 90 |
+
return False
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def parse_eval_json(file_path):
    """Parse a single evaluation JSON file into a flat summary record.

    Args:
        file_path: Path to a JSON file following eval.schema.json.

    Returns:
        dict with keys 'leaderboard', 'provider', 'model', 'developer',
        'params', 'architecture', 'precision', 'results' (metric name ->
        score) and 'raw_data' (the full parsed document), or None when the
        file cannot be read or parsed.
    """
    try:
        # JSON is UTF-8 by specification; be explicit instead of relying on
        # the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Hoist the repeated model_info lookup used by five fields below.
        model_info = data.get("model_info", {})

        leaderboard_name = data.get("evaluation_source", {}).get("evaluation_source_name", "Unknown Leaderboard")
        provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
        model_id = model_info.get("id", "Unknown Model")
        developer_name = model_info.get("developer", "Unknown Developer")

        params = model_info.get("params_billions", None)
        architecture = model_info.get("architecture", "Unknown")
        # Precision may live in additional_details; fall back to model_info.
        precision = data.get("additional_details", {}).get("precision", "Unknown")
        if precision == "Unknown":
            precision = model_info.get("precision", "Unknown")

        # Collect only metrics that actually carry a score value.
        results = {}
        for res in data.get("evaluation_results", []):
            eval_name = res.get("evaluation_name", "Unknown Metric")
            score = res.get("score_details", {}).get("score", None)
            if score is not None:
                results[eval_name] = score

        return {
            "leaderboard": leaderboard_name,
            "provider": provider_name,
            "model": model_id,
            "developer": developer_name,
            "params": params,
            "architecture": architecture,
            "precision": precision,
            "results": results,
            "raw_data": data,
        }
    except Exception as e:
        # Best-effort: a malformed file is reported and skipped, not fatal.
        print(f"Error parsing {file_path}: {e}")
        return None
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def get_available_leaderboards():
    """List leaderboard names, preferring the HF cache over the local data dir."""
    if HF_DATASET_CACHE:
        return list(HF_DATASET_CACHE)

    # No cached dataset: fall back to subdirectories of the local data folder.
    if not DATA_DIR.exists():
        return []
    return [entry.name for entry in DATA_DIR.iterdir() if entry.is_dir()]
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def walk_eval_files(leaderboard_name):
    """Yield every *.json path under the leaderboard's directory, recursively."""
    root = DATA_DIR / leaderboard_name
    if root.exists():
        yield from root.rglob("*.json")
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def get_eval_metadata(selected_leaderboard):
    """Extract evaluation metadata (metric configs and source info) for a leaderboard.

    Args:
        selected_leaderboard: Name of the leaderboard, or falsy for none.

    Returns:
        dict with 'evals' (metric name -> config dict) and 'source_info'
        (organization / relationship / url); both empty when nothing is found.
    """
    if not selected_leaderboard:
        return {}

    eval_metadata = {"evals": {}, "source_info": {}}

    # Preferred path: metadata from the first record in the HF dataset cache.
    if selected_leaderboard in HF_DATASET_CACHE:
        parsed_items = HF_DATASET_CACHE[selected_leaderboard]
        if parsed_items:
            raw = parsed_items[0]["raw_data"]
            eval_metadata["source_info"] = _extract_source_info(raw)
            _merge_eval_configs(eval_metadata["evals"], raw)
        return eval_metadata

    # Fall back to the file system: the first parsable JSON file wins.
    for json_file in walk_eval_files(selected_leaderboard):
        parsed = parse_eval_json(json_file)
        if parsed:
            raw = parsed["raw_data"]
            if not eval_metadata["source_info"]:
                eval_metadata["source_info"] = _extract_source_info(raw)
            _merge_eval_configs(eval_metadata["evals"], raw)
            break

    return eval_metadata


def _extract_source_info(raw_data):
    """Build the source_info dict from one raw evaluation document."""
    source_meta = raw_data.get("source_metadata", {})
    source_data_list = raw_data.get("source_data", [])
    # source_data is a list of URLs; use the first one when present.
    url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
    return {
        "organization": source_meta.get("source_organization_name", "Unknown"),
        "relationship": source_meta.get("evaluator_relationship", "Unknown"),
        "url": url,
    }


def _merge_eval_configs(evals, raw_data):
    """Add each metric's config from raw_data into *evals* (first one wins)."""
    for res in raw_data.get("evaluation_results", []):
        eval_name = res.get("evaluation_name", "Unknown Metric")
        if eval_name in evals:
            continue
        metric_config = res.get("metric_config", {})
        evals[eval_name] = {
            "description": metric_config.get("evaluation_description", "No description available"),
            "score_type": metric_config.get("score_type", "unknown"),
            "lower_is_better": metric_config.get("lower_is_better", False),
            "min_score": metric_config.get("min_score"),
            "max_score": metric_config.get("max_score"),
            "level_names": metric_config.get("level_names", []),
            "level_metadata": metric_config.get("level_metadata", []),
            "has_unknown_level": metric_config.get("has_unknown_level", False),
        }
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def _summary_row(parsed):
    """Flatten one parsed evaluation record into a leaderboard table row."""
    row = {
        "Model": parsed["model"],
        "Developer": parsed["developer"],
        "Params (B)": parsed["params"],
        "Arch": parsed["architecture"],
        "Precision": parsed["precision"],
    }
    # Per-metric scores become additional columns.
    row.update(parsed["results"])
    return row


def build_leaderboard_table(selected_leaderboard, search_query="", progress_callback=None):
    """Build (and memoize) the leaderboard DataFrame from cache or local files.

    Args:
        selected_leaderboard: Leaderboard name, or falsy for an empty frame.
        search_query: Accepted for interface compatibility; currently unused
            in this function's body.
        progress_callback: Optional callable(progress_fraction, desc=...) used
            to report loading progress.

    Returns:
        pandas.DataFrame sorted by the per-row "Average" score (descending)
        when any evaluation columns exist.
    """
    if not selected_leaderboard:
        return pd.DataFrame()

    if selected_leaderboard in LEADERBOARD_CACHE:
        df, _ = LEADERBOARD_CACHE[selected_leaderboard]
        return df

    rows = []

    if selected_leaderboard in HF_DATASET_CACHE:
        if progress_callback:
            progress_callback(0, desc=f"Loading {selected_leaderboard} from cache...")

        parsed_items = HF_DATASET_CACHE[selected_leaderboard]
        for i, parsed in enumerate(parsed_items):
            # Throttle progress updates to every 100th record.
            if i % 100 == 0 and progress_callback:
                progress_callback((i / len(parsed_items)), desc=f"Processing {selected_leaderboard}...")
            rows.append(_summary_row(parsed))
    else:
        # Fall back to the local file system.
        if progress_callback:
            progress_callback(0, desc=f"Scanning {selected_leaderboard}...")

        all_files = list(walk_eval_files(selected_leaderboard))
        total_files = len(all_files)

        for i, json_file in enumerate(all_files):
            if i % 100 == 0 and progress_callback:
                progress_callback((i / total_files), desc=f"Loading {selected_leaderboard}...")
            parsed = parse_eval_json(json_file)
            if parsed:
                rows.append(_summary_row(parsed))

    if not rows:
        df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision"])
        LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
        return df

    df = pd.DataFrame(rows)
    # Drop metric columns that no model reported.
    df = df.dropna(axis=1, how='all')

    if df.empty:
        LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
        return df

    numeric_cols = df.select_dtypes(include=['float', 'int']).columns
    df[numeric_cols] = df[numeric_cols].round(2)

    # Average over evaluation scores only — parameter count is metadata.
    eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
    if len(eval_only_cols) > 0:
        df["Average"] = df[eval_only_cols].mean(axis=1).round(2)

    # Column order: identity columns, sorted eval scores, model details last.
    base_cols = ["Model", "Developer", "Params (B)", "Average"]
    model_detail_cols = ["Arch", "Precision"]
    eval_cols = [c for c in df.columns if c not in base_cols and c not in model_detail_cols]
    base_cols = [c for c in base_cols if c in df.columns]
    model_detail_cols = [c for c in model_detail_cols if c in df.columns]

    df = df[base_cols + sorted(eval_cols) + model_detail_cols]

    if "Average" in df.columns:
        df = df.sort_values("Average", ascending=False)

    LEADERBOARD_CACHE[selected_leaderboard] = (df, None)

    return df
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def clear_cache():
    """Clears all caches.

    NOTE(review): only LEADERBOARD_CACHE (the built DataFrames) is cleared
    here; HF_DATASET_CACHE with the raw parsed records is left untouched —
    confirm whether that matches the "all caches" intent.
    """
    LEADERBOARD_CACHE.clear()
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def _match_summary(item):
    """Project one cached evaluation record onto the search-result shape."""
    return {
        "developer": item.get("developer"),
        "params": item.get("params"),
        "architecture": item.get("architecture"),
        "precision": item.get("precision"),
        "results": item.get("results", {}),
    }


def search_model_across_leaderboards(model_query):
    """Search all leaderboards for a model and aggregate its per-board results.

    Args:
        model_query: Model name or fragment; matching is case-insensitive.

    Returns:
        (results, suggestions): results maps model_id -> {leaderboard ->
        summary dict}; suggestions is a sorted list of up to 20 matching
        model ids for autocomplete. Exact matches take priority; partial
        matches are used only when no exact match exists.
    """
    if not model_query or not HF_DATASET_CACHE:
        return {}, []

    model_query_lower = model_query.lower().strip()
    results = {}
    all_matches = []

    for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
        for item in parsed_items:
            model_id = item.get("model", "")
            # Case-insensitive substring match for autocomplete candidates.
            if model_query_lower in model_id.lower():
                all_matches.append(model_id)
                # Exact match gets priority.
                if model_id.lower() == model_query_lower or model_id == model_query:
                    results.setdefault(model_id, {})[leaderboard_name] = _match_summary(item)

    # No exact match anywhere: fall back to all partial matches.
    if not results and all_matches:
        for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
            for item in parsed_items:
                model_id = item.get("model", "")
                if model_query_lower in model_id.lower():
                    results.setdefault(model_id, {})[leaderboard_name] = _match_summary(item)

    # Deduplicate and cap autocomplete suggestions.
    unique_matches = sorted(set(all_matches))[:20]

    return results, unique_matches
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def get_all_model_names():
    """Return every distinct model id present in the HF cache, sorted."""
    if not HF_DATASET_CACHE:
        return []

    return sorted({
        record.get("model", "")
        for records in HF_DATASET_CACHE.values()
        for record in records
    })
|
| 386 |
+
|
eval.schema.json
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
| 3 |
+
"version": "0.0.1",
|
| 4 |
+
"type": "object",
|
| 5 |
+
"description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
|
| 6 |
+
"required": [
|
| 7 |
+
"schema_version",
|
| 8 |
+
"evaluation_id",
|
| 9 |
+
"evaluation_source",
|
| 10 |
+
"retrieved_timestamp",
|
| 11 |
+
"source_data",
|
| 12 |
+
"source_metadata",
|
| 13 |
+
"model_info",
|
| 14 |
+
"evaluation_results"
|
| 15 |
+
],
|
| 16 |
+
"properties": {
|
| 17 |
+
"schema_version": {
|
| 18 |
+
"type": "string",
|
| 19 |
+
"description": "Version of the schema used for this evaluation data"
|
| 20 |
+
},
|
| 21 |
+
"evaluation_id": {
|
| 22 |
+
"type": "string",
|
| 23 |
+
"description": "Unique identifier for this specific evaluation run. Use org_name/eval_name/retrieved_timestamp format"
|
| 24 |
+
},
|
| 25 |
+
"retrieved_timestamp": {
|
| 26 |
+
"type": "string",
|
| 27 |
+
"description": "Timestamp for when this record was created"
|
| 28 |
+
},
|
| 29 |
+
"source_data": {
|
| 30 |
+
"type": "array",
|
| 31 |
+
"description": "URLs for the source of the evaluation data",
|
| 32 |
+
"items": {
|
| 33 |
+
"type": "string"
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
"evaluation_source": {
|
| 37 |
+
"type": "object",
|
| 38 |
+
"description": "Details about evaluation origin. There are options that evaluations come from leaderboards (e.g. Live Code Bench Pro) or evaluation platforms (e.g. lm-eval, inspect ai, HELM...).",
|
| 39 |
+
"required": [
|
| 40 |
+
"evaluation_source_name",
|
| 41 |
+
"evaluation_source_type"
|
| 42 |
+
],
|
| 43 |
+
"properties": {
|
| 44 |
+
"evaluation_source_name": {
|
| 45 |
+
"type": "string",
|
| 46 |
+
"description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation."
|
| 47 |
+
},
|
| 48 |
+
"evaluation_source_type": {
|
| 49 |
+
"type": "string",
|
| 50 |
+
"enum": [
|
| 51 |
+
"leaderboard",
|
| 52 |
+
"evaluation_platform"
|
| 53 |
+
],
|
| 54 |
+
"description": "Type of evaluation source, e.g., leaderboard or evaluation platform"
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
},
|
| 58 |
+
"source_metadata": {
|
| 59 |
+
"type": "object",
|
| 60 |
+
"description": "Metadata about the source of the leaderboard data",
|
| 61 |
+
"required": [
|
| 62 |
+
"source_organization_name",
|
| 63 |
+
"evaluator_relationship"
|
| 64 |
+
],
|
| 65 |
+
"properties": {
|
| 66 |
+
"source_organization_name": {
|
| 67 |
+
"type": "string",
|
| 68 |
+
"description": "Name of the organization that provides the data"
|
| 69 |
+
},
|
| 70 |
+
"source_organization_url": {
|
| 71 |
+
"type": "string",
|
| 72 |
+
"description": "URL for the organization that provides the data"
|
| 73 |
+
},
|
| 74 |
+
"source_organization_logo_url": {
|
| 75 |
+
"type": "string",
|
| 76 |
+
"description": "URL for the Logo for the organization that provides the data"
|
| 77 |
+
},
|
| 78 |
+
"evaluator_relationship": {
|
| 79 |
+
"type": "string",
|
| 80 |
+
"description": "Relationship between the evaluator and the model",
|
| 81 |
+
"enum": [
|
| 82 |
+
"first_party",
|
| 83 |
+
"third_party",
|
| 84 |
+
"collaborative",
|
| 85 |
+
"other"
|
| 86 |
+
]
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
},
|
| 90 |
+
"model_info": {
|
| 91 |
+
"type": "object",
|
| 92 |
+
"description": "Complete model specification including basic information, technical configuration and inference settings",
|
| 93 |
+
"required": [
|
| 94 |
+
"name",
|
| 95 |
+
"id"
|
| 96 |
+
],
|
| 97 |
+
"properties": {
|
| 98 |
+
"name": {
|
| 99 |
+
"type": "string",
|
| 100 |
+
"description": "Model name provided by evaluation source"
|
| 101 |
+
},
|
| 102 |
+
"id": {
|
| 103 |
+
"type": "string",
|
| 104 |
+
"description": "Model name standarized to HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
|
| 105 |
+
},
|
| 106 |
+
"developer": {
|
| 107 |
+
"type": "string",
|
| 108 |
+
"description": "Name of organization that provides the model (e.g. 'OpenAI')"
|
| 109 |
+
},
|
| 110 |
+
"inference_platform": {
|
| 111 |
+
"type": "string",
|
| 112 |
+
"description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
|
| 113 |
+
}
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"evaluation_results": {
|
| 117 |
+
"type": "array",
|
| 118 |
+
"description": "Array of evaluation results",
|
| 119 |
+
"items": {
|
| 120 |
+
"type": "object",
|
| 121 |
+
"required": [
|
| 122 |
+
"evaluation_name",
|
| 123 |
+
"metric_config",
|
| 124 |
+
"score_details"
|
| 125 |
+
],
|
| 126 |
+
"properties": {
|
| 127 |
+
"evaluation_name": {
|
| 128 |
+
"type": "string",
|
| 129 |
+
"description": "Name of the evaluation"
|
| 130 |
+
},
|
| 131 |
+
"evaluation_timestamp": {
|
| 132 |
+
"type": "string",
|
| 133 |
+
"description": "Timestamp for when the evaluations were run"
|
| 134 |
+
},
|
| 135 |
+
"metric_config": {
|
| 136 |
+
"type": "object",
|
| 137 |
+
"description": "Details about the metric",
|
| 138 |
+
"required": [
|
| 139 |
+
"lower_is_better"
|
| 140 |
+
],
|
| 141 |
+
"properties": {
|
| 142 |
+
"evaluation_description": {
|
| 143 |
+
"type": "string",
|
| 144 |
+
"description": "Description of the evaluation"
|
| 145 |
+
},
|
| 146 |
+
"lower_is_better": {
|
| 147 |
+
"type": "boolean",
|
| 148 |
+
"description": "Whether a lower score is better"
|
| 149 |
+
},
|
| 150 |
+
"score_type": {
|
| 151 |
+
"type": "string",
|
| 152 |
+
"description": "Type of score",
|
| 153 |
+
"enum": [
|
| 154 |
+
"binary",
|
| 155 |
+
"continuous",
|
| 156 |
+
"levels"
|
| 157 |
+
]
|
| 158 |
+
},
|
| 159 |
+
"level_names": {
|
| 160 |
+
"type": "array",
|
| 161 |
+
"description": "Names of the score levels",
|
| 162 |
+
"items": {
|
| 163 |
+
"type": "string"
|
| 164 |
+
}
|
| 165 |
+
},
|
| 166 |
+
"level_metadata": {
|
| 167 |
+
"type": "array",
|
| 168 |
+
"description": "Additional Description for each Score Level",
|
| 169 |
+
"items": {
|
| 170 |
+
"type": "string"
|
| 171 |
+
}
|
| 172 |
+
},
|
| 173 |
+
"has_unknown_level": {
|
| 174 |
+
"type": "boolean",
|
| 175 |
+
"description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
|
| 176 |
+
},
|
| 177 |
+
"min_score": {
|
| 178 |
+
"type": "number",
|
| 179 |
+
"description": "Minimum possible score for continuous metric"
|
| 180 |
+
},
|
| 181 |
+
"max_score": {
|
| 182 |
+
"type": "number",
|
| 183 |
+
"description": "Maximum possible score for continuous metric"
|
| 184 |
+
}
|
| 185 |
+
},
|
| 186 |
+
"if": {
|
| 187 |
+
"properties": {
|
| 188 |
+
"score_type": {
|
| 189 |
+
"const": "levels"
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
+
},
|
| 193 |
+
"then": {
|
| 194 |
+
"required": [
|
| 195 |
+
"level_names",
|
| 196 |
+
"has_unknown_level"
|
| 197 |
+
]
|
| 198 |
+
},
|
| 199 |
+
"else": {
|
| 200 |
+
"if": {
|
| 201 |
+
"properties": {
|
| 202 |
+
"score_type": {
|
| 203 |
+
"const": "continuous"
|
| 204 |
+
}
|
| 205 |
+
}
|
| 206 |
+
},
|
| 207 |
+
"then": {
|
| 208 |
+
"required": [
|
| 209 |
+
"min_score",
|
| 210 |
+
"max_score"
|
| 211 |
+
]
|
| 212 |
+
}
|
| 213 |
+
}
|
| 214 |
+
},
|
| 215 |
+
"score_details": {
|
| 216 |
+
"type": "object",
|
| 217 |
+
"description": "The score for the evaluation and related details",
|
| 218 |
+
"required": [
|
| 219 |
+
"score"
|
| 220 |
+
],
|
| 221 |
+
"properties": {
|
| 222 |
+
"score": {
|
| 223 |
+
"type": "number",
|
| 224 |
+
"description": "The score for the evaluation"
|
| 225 |
+
},
|
| 226 |
+
"details": {
|
| 227 |
+
"type": "object",
|
| 228 |
+
"description": "Any additional details about the score",
|
| 229 |
+
"additionalProperties": true
|
| 230 |
+
}
|
| 231 |
+
}
|
| 232 |
+
},
|
| 233 |
+
"detailed_evaluation_results_url": {
|
| 234 |
+
"type": "string",
|
| 235 |
+
"description": "Link to detailed evaluation data"
|
| 236 |
+
},
|
| 237 |
+
"generation_config": {
|
| 238 |
+
"type": "object",
|
| 239 |
+
"generation_args": {
|
| 240 |
+
"type": "object",
|
| 241 |
+
"description": "Parameters used to generate results - properties may vary by model type",
|
| 242 |
+
"properties": {
|
| 243 |
+
"temperature": {
|
| 244 |
+
"type": [
|
| 245 |
+
"null",
|
| 246 |
+
"number"
|
| 247 |
+
],
|
| 248 |
+
"description": "Sampling temperature"
|
| 249 |
+
},
|
| 250 |
+
"top_p": {
|
| 251 |
+
"type": [
|
| 252 |
+
"null",
|
| 253 |
+
"number"
|
| 254 |
+
],
|
| 255 |
+
"description": "Nucleus sampling parameter"
|
| 256 |
+
},
|
| 257 |
+
"top_k": {
|
| 258 |
+
"type": [
|
| 259 |
+
"null",
|
| 260 |
+
"number"
|
| 261 |
+
],
|
| 262 |
+
"description": "Top-k sampling parameter"
|
| 263 |
+
},
|
| 264 |
+
"max_tokens": {
|
| 265 |
+
"type": "integer",
|
| 266 |
+
"minimum": 1,
|
| 267 |
+
"description": "Maximum number of tokens to generate"
|
| 268 |
+
}
|
| 269 |
+
},
|
| 270 |
+
"additionalProperties": true
|
| 271 |
+
},
|
| 272 |
+
"additional_details": {
|
| 273 |
+
"type": "string",
|
| 274 |
+
"description": "Additional details about how the results for this metric were generated."
|
| 275 |
+
}
|
| 276 |
+
}
|
| 277 |
+
}
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
}
|
| 281 |
+
}
|
| 282 |
+
}
|
hf_operations.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HuggingFace Operations: Upload data, create PRs, validate schemas.
|
| 3 |
+
"""
|
| 4 |
+
from huggingface_hub import HfApi, login
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from jsonschema import validate, ValidationError, Draft7Validator
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Load the JSON Schema once at import time so every validation call reuses
# the same parsed dict instead of re-reading eval.schema.json from disk.
# The schema file is expected to sit next to this module.
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
with open(SCHEMA_PATH, 'r') as f:
    EVAL_SCHEMA = json.load(f)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
    except ValidationError as e:
        # Point the caller at where in the document validation failed.
        location = " β ".join(str(part) for part in e.path) if e.path else "root"
        return False, f"β Schema validation failed at '{location}': {e.message}"
    except Exception as e:
        return False, f"β Validation error: {str(e)}"
    return True, "Schema validation passed"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID

    Returns:
        None. Currently a stub: no upload is performed yet.
    """
    # TODO: Implement upload logic (create_pr_for_new_leaderboard below shows
    # the PR-based variant of the same upload).
    pass
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def check_hf_authentication():
    """
    Check if user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info['name']
    except Exception:
        # Fix: the exception was previously bound to an unused name. The
        # message stays generic on purpose so no token details leak out.
        return False, "Not authenticated. Run: huggingface-cli login"
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if a PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        hf_api = HfApi()
        expected_title = f"add new leaderboard: {leaderboard_name.lower()}"

        for disc in hf_api.get_repo_discussions(repo_id=repo_id, repo_type="dataset"):
            # Only open pull requests count as duplicates.
            if not (disc.is_pull_request and disc.status == "open"):
                continue
            if expected_title in disc.title.lower():
                return True, f"https://huggingface.co/datasets/{repo_id}/discussions/{disc.num}"

        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open).
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # Step 1: the caller must be logged in to HuggingFace.
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"β {auth_result}"

    # Step 2: refuse to open a second PR for the same leaderboard.
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"β οΈ PR already exists: {duplicate_url}"

    # Step 3: the parquet payload must exist and be non-empty.
    if not Path(parquet_file).exists():
        return False, "β Parquet file not found"

    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "β Parquet file is empty"

    # Step 4: upload the file and open the PR in a single hub call.
    try:
        commit_info = HfApi().upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Add new leaderboard: {leaderboard_name}",
            create_pr=True,
        )

        # Some hub versions may not expose pr_url on the commit result.
        if hasattr(commit_info, 'pr_url'):
            pr_url = commit_info.pr_url
        else:
            pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions"

        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"β Failed to create PR: {str(e)}"
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    # Flat columns every leaderboard parquet must carry.
    required_cols = (
        '_leaderboard', '_developer', '_model', '_uuid',
        'schema_version', 'evaluation_id', 'retrieved_timestamp',
        'source_data', 'evaluation_source_name', 'evaluation_source_type',
        'source_organization_name', 'evaluator_relationship',
        'model_name', 'model_id', 'model_developer',
        'evaluation_results',
    )
    try:
        frame = pd.read_parquet(parquet_file)

        missing = [name for name in required_cols if name not in frame.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Every column is expected to be string-typed (pandas object/string).
        for name in frame.columns:
            dtype = frame[name].dtype
            if dtype not in ['object', 'string']:
                return False, f"Column '{name}' has wrong type: {dtype} (expected string)"

        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def export_to_json(parquet_file, output_dir):
    """
    Write the contents of a parquet file back out as JSON files.

    Delegates the conversion to ``parquet_to_folder`` in json_to_parquet.py.

    Args:
        parquet_file: Path to the source parquet file
        output_dir: Directory that will receive the JSON files
    """
    # Imported lazily so this module can load even when the converter
    # module is not needed.
    import json_to_parquet
    json_to_parquet.parquet_to_folder(parquet_file, output_dir)
|
| 202 |
+
|
pyproject.toml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "eee-test"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Hugging Face Space for browsing and managing evaluation results"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"gradio>=5.49.1",
|
| 9 |
+
"pandas>=2.3.2",
|
| 10 |
+
]
|
ui_components.py
ADDED
|
@@ -0,0 +1,1374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
UI Components: Themes, CSS, and HTML formatters for the Gradio interface.
|
| 3 |
+
Nord color theme with balanced contrast.
|
| 4 |
+
"""
|
| 5 |
+
import gradio as gr
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_theme():
    """Build the Gradio theme: Nord palette, locked to a dark appearance."""
    base = gr.themes.Base(
        primary_hue="blue",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("DM Sans"), "system-ui", "sans-serif"],
        font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
    )

    # Nord palette values. Each entry below is applied to both the light
    # and the "_dark" variant of the setting, so the UI renders identically
    # regardless of the visitor's OS colour-scheme preference.
    paired_settings = {
        "body_background_fill": "#2E3440",
        "body_text_color": "#ECEFF4",
        "body_text_color_subdued": "#4C566A",
        "block_background_fill": "#3B4252",
        "block_border_color": "#434C5E",
        "block_label_text_color": "#D8DEE9",
        "block_title_text_color": "#ECEFF4",
        "input_background_fill": "#2E3440",
        "input_border_color": "#4C566A",
        "button_primary_background_fill": "#88C0D0",
        "button_primary_text_color": "#2E3440",
        "button_secondary_background_fill": "#434C5E",
        "button_secondary_text_color": "#ECEFF4",
    }

    # block_border_width has no dark-mode counterpart in the theme API.
    overrides = {"block_border_width": "1px"}
    for key, value in paired_settings.items():
        overrides[key] = value
        overrides[f"{key}_dark"] = value

    return base.set(**overrides)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def get_custom_css():
|
| 47 |
+
"""Returns custom CSS with Nord colors."""
|
| 48 |
+
return """
|
| 49 |
+
/* === Nord Theme ===
|
| 50 |
+
Polar Night: #2E3440 (bg), #3B4252 (surface), #434C5E, #4C566A
|
| 51 |
+
Snow Storm: #D8DEE9, #E5E9F0, #ECEFF4
|
| 52 |
+
Frost: #8FBCBB, #88C0D0, #81A1C1, #5E81AC
|
| 53 |
+
Aurora: #BF616A, #D08770, #EBCB8B, #A3BE8C, #B48EAD
|
| 54 |
+
*/
|
| 55 |
+
|
| 56 |
+
/* Lock the UI to dark Nord regardless of OS preference */
|
| 57 |
+
:root {
|
| 58 |
+
color-scheme: dark;
|
| 59 |
+
background-color: #2E3440;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
body {
|
| 63 |
+
background: #2E3440 !important;
|
| 64 |
+
color: #ECEFF4 !important;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
/* === Base === */
|
| 68 |
+
.gradio-container {
|
| 69 |
+
max-width: 100% !important;
|
| 70 |
+
margin: 0 !important;
|
| 71 |
+
padding: 1.25rem 2.5rem 2rem !important;
|
| 72 |
+
background: #2E3440 !important;
|
| 73 |
+
color: #ECEFF4 !important;
|
| 74 |
+
font-family: 'DM Sans', system-ui, sans-serif !important;
|
| 75 |
+
font-size: 16px !important;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
/* === Header === */
|
| 79 |
+
.app-header {
|
| 80 |
+
display: flex;
|
| 81 |
+
align-items: center;
|
| 82 |
+
gap: 1rem;
|
| 83 |
+
margin-bottom: 1.5rem;
|
| 84 |
+
padding: 1.25rem 1.5rem;
|
| 85 |
+
background: #3B4252;
|
| 86 |
+
border: 1px solid #434C5E;
|
| 87 |
+
border-radius: 12px;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.app-header .logo-mark {
|
| 91 |
+
width: 48px;
|
| 92 |
+
height: 48px;
|
| 93 |
+
background: linear-gradient(135deg, #88C0D0 0%, #81A1C1 100%);
|
| 94 |
+
border-radius: 12px;
|
| 95 |
+
display: flex;
|
| 96 |
+
align-items: center;
|
| 97 |
+
justify-content: center;
|
| 98 |
+
font-weight: 800;
|
| 99 |
+
font-size: 1.1rem;
|
| 100 |
+
color: #2E3440;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
.app-header .brand {
|
| 104 |
+
display: flex;
|
| 105 |
+
flex-direction: column;
|
| 106 |
+
gap: 0.125rem;
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
.app-header h1 {
|
| 110 |
+
margin: 0;
|
| 111 |
+
font-size: 1.5rem;
|
| 112 |
+
font-weight: 700;
|
| 113 |
+
color: #ECEFF4;
|
| 114 |
+
letter-spacing: -0.02em;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.app-header .tagline {
|
| 118 |
+
color: #D8DEE9;
|
| 119 |
+
font-size: 0.85rem;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
.app-header .header-right {
|
| 123 |
+
margin-left: auto;
|
| 124 |
+
display: flex;
|
| 125 |
+
align-items: center;
|
| 126 |
+
gap: 0.75rem;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
.app-header .version-badge {
|
| 130 |
+
background: rgba(136, 192, 208, 0.2);
|
| 131 |
+
border: 1px solid rgba(136, 192, 208, 0.4);
|
| 132 |
+
border-radius: 6px;
|
| 133 |
+
padding: 0.25rem 0.625rem;
|
| 134 |
+
font-size: 0.7rem;
|
| 135 |
+
font-family: 'JetBrains Mono', monospace;
|
| 136 |
+
color: #88C0D0;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
/* === Tabs === */
|
| 140 |
+
.tabs {
|
| 141 |
+
border: none !important;
|
| 142 |
+
background: transparent !important;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
.tab-nav {
|
| 146 |
+
background: #3B4252 !important;
|
| 147 |
+
border: 1px solid #434C5E !important;
|
| 148 |
+
border-radius: 10px !important;
|
| 149 |
+
padding: 0.25rem !important;
|
| 150 |
+
gap: 0.25rem !important;
|
| 151 |
+
margin-bottom: 1.25rem !important;
|
| 152 |
+
display: inline-flex !important;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
.tab-nav button {
|
| 156 |
+
background: transparent !important;
|
| 157 |
+
border: none !important;
|
| 158 |
+
color: #D8DEE9 !important;
|
| 159 |
+
padding: 0.75rem 1.5rem !important;
|
| 160 |
+
font-size: 0.95rem !important;
|
| 161 |
+
font-weight: 500 !important;
|
| 162 |
+
border-radius: 8px !important;
|
| 163 |
+
transition: all 0.15s ease !important;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
.tab-nav button.selected {
|
| 167 |
+
color: #2E3440 !important;
|
| 168 |
+
background: #88C0D0 !important;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
.tab-nav button:hover:not(.selected) {
|
| 172 |
+
background: #434C5E !important;
|
| 173 |
+
color: #ECEFF4 !important;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
.tabitem {
|
| 177 |
+
background: transparent !important;
|
| 178 |
+
border: none !important;
|
| 179 |
+
padding: 0 !important;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
/* === Controls bar === */
|
| 183 |
+
.controls-bar {
|
| 184 |
+
background: #3B4252 !important;
|
| 185 |
+
border: 1px solid #434C5E !important;
|
| 186 |
+
border-radius: 10px !important;
|
| 187 |
+
padding: 0.75rem 1.25rem !important;
|
| 188 |
+
margin-bottom: 1rem !important;
|
| 189 |
+
gap: 0.75rem !important;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
.controls-bar label {
|
| 193 |
+
font-size: 0.75rem !important;
|
| 194 |
+
text-transform: uppercase !important;
|
| 195 |
+
letter-spacing: 0.04em !important;
|
| 196 |
+
color: #D8DEE9 !important;
|
| 197 |
+
font-weight: 500 !important;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
/* === Info banner === */
|
| 201 |
+
.info-banner {
|
| 202 |
+
background: #3B4252 !important;
|
| 203 |
+
border: 1px solid #434C5E !important;
|
| 204 |
+
border-left: 3px solid #88C0D0 !important;
|
| 205 |
+
border-radius: 0 10px 10px 0 !important;
|
| 206 |
+
padding: 0.75rem 1rem !important;
|
| 207 |
+
margin-bottom: 1rem !important;
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
.info-banner h3 {
|
| 211 |
+
margin: 0;
|
| 212 |
+
font-size: 1.1rem;
|
| 213 |
+
font-weight: 600;
|
| 214 |
+
color: #ECEFF4;
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
.info-banner .eval-tags {
|
| 218 |
+
display: flex;
|
| 219 |
+
flex-wrap: wrap;
|
| 220 |
+
gap: 0.375rem;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
.info-banner .eval-tag {
|
| 224 |
+
background: rgba(143, 188, 187, 0.15);
|
| 225 |
+
border: 1px solid rgba(143, 188, 187, 0.3);
|
| 226 |
+
border-radius: 4px;
|
| 227 |
+
padding: 0.3rem 0.6rem;
|
| 228 |
+
font-size: 0.8rem;
|
| 229 |
+
font-family: 'JetBrains Mono', monospace;
|
| 230 |
+
color: #8FBCBB;
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
/* === Dataframe - seamless styling === */
|
| 234 |
+
.dataframe,
|
| 235 |
+
.dataframe > div,
|
| 236 |
+
.dataframe > div > div,
|
| 237 |
+
.dataframe .table-wrap,
|
| 238 |
+
.dataframe .svelte-1gfkn6j {
|
| 239 |
+
background: #2E3440 !important;
|
| 240 |
+
border: none !important;
|
| 241 |
+
box-shadow: none !important;
|
| 242 |
+
border-radius: 0 !important;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
.dataframe table {
|
| 246 |
+
width: 100% !important;
|
| 247 |
+
border-collapse: collapse !important;
|
| 248 |
+
font-size: 0.95rem !important;
|
| 249 |
+
table-layout: auto !important;
|
| 250 |
+
background: #2E3440 !important;
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
.dataframe thead,
|
| 254 |
+
.dataframe thead tr {
|
| 255 |
+
background: #2E3440 !important;
|
| 256 |
+
position: sticky;
|
| 257 |
+
top: 0;
|
| 258 |
+
z-index: 10;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
.dataframe thead th {
|
| 262 |
+
padding: 0.875rem 1rem !important;
|
| 263 |
+
font-weight: 600 !important;
|
| 264 |
+
font-size: 0.75rem !important;
|
| 265 |
+
text-transform: uppercase !important;
|
| 266 |
+
letter-spacing: 0.05em !important;
|
| 267 |
+
color: #81A1C1 !important;
|
| 268 |
+
border-bottom: 1px solid #434C5E !important;
|
| 269 |
+
border-top: none !important;
|
| 270 |
+
text-align: left !important;
|
| 271 |
+
background: #2E3440 !important;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.dataframe tbody,
|
| 275 |
+
.dataframe tbody tr {
|
| 276 |
+
background: #2E3440 !important;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
.dataframe tbody tr {
|
| 280 |
+
border-bottom: 1px solid #3B4252 !important;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
.dataframe tbody tr:hover {
|
| 284 |
+
background: rgba(136, 192, 208, 0.04) !important;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.dataframe tbody td {
|
| 288 |
+
padding: 0.75rem 1rem !important;
|
| 289 |
+
color: #E5E9F0 !important;
|
| 290 |
+
background: #2E3440 !important;
|
| 291 |
+
overflow: hidden !important;
|
| 292 |
+
text-overflow: ellipsis !important;
|
| 293 |
+
border: none !important;
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
/* === Pagination bar === */
|
| 297 |
+
.pagination-bar {
|
| 298 |
+
margin-top: 1rem !important;
|
| 299 |
+
padding: 1rem 0 !important;
|
| 300 |
+
border-top: 1px solid #3B4252 !important;
|
| 301 |
+
display: flex !important;
|
| 302 |
+
justify-content: center !important;
|
| 303 |
+
align-items: center !important;
|
| 304 |
+
gap: 1rem !important;
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
.page-info {
|
| 308 |
+
font-family: 'JetBrains Mono', monospace !important;
|
| 309 |
+
font-size: 1rem !important;
|
| 310 |
+
color: #D8DEE9 !important;
|
| 311 |
+
min-width: 80px !important;
|
| 312 |
+
text-align: center !important;
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
/* Model name - white, readable */
|
| 316 |
+
.dataframe tbody td:first-child {
|
| 317 |
+
font-weight: 500 !important;
|
| 318 |
+
color: #ECEFF4 !important;
|
| 319 |
+
white-space: nowrap !important;
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
/* All other columns - use monospace for numbers */
|
| 323 |
+
.dataframe tbody td:not(:first-child) {
|
| 324 |
+
font-family: 'JetBrains Mono', monospace !important;
|
| 325 |
+
color: #8FBCBB !important;
|
| 326 |
+
text-align: left !important;
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
.dataframe tbody td:nth-child(2) {
|
| 330 |
+
color: #88C0D0 !important;
|
| 331 |
+
white-space: nowrap !important;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
.dataframe tbody td:nth-child(3) {
|
| 335 |
+
color: #D08770 !important;
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
.dataframe tbody td:nth-child(4) {
|
| 339 |
+
font-weight: 600 !important;
|
| 340 |
+
color: #A3BE8C !important;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
.dataframe tbody td:nth-child(n+5) {
|
| 344 |
+
white-space: nowrap !important;
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
/* === Status text === */
|
| 348 |
+
.status-text {
|
| 349 |
+
font-size: 0.9rem !important;
|
| 350 |
+
color: #D8DEE9 !important;
|
| 351 |
+
padding: 0.5rem 0 !important;
|
| 352 |
+
font-family: 'JetBrains Mono', monospace !important;
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
/* === Model Card === */
|
| 356 |
+
.model-card-container {
|
| 357 |
+
display: flex;
|
| 358 |
+
flex-direction: column;
|
| 359 |
+
gap: 1.25rem;
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
.model-card-header {
|
| 363 |
+
background: #3B4252;
|
| 364 |
+
border: 1px solid #434C5E;
|
| 365 |
+
border-radius: 12px;
|
| 366 |
+
padding: 1.5rem 2rem;
|
| 367 |
+
}
|
| 368 |
+
|
| 369 |
+
.model-card-header h2 {
|
| 370 |
+
margin: 0 0 0.5rem 0;
|
| 371 |
+
font-size: 1.5rem;
|
| 372 |
+
font-weight: 600;
|
| 373 |
+
color: #ECEFF4;
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
.model-card-header .model-meta {
|
| 377 |
+
display: flex;
|
| 378 |
+
gap: 1.5rem;
|
| 379 |
+
color: #D8DEE9;
|
| 380 |
+
font-size: 0.95rem;
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
.model-card-header .model-meta strong {
|
| 384 |
+
color: #8FBCBB;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
.leaderboard-section {
|
| 388 |
+
background: #3B4252;
|
| 389 |
+
border: 1px solid #434C5E;
|
| 390 |
+
border-radius: 10px;
|
| 391 |
+
overflow: hidden;
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
.leaderboard-section-header {
|
| 395 |
+
background: #434C5E;
|
| 396 |
+
padding: 1rem 1.25rem;
|
| 397 |
+
border-bottom: 1px solid #4C566A;
|
| 398 |
+
display: flex;
|
| 399 |
+
justify-content: space-between;
|
| 400 |
+
align-items: center;
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
.leaderboard-section-header h3 {
|
| 404 |
+
margin: 0;
|
| 405 |
+
font-size: 1rem;
|
| 406 |
+
font-weight: 600;
|
| 407 |
+
color: #88C0D0;
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
.leaderboard-section-header .lb-avg {
|
| 411 |
+
background: rgba(163, 190, 140, 0.15);
|
| 412 |
+
border: 1px solid rgba(163, 190, 140, 0.3);
|
| 413 |
+
border-radius: 8px;
|
| 414 |
+
padding: 0.5rem 1rem;
|
| 415 |
+
font-size: 0.85rem;
|
| 416 |
+
color: #D8DEE9;
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
.leaderboard-section-header .lb-avg strong {
|
| 420 |
+
color: #A3BE8C;
|
| 421 |
+
font-family: 'JetBrains Mono', monospace;
|
| 422 |
+
font-size: 1.1rem;
|
| 423 |
+
font-weight: 700;
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
.scores-grid {
|
| 427 |
+
display: grid;
|
| 428 |
+
grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
|
| 429 |
+
gap: 1px;
|
| 430 |
+
background: #434C5E;
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
.score-item {
|
| 434 |
+
background: #3B4252;
|
| 435 |
+
padding: 1rem 1.25rem;
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
.score-item .score-label {
|
| 439 |
+
font-size: 0.8rem;
|
| 440 |
+
text-transform: uppercase;
|
| 441 |
+
letter-spacing: 0.05em;
|
| 442 |
+
color: #D8DEE9;
|
| 443 |
+
margin-bottom: 0.375rem;
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
.score-item .score-value {
|
| 447 |
+
font-size: 1.5rem;
|
| 448 |
+
font-weight: 600;
|
| 449 |
+
font-family: 'JetBrains Mono', monospace;
|
| 450 |
+
color: #A3BE8C;
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
.score-item.highlight .score-value {
|
| 454 |
+
color: #88C0D0;
|
| 455 |
+
}
|
| 456 |
+
|
| 457 |
+
.no-results {
|
| 458 |
+
text-align: center;
|
| 459 |
+
padding: 3rem 1rem;
|
| 460 |
+
color: #D8DEE9;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
.no-results h3 {
|
| 464 |
+
color: #ECEFF4;
|
| 465 |
+
margin-bottom: 0.5rem;
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
/* === New Comparison View === */
|
| 470 |
+
.comparison-container {
|
| 471 |
+
display: flex;
|
| 472 |
+
flex-direction: column;
|
| 473 |
+
gap: 1.5rem;
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
.comparison-summary {
|
| 477 |
+
background: #3B4252;
|
| 478 |
+
border: 1px solid #434C5E;
|
| 479 |
+
border-radius: 12px;
|
| 480 |
+
padding: 1.5rem;
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
.comparison-summary h2 {
|
| 484 |
+
margin: 0 0 1rem 0;
|
| 485 |
+
color: #ECEFF4;
|
| 486 |
+
font-size: 1.25rem;
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
+
.summary-cards {
|
| 490 |
+
display: flex;
|
| 491 |
+
gap: 1rem;
|
| 492 |
+
flex-wrap: wrap;
|
| 493 |
+
}
|
| 494 |
+
|
| 495 |
+
.summary-card {
|
| 496 |
+
flex: 1;
|
| 497 |
+
min-width: 200px;
|
| 498 |
+
background: #2E3440;
|
| 499 |
+
border-radius: 8px;
|
| 500 |
+
padding: 1rem;
|
| 501 |
+
}
|
| 502 |
+
|
| 503 |
+
.summary-card-header {
|
| 504 |
+
display: flex;
|
| 505 |
+
align-items: center;
|
| 506 |
+
gap: 0.5rem;
|
| 507 |
+
margin-bottom: 0.75rem;
|
| 508 |
+
}
|
| 509 |
+
|
| 510 |
+
.model-dot {
|
| 511 |
+
width: 10px;
|
| 512 |
+
height: 10px;
|
| 513 |
+
border-radius: 50%;
|
| 514 |
+
}
|
| 515 |
+
|
| 516 |
+
.model-name {
|
| 517 |
+
font-weight: 600;
|
| 518 |
+
color: #ECEFF4;
|
| 519 |
+
font-size: 0.9rem;
|
| 520 |
+
overflow: hidden;
|
| 521 |
+
text-overflow: ellipsis;
|
| 522 |
+
white-space: nowrap;
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
.summary-card-body {
|
| 526 |
+
display: flex;
|
| 527 |
+
flex-direction: column;
|
| 528 |
+
gap: 0.5rem;
|
| 529 |
+
}
|
| 530 |
+
|
| 531 |
+
.summary-stat {
|
| 532 |
+
display: flex;
|
| 533 |
+
justify-content: space-between;
|
| 534 |
+
align-items: center;
|
| 535 |
+
}
|
| 536 |
+
|
| 537 |
+
.summary-stat .stat-label {
|
| 538 |
+
font-size: 0.75rem;
|
| 539 |
+
color: #D8DEE9;
|
| 540 |
+
text-transform: uppercase;
|
| 541 |
+
letter-spacing: 0.05em;
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
.summary-stat .stat-value {
|
| 545 |
+
font-family: 'JetBrains Mono', monospace;
|
| 546 |
+
color: #8FBCBB;
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
.summary-stat.primary .stat-value.large {
|
| 550 |
+
font-size: 1.5rem;
|
| 551 |
+
font-weight: 700;
|
| 552 |
+
color: #A3BE8C;
|
| 553 |
+
}
|
| 554 |
+
|
| 555 |
+
.leaderboard-comparison-card {
|
| 556 |
+
background: #3B4252;
|
| 557 |
+
border: 1px solid #434C5E;
|
| 558 |
+
border-radius: 12px;
|
| 559 |
+
overflow: hidden;
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
+
.lb-card-header {
|
| 563 |
+
background: #434C5E;
|
| 564 |
+
padding: 0.875rem 1.25rem;
|
| 565 |
+
}
|
| 566 |
+
|
| 567 |
+
.lb-card-header h3 {
|
| 568 |
+
margin: 0;
|
| 569 |
+
color: #88C0D0;
|
| 570 |
+
font-size: 1rem;
|
| 571 |
+
font-weight: 600;
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
.lb-card-body {
|
| 575 |
+
padding: 1rem 1.25rem;
|
| 576 |
+
display: flex;
|
| 577 |
+
flex-direction: column;
|
| 578 |
+
gap: 0.75rem;
|
| 579 |
+
}
|
| 580 |
+
|
| 581 |
+
.metric-comparison {
|
| 582 |
+
display: flex;
|
| 583 |
+
flex-direction: column;
|
| 584 |
+
gap: 0.375rem;
|
| 585 |
+
}
|
| 586 |
+
|
| 587 |
+
.metric-name-row {
|
| 588 |
+
margin-bottom: 0.25rem;
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
.metric-title {
|
| 592 |
+
font-size: 0.85rem;
|
| 593 |
+
font-weight: 600;
|
| 594 |
+
color: #ECEFF4;
|
| 595 |
+
}
|
| 596 |
+
|
| 597 |
+
.metric-title.sub {
|
| 598 |
+
font-size: 0.75rem;
|
| 599 |
+
font-weight: 500;
|
| 600 |
+
color: #D8DEE9;
|
| 601 |
+
}
|
| 602 |
+
|
| 603 |
+
.model-score-row {
|
| 604 |
+
display: flex;
|
| 605 |
+
align-items: center;
|
| 606 |
+
gap: 0.5rem;
|
| 607 |
+
padding: 0.375rem 0;
|
| 608 |
+
}
|
| 609 |
+
|
| 610 |
+
.model-score-row.compact {
|
| 611 |
+
padding: 0.25rem 0;
|
| 612 |
+
}
|
| 613 |
+
|
| 614 |
+
.model-score-row.best-score {
|
| 615 |
+
background: rgba(163, 190, 140, 0.1);
|
| 616 |
+
border-radius: 4px;
|
| 617 |
+
padding-left: 0.5rem;
|
| 618 |
+
margin-left: -0.5rem;
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
.model-score-row.no-data {
|
| 622 |
+
opacity: 0.5;
|
| 623 |
+
}
|
| 624 |
+
|
| 625 |
+
.model-indicator {
|
| 626 |
+
width: 8px;
|
| 627 |
+
height: 8px;
|
| 628 |
+
border-radius: 2px;
|
| 629 |
+
flex-shrink: 0;
|
| 630 |
+
}
|
| 631 |
+
|
| 632 |
+
.model-indicator.small {
|
| 633 |
+
width: 6px;
|
| 634 |
+
height: 6px;
|
| 635 |
+
}
|
| 636 |
+
|
| 637 |
+
.score-bar-container {
|
| 638 |
+
flex: 1;
|
| 639 |
+
display: flex;
|
| 640 |
+
align-items: center;
|
| 641 |
+
gap: 0.75rem;
|
| 642 |
+
height: 24px;
|
| 643 |
+
background: #2E3440;
|
| 644 |
+
border-radius: 4px;
|
| 645 |
+
padding: 0 0.5rem;
|
| 646 |
+
position: relative;
|
| 647 |
+
}
|
| 648 |
+
|
| 649 |
+
.score-bar {
|
| 650 |
+
position: absolute;
|
| 651 |
+
left: 0;
|
| 652 |
+
top: 0;
|
| 653 |
+
bottom: 0;
|
| 654 |
+
border-radius: 4px;
|
| 655 |
+
opacity: 0.3;
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
.score-bar.thin {
|
| 659 |
+
opacity: 0.2;
|
| 660 |
+
}
|
| 661 |
+
|
| 662 |
+
.score-value {
|
| 663 |
+
position: relative;
|
| 664 |
+
font-family: 'JetBrains Mono', monospace;
|
| 665 |
+
font-size: 0.9rem;
|
| 666 |
+
font-weight: 600;
|
| 667 |
+
color: #ECEFF4;
|
| 668 |
+
z-index: 1;
|
| 669 |
+
}
|
| 670 |
+
|
| 671 |
+
.score-value.small {
|
| 672 |
+
font-size: 0.8rem;
|
| 673 |
+
font-weight: 500;
|
| 674 |
+
}
|
| 675 |
+
|
| 676 |
+
.score-value.dim {
|
| 677 |
+
color: #4C566A;
|
| 678 |
+
}
|
| 679 |
+
|
| 680 |
+
/* === Selected Models Chips === */
|
| 681 |
+
.selected-models-group label {
|
| 682 |
+
display: inline-flex !important;
|
| 683 |
+
align-items: center !important;
|
| 684 |
+
background: #434C5E;
|
| 685 |
+
border: 1px solid #4C566A;
|
| 686 |
+
border-radius: 16px;
|
| 687 |
+
padding: 0.35rem 0.85rem;
|
| 688 |
+
font-size: 0.85rem;
|
| 689 |
+
color: #ECEFF4;
|
| 690 |
+
gap: 0.4rem;
|
| 691 |
+
cursor: pointer;
|
| 692 |
+
margin: 0.15rem 0.3rem 0.15rem 0 !important;
|
| 693 |
+
}
|
| 694 |
+
|
| 695 |
+
.selected-models-group label span::before {
|
| 696 |
+
content: "Γ";
|
| 697 |
+
font-size: 0.75rem;
|
| 698 |
+
color: #EBCB8B;
|
| 699 |
+
opacity: 0;
|
| 700 |
+
transition: opacity 0.15s ease;
|
| 701 |
+
}
|
| 702 |
+
|
| 703 |
+
.selected-models-group label:hover span::before {
|
| 704 |
+
opacity: 1;
|
| 705 |
+
}
|
| 706 |
+
|
| 707 |
+
.selected-models-group input[type="checkbox"] {
|
| 708 |
+
display: none;
|
| 709 |
+
}
|
| 710 |
+
|
| 711 |
+
/* === Heat Map Table === */
|
| 712 |
+
.heatmap-table-wrapper {
|
| 713 |
+
overflow-x: auto;
|
| 714 |
+
margin-top: 1rem;
|
| 715 |
+
}
|
| 716 |
+
|
| 717 |
+
.heatmap-table {
|
| 718 |
+
width: 100%;
|
| 719 |
+
border-collapse: collapse;
|
| 720 |
+
font-size: 0.85rem;
|
| 721 |
+
}
|
| 722 |
+
|
| 723 |
+
.heatmap-table thead {
|
| 724 |
+
position: sticky;
|
| 725 |
+
top: 0;
|
| 726 |
+
z-index: 10;
|
| 727 |
+
}
|
| 728 |
+
|
| 729 |
+
.heatmap-table th {
|
| 730 |
+
background: #434C5E;
|
| 731 |
+
padding: 0.625rem 0.75rem;
|
| 732 |
+
font-weight: 600;
|
| 733 |
+
font-size: 0.7rem;
|
| 734 |
+
text-transform: uppercase;
|
| 735 |
+
letter-spacing: 0.05em;
|
| 736 |
+
color: #81A1C1;
|
| 737 |
+
text-align: left;
|
| 738 |
+
border-bottom: 2px solid #4C566A;
|
| 739 |
+
white-space: nowrap;
|
| 740 |
+
}
|
| 741 |
+
|
| 742 |
+
.heatmap-table th.metric-header {
|
| 743 |
+
min-width: 120px;
|
| 744 |
+
}
|
| 745 |
+
|
| 746 |
+
.heatmap-table th.model-header {
|
| 747 |
+
text-align: center;
|
| 748 |
+
max-width: 150px;
|
| 749 |
+
overflow: hidden;
|
| 750 |
+
text-overflow: ellipsis;
|
| 751 |
+
}
|
| 752 |
+
|
| 753 |
+
.heatmap-table td {
|
| 754 |
+
padding: 0.5rem 0.75rem;
|
| 755 |
+
border-bottom: 1px solid #3B4252;
|
| 756 |
+
}
|
| 757 |
+
|
| 758 |
+
.heatmap-table td.metric-name {
|
| 759 |
+
font-weight: 500;
|
| 760 |
+
color: #D8DEE9;
|
| 761 |
+
background: #2E3440;
|
| 762 |
+
}
|
| 763 |
+
|
| 764 |
+
.heatmap-table td.score-cell {
|
| 765 |
+
text-align: center;
|
| 766 |
+
font-family: 'JetBrains Mono', monospace;
|
| 767 |
+
font-weight: 500;
|
| 768 |
+
transition: all 0.15s ease;
|
| 769 |
+
}
|
| 770 |
+
|
| 771 |
+
.heatmap-table td.score-cell.best {
|
| 772 |
+
background: rgba(163, 190, 140, 0.25);
|
| 773 |
+
color: #A3BE8C;
|
| 774 |
+
font-weight: 700;
|
| 775 |
+
}
|
| 776 |
+
|
| 777 |
+
.heatmap-table td.score-cell.good {
|
| 778 |
+
background: rgba(163, 190, 140, 0.12);
|
| 779 |
+
color: #A3BE8C;
|
| 780 |
+
}
|
| 781 |
+
|
| 782 |
+
.heatmap-table td.score-cell.mid {
|
| 783 |
+
background: rgba(235, 203, 139, 0.12);
|
| 784 |
+
color: #EBCB8B;
|
| 785 |
+
}
|
| 786 |
+
|
| 787 |
+
.heatmap-table td.score-cell.low {
|
| 788 |
+
background: rgba(208, 135, 112, 0.12);
|
| 789 |
+
color: #D08770;
|
| 790 |
+
}
|
| 791 |
+
|
| 792 |
+
.heatmap-table td.score-cell.worst {
|
| 793 |
+
background: rgba(191, 97, 106, 0.15);
|
| 794 |
+
color: #BF616A;
|
| 795 |
+
}
|
| 796 |
+
|
| 797 |
+
.heatmap-table td.score-cell.na {
|
| 798 |
+
color: #4C566A;
|
| 799 |
+
font-style: italic;
|
| 800 |
+
}
|
| 801 |
+
|
| 802 |
+
.heatmap-table tr.avg-row {
|
| 803 |
+
background: rgba(136, 192, 208, 0.08);
|
| 804 |
+
}
|
| 805 |
+
|
| 806 |
+
.heatmap-table tr.avg-row td.metric-name {
|
| 807 |
+
font-weight: 700;
|
| 808 |
+
color: #88C0D0;
|
| 809 |
+
background: rgba(136, 192, 208, 0.08);
|
| 810 |
+
}
|
| 811 |
+
|
| 812 |
+
/* === Buttons === */
|
| 813 |
+
button {
|
| 814 |
+
border-radius: 8px !important;
|
| 815 |
+
font-weight: 500 !important;
|
| 816 |
+
font-size: 0.95rem !important;
|
| 817 |
+
transition: all 0.15s ease !important;
|
| 818 |
+
}
|
| 819 |
+
|
| 820 |
+
button.primary {
|
| 821 |
+
background: #88C0D0 !important;
|
| 822 |
+
color: #2E3440 !important;
|
| 823 |
+
border: none !important;
|
| 824 |
+
}
|
| 825 |
+
|
| 826 |
+
button.primary:hover:not(:disabled) {
|
| 827 |
+
background: #8FBCBB !important;
|
| 828 |
+
}
|
| 829 |
+
|
| 830 |
+
button.secondary,
|
| 831 |
+
button[variant="secondary"] {
|
| 832 |
+
background: #434C5E !important;
|
| 833 |
+
color: #ECEFF4 !important;
|
| 834 |
+
border: 1px solid #4C566A !important;
|
| 835 |
+
}
|
| 836 |
+
|
| 837 |
+
button.secondary:hover:not(:disabled),
|
| 838 |
+
button[variant="secondary"]:hover:not(:disabled) {
|
| 839 |
+
background: #4C566A !important;
|
| 840 |
+
}
|
| 841 |
+
|
| 842 |
+
button:disabled {
|
| 843 |
+
opacity: 0.35 !important;
|
| 844 |
+
}
|
| 845 |
+
|
| 846 |
+
/* === Inputs === */
|
| 847 |
+
input[type="text"],
|
| 848 |
+
select {
|
| 849 |
+
background: #2E3440 !important;
|
| 850 |
+
border: 1px solid #4C566A !important;
|
| 851 |
+
border-radius: 8px !important;
|
| 852 |
+
color: #ECEFF4 !important;
|
| 853 |
+
font-size: 1rem !important;
|
| 854 |
+
}
|
| 855 |
+
|
| 856 |
+
input[type="text"]:focus,
|
| 857 |
+
select:focus {
|
| 858 |
+
border-color: #88C0D0 !important;
|
| 859 |
+
box-shadow: 0 0 0 3px rgba(136, 192, 208, 0.15) !important;
|
| 860 |
+
outline: none !important;
|
| 861 |
+
}
|
| 862 |
+
|
| 863 |
+
input::placeholder {
|
| 864 |
+
color: #4C566A !important;
|
| 865 |
+
}
|
| 866 |
+
|
| 867 |
+
/* === Accordion === */
|
| 868 |
+
.accordion {
|
| 869 |
+
background: #3B4252 !important;
|
| 870 |
+
border: 1px solid #434C5E !important;
|
| 871 |
+
border-radius: 10px !important;
|
| 872 |
+
margin-top: 1.5rem !important;
|
| 873 |
+
}
|
| 874 |
+
|
| 875 |
+
.accordion > .label-wrap {
|
| 876 |
+
background: transparent !important;
|
| 877 |
+
padding: 1rem 1.25rem !important;
|
| 878 |
+
color: #D8DEE9 !important;
|
| 879 |
+
font-size: 0.95rem !important;
|
| 880 |
+
}
|
| 881 |
+
|
| 882 |
+
.accordion > .wrap {
|
| 883 |
+
padding: 0.5rem 1.25rem 1.25rem !important;
|
| 884 |
+
color: #D8DEE9 !important;
|
| 885 |
+
font-size: 0.95rem !important;
|
| 886 |
+
line-height: 1.6 !important;
|
| 887 |
+
}
|
| 888 |
+
|
| 889 |
+
.accordion code {
|
| 890 |
+
background: #434C5E !important;
|
| 891 |
+
padding: 0.125rem 0.375rem !important;
|
| 892 |
+
border-radius: 4px !important;
|
| 893 |
+
font-family: 'JetBrains Mono', monospace !important;
|
| 894 |
+
font-size: 0.8rem !important;
|
| 895 |
+
color: #8FBCBB !important;
|
| 896 |
+
}
|
| 897 |
+
|
| 898 |
+
/* === Metrics section === */
|
| 899 |
+
.metrics-section {
|
| 900 |
+
margin-top: 1.5rem;
|
| 901 |
+
padding-top: 1.5rem;
|
| 902 |
+
border-top: 1px solid #434C5E;
|
| 903 |
+
}
|
| 904 |
+
|
| 905 |
+
.metrics-section h3 {
|
| 906 |
+
font-size: 0.85rem;
|
| 907 |
+
font-weight: 600;
|
| 908 |
+
color: #D8DEE9;
|
| 909 |
+
margin: 0 0 1rem 0;
|
| 910 |
+
text-transform: uppercase;
|
| 911 |
+
letter-spacing: 0.05em;
|
| 912 |
+
}
|
| 913 |
+
|
| 914 |
+
.metrics-grid {
|
| 915 |
+
display: grid;
|
| 916 |
+
grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
|
| 917 |
+
gap: 0.75rem;
|
| 918 |
+
}
|
| 919 |
+
|
| 920 |
+
.metric-card {
|
| 921 |
+
background: #3B4252;
|
| 922 |
+
border: 1px solid #434C5E;
|
| 923 |
+
border-radius: 8px;
|
| 924 |
+
overflow: hidden;
|
| 925 |
+
}
|
| 926 |
+
|
| 927 |
+
.metric-card-header {
|
| 928 |
+
display: flex;
|
| 929 |
+
justify-content: space-between;
|
| 930 |
+
align-items: center;
|
| 931 |
+
padding: 0.75rem 1rem;
|
| 932 |
+
cursor: pointer;
|
| 933 |
+
list-style: none;
|
| 934 |
+
}
|
| 935 |
+
|
| 936 |
+
.metric-card-header::-webkit-details-marker {
|
| 937 |
+
display: none;
|
| 938 |
+
}
|
| 939 |
+
|
| 940 |
+
.metric-card-name {
|
| 941 |
+
font-weight: 500;
|
| 942 |
+
font-size: 0.95rem;
|
| 943 |
+
color: #ECEFF4;
|
| 944 |
+
}
|
| 945 |
+
|
| 946 |
+
.metric-card-direction {
|
| 947 |
+
font-size: 0.8rem;
|
| 948 |
+
color: #D8DEE9;
|
| 949 |
+
}
|
| 950 |
+
|
| 951 |
+
.metric-card-direction .arrow {
|
| 952 |
+
color: #A3BE8C;
|
| 953 |
+
font-weight: 600;
|
| 954 |
+
}
|
| 955 |
+
|
| 956 |
+
.metric-card-body {
|
| 957 |
+
padding: 0.875rem 1.25rem;
|
| 958 |
+
border-top: 1px solid #434C5E;
|
| 959 |
+
font-size: 0.9rem;
|
| 960 |
+
color: #D8DEE9;
|
| 961 |
+
line-height: 1.5;
|
| 962 |
+
}
|
| 963 |
+
|
| 964 |
+
.metric-type-badge {
|
| 965 |
+
font-size: 0.65rem;
|
| 966 |
+
text-transform: uppercase;
|
| 967 |
+
letter-spacing: 0.05em;
|
| 968 |
+
padding: 0.15rem 0.4rem;
|
| 969 |
+
background: rgba(180, 142, 173, 0.2);
|
| 970 |
+
border: 1px solid rgba(180, 142, 173, 0.35);
|
| 971 |
+
border-radius: 4px;
|
| 972 |
+
color: #B48EAD;
|
| 973 |
+
font-family: 'JetBrains Mono', monospace;
|
| 974 |
+
}
|
| 975 |
+
|
| 976 |
+
/* === Scrollbar === */
|
| 977 |
+
::-webkit-scrollbar {
|
| 978 |
+
width: 8px;
|
| 979 |
+
height: 8px;
|
| 980 |
+
}
|
| 981 |
+
|
| 982 |
+
::-webkit-scrollbar-track {
|
| 983 |
+
background: #2E3440;
|
| 984 |
+
}
|
| 985 |
+
|
| 986 |
+
::-webkit-scrollbar-thumb {
|
| 987 |
+
background: #4C566A;
|
| 988 |
+
border-radius: 4px;
|
| 989 |
+
}
|
| 990 |
+
|
| 991 |
+
::-webkit-scrollbar-thumb:hover {
|
| 992 |
+
background: #5E81AC;
|
| 993 |
+
}
|
| 994 |
+
|
| 995 |
+
/* === Responsive === */
|
| 996 |
+
@media (max-width: 768px) {
|
| 997 |
+
.gradio-container {
|
| 998 |
+
padding: 1rem !important;
|
| 999 |
+
}
|
| 1000 |
+
|
| 1001 |
+
.scores-grid {
|
| 1002 |
+
grid-template-columns: repeat(2, 1fr);
|
| 1003 |
+
}
|
| 1004 |
+
}
|
| 1005 |
+
|
| 1006 |
+
/* === Overrides === */
|
| 1007 |
+
.gradio-container footer {
|
| 1008 |
+
display: none !important;
|
| 1009 |
+
}
|
| 1010 |
+
|
| 1011 |
+
.block {
|
| 1012 |
+
background: #3B4252 !important;
|
| 1013 |
+
}
|
| 1014 |
+
|
| 1015 |
+
.gradio-radio label {
|
| 1016 |
+
background: #434C5E !important;
|
| 1017 |
+
border: 1px solid #4C566A !important;
|
| 1018 |
+
color: #ECEFF4 !important;
|
| 1019 |
+
border-radius: 8px !important;
|
| 1020 |
+
font-size: 0.85rem !important;
|
| 1021 |
+
}
|
| 1022 |
+
|
| 1023 |
+
.gradio-radio label.selected {
|
| 1024 |
+
background: #88C0D0 !important;
|
| 1025 |
+
border-color: #88C0D0 !important;
|
| 1026 |
+
color: #2E3440 !important;
|
| 1027 |
+
}
|
| 1028 |
+
"""
|
| 1029 |
+
|
| 1030 |
+
|
| 1031 |
+
def format_leaderboard_header(selected_leaderboard, metadata):
    """Build the HTML banner shown above the currently selected leaderboard.

    Renders a centered prompt when nothing is selected, a title-only banner
    when the metadata carries no eval definitions, and otherwise a full
    banner with the organization, eval tags and a link to the source.
    """
    if not selected_leaderboard:
        # Nothing chosen yet: show a centered call-to-action instead of a banner.
        return """
        <div style="text-align: center; padding: 2rem 1rem; color: #D8DEE9;">
            <div style="font-size: 1.1rem;">Select a leaderboard to explore</div>
        </div>
        """

    if not (metadata and metadata.get("evals")):
        # Metadata missing or empty: fall back to a bare title banner.
        return f"""
        <div class="info-banner">
            <h3>{selected_leaderboard}</h3>
        </div>
        """

    src = metadata.get("source_info", {})
    organization = src.get("organization", "Unknown")
    source_url = src.get("url", "#")
    # One inline tag per eval name, in the metadata's insertion order.
    tag_markup = "".join(
        f'<span class="eval-tag">{eval_name}</span>' for eval_name in metadata["evals"]
    )

    return f"""
    <div class="info-banner">
        <div style="display: flex; justify-content: space-between; align-items: center; gap: 1rem;">
            <div style="display: flex; align-items: center; gap: 1rem; flex-wrap: wrap;">
                <h3 style="margin: 0;">{selected_leaderboard}</h3>
                <span style="color: #D8DEE9; font-size: 0.8rem;">by {organization}</span>
                <div class="eval-tags" style="margin: 0;">{tag_markup}</div>
            </div>
            <a href="{source_url}" target="_blank" style="
                font-size: 0.75rem;
                color: #88C0D0;
                text-decoration: none;
                padding: 0.375rem 0.75rem;
                border: 1px solid rgba(136, 192, 208, 0.4);
                border-radius: 6px;
                white-space: nowrap;
            ">Source β</a>
        </div>
    </div>
    """
|
| 1074 |
+
|
| 1075 |
+
|
| 1076 |
+
def format_metric_details(selected_leaderboard, metadata):
    """Render the collapsible metric-reference cards for a leaderboard.

    Returns an empty string when there is no selection or the metadata
    defines no evals.
    """
    if not selected_leaderboard or not metadata or not metadata.get("evals"):
        return ""

    pieces = [
        """
    <div class="metrics-section">
        <h3>Metric Reference</h3>
        <div class="metrics-grid">
    """
    ]

    for metric, spec in metadata.get("evals", {}).items():
        lower_is_better = bool(spec.get('lower_is_better'))
        direction = "Lower is better" if lower_is_better else "Higher is better"
        arrow = "β" if lower_is_better else "β"
        raw_type = spec.get('score_type')
        type_label = raw_type.upper() if raw_type else "β"

        # Extra detail line depends on the metric's score type.
        extra = ""
        if raw_type == "continuous" and spec.get('min_score') is not None:
            extra = f"Range: [{spec['min_score']} β {spec['max_score']}]"
        elif raw_type == "levels" and spec.get('level_names'):
            extra = f"Levels: {', '.join(str(l) for l in spec['level_names'])}"

        pieces.append(f"""
            <details class="metric-card">
                <summary class="metric-card-header">
                    <span class="metric-card-name">{metric}</span>
                    <span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>
                </summary>
                <div class="metric-card-body">
                    <div>{spec.get('description', 'No description')}</div>
                    <div style="display: flex; justify-content: space-between; align-items: center; margin-top: 0.5rem;">
                        <span style="font-size: 0.75rem; color: #D8DEE9;">{extra}</span>
                        <span class="metric-type-badge">{type_label}</span>
                    </div>
                </div>
            </details>
        """)

    pieces.append("</div></div>")
    return "".join(pieces)
|
| 1118 |
+
|
| 1119 |
+
|
| 1120 |
+
def format_model_card(model_name, model_data):
    """Render a detail card for one model across every leaderboard it appears in.

    Args:
        model_name: Display name of the model.
        model_data: Mapping of leaderboard name -> entry dict. Each entry is
            assumed to repeat the model-level fields "developer", "params"
            (billions) and "architecture", plus a "results" mapping of
            metric name -> score (or None) — TODO confirm against loader.

    Returns:
        HTML string; a "no results" placeholder when ``model_data`` is empty.
    """
    if not model_data:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try searching for a different model name</p>
        </div>
        """

    # Model-level metadata is repeated in every leaderboard entry; any one will do.
    first = next(iter(model_data.values()))
    developer = first.get("developer", "Unknown")
    params = first.get("params")
    arch = first.get("architecture", "Unknown")

    # Explicit None check so a (theoretical) 0 param count still renders as "0B".
    params_str = f"{params}B" if params is not None else "β"

    html = f"""
    <div class="model-card-container">
        <div class="model-card-header">
            <h2>{model_name}</h2>
            <div class="model-meta">
                <span><strong>Developer:</strong> {developer}</span>
                <span><strong>Parameters:</strong> {params_str}</span>
                <span><strong>Architecture:</strong> {arch}</span>
            </div>
        </div>
    """

    for leaderboard_name, data in model_data.items():
        results = data.get("results", {})
        if not results:
            continue

        scores = [v for v in results.values() if v is not None]
        avg = sum(scores) / len(scores) if scores else None
        # BUG FIX: was `if avg`, which rendered a legitimate 0.0 average as
        # the "β" placeholder; only None means "no data".
        avg_str = f"{avg:.2f}" if avg is not None else "β"

        html += f"""
        <div class="leaderboard-section">
            <div class="leaderboard-section-header">
                <h3>{leaderboard_name}</h3>
                <span class="lb-avg">Avg: <strong>{avg_str}</strong></span>
            </div>
            <div class="scores-grid">
        """

        # Highest scores first; None sorts as 0 so missing values sink down.
        sorted_results = sorted(results.items(), key=lambda x: x[1] if x[1] is not None else 0, reverse=True)

        for i, (metric_name, score) in enumerate(sorted_results):
            score_display = f"{score:.2f}" if score is not None else "β"
            # Only the top-ranked metric gets the highlight treatment.
            highlight_class = "highlight" if i == 0 else ""

            html += f"""
                <div class="score-item {highlight_class}">
                    <div class="score-label">{metric_name}</div>
                    <div class="score-value">{score_display}</div>
                </div>
            """

        html += "</div></div>"

    html += "</div>"
    return html
|
| 1184 |
+
|
| 1185 |
+
|
| 1186 |
+
# Palette cycled to color-code each selected model in the comparison view.
_MODEL_COLORS = ['#88C0D0', '#A3BE8C', '#EBCB8B', '#D08770', '#B48EAD', '#8FBCBB', '#81A1C1', '#BF616A']


def _overall_average(model_leaderboards):
    """Mean of every non-None score a model has across all leaderboards, or None."""
    scores = []
    for lb_data in model_leaderboards.values():
        scores.extend(v for v in lb_data.get("results", {}).values() if v is not None)
    return sum(scores) / len(scores) if scores else None


def _heat_class(score, min_score, max_score, score_range, n_valid):
    """CSS class bucketing a score by its relative position within the row's range."""
    if n_valid <= 1:
        # A single comparable value: nothing to rank against.
        return ""
    if score == max_score:
        return "best"
    pct = (score - min_score) / score_range if score_range > 0 else 1
    if pct >= 0.75:
        return "good"
    if pct >= 0.5:
        return "mid"
    if pct >= 0.25:
        return "low"
    return "worst"


def _comparison_summary_html(selected_models, model_data_dict):
    """Opening container plus one summary card per selected model.

    NOTE: deliberately leaves the outer `comparison-container` div open;
    format_model_comparison appends the per-leaderboard cards and closes it.
    """
    html = """
    <div class="comparison-container">
        <div class="comparison-summary">
            <h2>Model Comparison</h2>
            <div class="summary-cards">
    """

    for i, model_name in enumerate(selected_models):
        color = _MODEL_COLORS[i % len(_MODEL_COLORS)]
        data = model_data_dict.get(model_name)
        avg = _overall_average(data) if data is not None else None
        avg_str = f"{avg:.2f}" if avg is not None else "β"

        # Developer is duplicated per leaderboard entry; read it from any one.
        model_info = list(data.values()) if data else []
        developer = model_info[0].get("developer", "Unknown") if model_info else "Unknown"

        html += f"""
            <div class="summary-card" style="border-left: 4px solid {color};">
                <div class="summary-card-header">
                    <span class="model-dot" style="background: {color};"></span>
                    <span class="model-name">{model_name}</span>
                </div>
                <div class="summary-card-body">
                    <div class="summary-stat">
                        <span class="stat-label">Developer</span>
                        <span class="stat-value">{developer}</span>
                    </div>
                    <div class="summary-stat primary">
                        <span class="stat-label">Overall Avg</span>
                        <span class="stat-value large">{avg_str}</span>
                    </div>
                </div>
            </div>
        """

    html += """
            </div>
        </div>
    """
    return html


def _leaderboard_heatmap_html(leaderboard_name, selected_models, model_data_dict):
    """One comparison card for a single leaderboard: header plus heat-map table.

    Returns "" when no selected model reports any metric on this leaderboard.
    """
    # Union of metric names reported by any selected model on this board.
    metrics = set()
    for model_data in model_data_dict.values():
        if leaderboard_name in model_data:
            metrics.update(model_data[leaderboard_name].get("results", {}).keys())
    metrics = sorted(metrics)
    if not metrics:
        return ""

    # Per-model average over this leaderboard's non-None scores (top row).
    model_avgs = {}
    for model_name in selected_models:
        board = model_data_dict.get(model_name, {}).get(leaderboard_name)
        if board is not None:
            scores = [v for v in board.get("results", {}).values() if v is not None]
            model_avgs[model_name] = sum(scores) / len(scores) if scores else None

    html = f"""
    <div class="leaderboard-comparison-card">
        <div class="lb-card-header">
            <h3>{leaderboard_name}</h3>
        </div>
        <div class="lb-card-body">
    """

    html += '<div class="heatmap-table-wrapper">'
    html += '<table class="heatmap-table">'

    # Header row: one column per model, long names truncated with a tooltip.
    html += '<thead><tr><th class="metric-header">Metric</th>'
    for model_name in selected_models:
        short_name = model_name if len(model_name) <= 20 else model_name[:18] + "β¦"
        html += f'<th class="model-header" title="{model_name}">{short_name}</th>'
    html += '</tr></thead>'

    html += '<tbody>'

    # Average row first, highlighting the single best average (when comparable).
    html += '<tr class="avg-row"><td class="metric-name">Average</td>'
    valid_avgs = [model_avgs.get(m) for m in selected_models if model_avgs.get(m) is not None]
    max_avg = max(valid_avgs) if valid_avgs else None

    for model_name in selected_models:
        avg = model_avgs.get(model_name)
        if avg is not None:
            cell_class = "best" if avg == max_avg and len(valid_avgs) > 1 else ""
            html += f'<td class="score-cell {cell_class}">{avg:.2f}</td>'
        else:
            html += '<td class="score-cell na">β</td>'
    html += '</tr>'

    # One row per metric, heat-colored by each score's position in the row range.
    for metric_name in metrics:
        html += f'<tr><td class="metric-name">{metric_name}</td>'

        metric_scores = {}
        for model_name in selected_models:
            board = model_data_dict.get(model_name, {}).get(leaderboard_name)
            if board is not None:
                metric_scores[model_name] = board.get("results", {}).get(metric_name)

        valid_scores = [v for v in metric_scores.values() if v is not None]
        if valid_scores:
            max_score = max(valid_scores)
            min_score = min(valid_scores)
            # Collapse a zero-width range to 1 so the pct division is safe.
            score_range = max_score - min_score if max_score > min_score else 1
        else:
            max_score = min_score = score_range = None

        for model_name in selected_models:
            score = metric_scores.get(model_name)
            if score is not None and score_range is not None:
                cell_class = _heat_class(score, min_score, max_score, score_range, len(valid_scores))
                html += f'<td class="score-cell {cell_class}">{score:.2f}</td>'
            else:
                html += '<td class="score-cell na">β</td>'

        html += '</tr>'

    html += '</tbody></table></div>'

    html += """
        </div>
    </div>
    """
    return html


def format_model_comparison(selected_models, all_results):
    """Render a side-by-side comparison view for several models.

    Args:
        selected_models: Ordered list of model names picked by the user.
        all_results: Mapping of model name -> {leaderboard -> entry dict};
            each entry dict has a "results" mapping of metric -> score/None.

    Returns:
        HTML string: per-model summary cards followed by one heat-map table
        per leaderboard; a placeholder when nothing is selected or matched.
    """
    if not selected_models or not all_results:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """

    # Keep only models we actually have data for, preserving selection order.
    model_data_dict = {m: all_results[m] for m in selected_models if m in all_results}

    if not model_data_dict:
        return """
        <div class="no-results">
            <h3>No data found for selected models</h3>
            <p>Try selecting different models</p>
        </div>
        """

    all_leaderboards = sorted({lb for data in model_data_dict.values() for lb in data})

    html = _comparison_summary_html(selected_models, model_data_dict)
    for leaderboard_name in all_leaderboards:
        html += _leaderboard_heatmap_html(leaderboard_name, selected_models, model_data_dict)
    html += "</div>"  # closes the comparison-container opened by the summary
    return html
|