"""Theme, stylesheet, HTML fragment builders and radar plot for the leaderboard UI.

NOTE(review): the HTML markup inside the string literals of this module was
rebuilt from the class names used by the stylesheet in ``get_custom_css``
(placeholder names where the stylesheet defines none); confirm tag and class
names against the rendered UI.
"""

import gradio as gr
import plotly.graph_objects as go

from data_loader import get_eval_metadata


def get_theme():
    """Return the light Gradio theme (blue primary hue, slate neutral hue)."""
    return gr.themes.Base(
        primary_hue="blue",
        neutral_hue="slate",
    ).set(
        body_background_fill="#f5f5f5",
        body_text_color="#0a0a0a",
        body_text_color_subdued="#525252",
        block_background_fill="#ffffff",
        block_border_color="#e5e5e5",
        block_label_text_color="#525252",
        block_title_text_color="#0a0a0a",
        input_background_fill="#ffffff",
        input_border_color="#e5e5e5",
        button_primary_background_fill="#3b82f6",
        button_primary_text_color="#ffffff",
        button_secondary_background_fill="#ffffff",
        button_secondary_text_color="#0a0a0a",
        button_secondary_border_color="#e5e5e5",
    )


def get_custom_css():
    """Return the custom CSS applied on top of the Gradio theme.

    Later rules intentionally repeat some earlier selectors with
    ``!important`` so they win over Gradio's built-in styles.
    """
    return """
    :root { --brand-black: #0a0a0a; --brand-dark: #1a1a1a; --brand-gray: #2a2a2a; --brand-light: #f5f5f5; --brand-accent: #3b82f6; }
    body, .gradio-container { background: var(--brand-light) !important; color: var(--brand-black) !important; }
    .gradio-container { max-width: 100%; padding: 1.25rem 2.5rem 2rem; }
    .gradio-container *:focus-visible { outline: none !important; box-shadow: inset 0 0 0 1.5px #3b82f6 !important; }
    .gradio-container .block, .gradio-container .wrap, .gradio-container .form, .gradio-container .container { box-shadow: none !important; }
    .app-header { display: flex; align-items: center; gap: 1rem; margin-bottom: 1.5rem; padding: 1rem 1.25rem; background: #ffffff; border: 1px solid #e5e5e5; border-radius: 12px; }
    .logo-mark { width: 48px; height: 48px; border-radius: 12px; display: flex; align-items: center; justify-content: center; font-weight: 800; font-size: 1.1rem; color: #ffffff; }
    .brand h1 { margin: 0; font-size: 1.5rem; font-weight: 700; color: #0a0a0a; }
    .brand .tagline { color: #525252; font-size: 0.9rem; }
    .header-right { margin-left: auto; }
    .version-badge { background: rgba(59, 130, 246, 0.1); border: 1px solid #3b82f6; border-radius: 8px; padding: 0.35rem 0.6rem; font-size: 0.78rem; color: #3b82f6; }
    .info-banner { background: #ffffff; border: 1px solid #e5e5e5; border-left: 3px solid #3b82f6; border-radius: 10px; padding: 1rem 1.25rem; margin-bottom: 1rem; }
    .info-banner h3 { margin: 0; font-weight: 600; color: #0a0a0a; }
    .leaderboard-header { display: flex; justify-content: space-between; align-items: center; gap: 1rem; flex-wrap: wrap; margin-bottom: 0.4rem; }
    .lb-title { font-size: 1.2rem; font-weight: 700; color: #0a0a0a; margin: 0; line-height: 1.35; }
    .lb-by { font-size: 0.9rem; color: #525252; margin: 0.1rem 0 0 0; line-height: 1.35; }
    .lb-meta { display: flex; flex-direction: column; gap: 0.1rem; }
    .eval-tags { display: flex; flex-wrap: wrap; gap: 0.4rem; }
    .eval-tags { margin-top: 0.35rem; }
    .eval-tag { border-radius: 10px; padding: 0.3rem 0.65rem; font-size: 0.82rem; font-weight: 600; color: #0a0a0a; border: 1px solid #e5e5e5; background: #f8fafc; }
    .eval-tag:nth-child(5n + 1) { border-color: #3b82f6; background: rgba(59, 130, 246, 0.12); color: #0a1d4a; }
    .eval-tag:nth-child(5n + 2) { border-color: #10b981; background: rgba(16, 185, 129, 0.12); color: #0b3b2b; }
    .eval-tag:nth-child(5n + 3) { border-color: #f97316; background: rgba(249, 115, 22, 0.12); color: #4b1f07; }
    .eval-tag:nth-child(5n + 4) { border-color: #8b5cf6; background: rgba(139, 92, 246, 0.12); color: #2f0f5a; }
    .eval-tag:nth-child(5n) { border-color: #06b6d4; background: rgba(6, 182, 212, 0.12); color: #053f46; }
    .source-link { font-size: 0.75rem; color: #3b82f6; text-decoration: none; padding: 0.375rem 0.75rem; border: 1px solid #3b82f6; border-radius: 6px; }
    .source-link:hover { background: rgba(59, 130, 246, 0.1); }
    .pagination-bar { margin-top: 0.75rem; padding: 0.85rem 0 0.25rem; display: flex; justify-content: center; align-items: center; gap: 0.85rem; }
    .page-info { font-size: 1rem; min-width: 80px; text-align: center; color: #0a0a0a; }
    .metrics-section { margin-top: 1.25rem; padding: 1.25rem 1rem; border-top: 1px solid #e5e5e5; }
    .metrics-section h3 { font-size: 0.9rem; font-weight: 700; color: #525252; margin: 0 0 0.9rem 0; text-transform: uppercase; letter-spacing: 0.05em; }
    .metrics-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 0.75rem; }
    @media (max-width: 768px) {
        .metrics-grid { grid-template-columns: repeat(auto-fill, minmax(160px, 1fr)); gap: 0.5rem; }
        .metric-card-header { padding: 0.65rem 0.8rem; flex-direction: column; align-items: flex-start; gap: 0.25rem; }
        .metric-card-body { padding: 0.65rem 0.8rem; font-size: 0.85rem; }
        .metrics-section { padding: 1rem 0.5rem; }
    }
    .metrics-grid .metric-card { align-self: start; }
    .metric-card { background: #ffffff; border: 1px solid #e5e5e5; border-radius: 10px; overflow: hidden; position: relative; }
    .metric-card-header { display: flex; justify-content: space-between; align-items: center; padding: 0.85rem 1rem; cursor: pointer; }
    .metric-card-header:hover { background: #f9f9f9; }
    .metric-card-name { font-weight: 600; color: #0a0a0a; }
    .metric-card-direction { font-size: 0.82rem; color: #525252; }
    .metric-card-direction .arrow { color: #22c55e; font-weight: 700; }
    .metric-card-body { display: none; padding: 0.85rem 1rem; border-top: 1px solid #e5e5e5; color: #0a0a0a; }
    .metric-card input.metric-toggle { display: none; }
    .metric-card input.metric-toggle:checked ~ .metric-card-body { display: block; }
    .metric-card input.metric-toggle:checked ~ .metric-card-header { background: #f9f9f9; border-bottom: 1px solid #e5e5e5; }
    .metric-card input.metric-toggle:checked ~ .metric-card-header .metric-card-name, .metric-card input.metric-toggle:checked ~ .metric-card-header .metric-card-direction { color: #0a0a0a; }
    /* Ensure multiple cards can be open at once and are closable */
    .metric-card input.metric-toggle:not(:checked) ~ .metric-card-body { display: none; }
    .metric-type-badge { font-size: 0.68rem; text-transform: uppercase; padding: 0.2rem 0.45rem; background: rgba(59, 130, 246, 0.1); border: 1px solid #3b82f6; border-radius: 6px; color: #3b82f6; }
    .heatmap-table { width: 100%; border-collapse: collapse; font-size: 0.85rem; }
    .heatmap-table th { padding: 0.55rem 0.65rem; font-weight: 700; font-size: 0.72rem; text-transform: uppercase; color: #525252; background: #f5f5f5; }
    .heatmap-table td { padding: 0.45rem 0.65rem; text-align: center; border-bottom: 1px solid #e5e5e5; }
    .heatmap-table td.metric-name { text-align: left; font-weight: 600; color: #0a0a0a; }
    .heatmap-table td.score-cell { font-weight: 600; }
    .heatmap-table td.score-cell.best { background: rgba(34, 197, 94, 0.15); color: #16a34a; }
    .heatmap-table td.score-cell.good { background: rgba(34, 197, 94, 0.08); color: #16a34a; }
    .heatmap-table td.score-cell.mid { background: rgba(234, 179, 8, 0.15); color: #ca8a04; }
    .heatmap-table td.score-cell.low { background: rgba(239, 68, 68, 0.12); color: #dc2626; }
    .heatmap-table td.score-cell.worst { background: rgba(239, 68, 68, 0.18); color: #b91c1c; }
    .heatmap-table td.score-cell.na { color: #525252; font-style: italic; }
    /* Model chips */
    .selected-models-group label { display: inline-flex !important; background: #ffffff; border: 1px solid #e5e5e5; border-radius: 16px; padding: 0.35rem 0.85rem; font-size: 0.88rem; color: #0a0a0a; cursor: pointer; margin: 0.18rem 0.32rem 0.18rem 0 !important; }
    .selected-models-group input[type="checkbox"] { display: none; }
    .no-results { text-align: center; padding: 2.5rem 1rem; color: #525252; }
    .gradio-container footer { display: none; }
    .block, .form, .wrap, .container { background: #ffffff !important; }
    body, .gradio-container, p, span, div, h1, h2, h3, h4, h5, h6, label, td, th { color: #0a0a0a !important; }
    .label-wrap span, .prose, .markdown, .prose p, .prose li, .markdown p, .markdown li { color: #525252 !important; }
    input, textarea, select { background: #ffffff !important; color: #0a0a0a !important; border: 1px solid #e5e5e5 !important; border-radius: 8px !important; }
    input::placeholder, textarea::placeholder { color: #a1a1a1 !important; }
    input:focus, textarea:focus, select:focus { border-color: #3b82f6 !important; outline: none !important; box-shadow: inset 0 0 0 1.5px #3b82f6 !important; }
    select, .wrap select, .wrap input, input[type="text"], textarea { min-height: 44px !important; padding: 0.55rem 0.75rem !important; font-size: 0.96rem !important; }
    button { border-radius: 8px !important; font-weight: 500 !important; transition: all 0.15s ease !important; }
    button.primary, button[variant="primary"] { background: #3b82f6 !important; color: #ffffff !important; border: none !important; }
    button.primary:hover, button[variant="primary"]:hover { background: #2563eb !important; }
    button.secondary, button[variant="secondary"], button:not(.primary):not([variant="primary"]) { background: #ffffff !important; color: #0a0a0a !important; border: 1px solid #e5e5e5 !important; }
    button.secondary:hover, button[variant="secondary"]:hover { border-color: #3b82f6 !important; background: #f5f5f5 !important; }
    .tab-nav, .tabs { border-bottom: 1px solid #e5e5e5 !important; }
    .tab-nav button, .tabs button { color: #525252 !important; background: transparent !important; border: none !important; border-bottom: 2px solid transparent !important; }
    .tab-nav button.selected, .tabs button.selected { color: #3b82f6 !important; border-bottom-color: #3b82f6 !important; }
    .wrap, .secondary-wrap, .primary-wrap { background: transparent !important; border: none !important; border-radius: 0 !important; box-shadow: none !important; padding: 0 !important; }
    ul[role="listbox"], .dropdown, .options { background: #ffffff !important; border: 1px solid #e5e5e5 !important; border-radius: 8px !important; box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important; }
    ul[role="listbox"] li, .dropdown li, .options li { color: #0a0a0a !important; }
    ul[role="listbox"] li:hover, .dropdown li:hover, .options li:hover { background: #f5f5f5 !important; }
    ul[role="listbox"] li.active, .dropdown li.active, .options li.active { background: #f5f5f5 !important; color: #0a0a0a !important; }
    ul[role="listbox"] li.selected, .dropdown li.selected { background: rgba(59, 130, 246, 0.1) !important; color: #3b82f6 !important; }
    .accordion { border: 1px solid #e5e5e5 !important; border-radius: 8px !important; background: #ffffff !important; }
    .accordion > button { color: #0a0a0a !important; }
    .selected-models-group label, .checkbox-group label { display: inline-flex !important; background: #ffffff; border: 1px solid #e5e5e5; border-radius: 20px !important; padding: 0.4rem 0.9rem !important; font-size: 0.88rem !important; color: #0a0a0a !important; cursor: pointer !important; margin: 0.2rem !important; transition: all 0.15s ease !important; }
    .selected-models-group label:hover, .checkbox-group label:hover { border-color: #3b82f6 !important; background: #f5f5f5 !important; }
    .selected-models-group input[type="checkbox"], .checkbox-group input[type="checkbox"] { display: none !important; }
    table { width: 100% !important; border-collapse: collapse !important; background: #ffffff !important; }
    table th { background: #f5f5f5 !important; color: #525252 !important; font-weight: 600 !important; text-transform: uppercase !important; font-size: 0.75rem !important; padding: 0.75rem !important; border-bottom: 1px solid #e5e5e5 !important; text-align: left !important; }
    table td { padding: 0.65rem 0.75rem !important; border-bottom: 1px solid #e5e5e5 !important; color: #0a0a0a !important; }
    table tr:hover td { background: #f9f9f9 !important; }
    .dataframe { background: #ffffff !important; border: 1px solid #e5e5e5 !important; box-shadow: none !important; border-radius: 8px !important; overflow: hidden !important; }
    .dataframe table { width: 100% !important; border-collapse: collapse !important; font-size: 0.75rem !important; table-layout: auto !important; background: #ffffff !important; }
    .dataframe thead, .dataframe thead tr { background: #ffffff !important; position: sticky !important; top: 0 !important; z-index: 10 !important; }
    .dataframe thead th { padding: 0.875rem 1rem !important; font-weight: 700 !important; font-size: 0.75rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; color: #0a0a0a !important; border-bottom: 2px solid #e5e5e5 !important; border-top: none !important; text-align: left !important; background: #ffffff !important; white-space: nowrap !important; border-radius: 0 !important; }
    .dataframe thead th span, .dataframe thead th div, .dataframe thead th button { background: transparent !important; border: none !important; border-radius: 0 !important; box-shadow: none !important; margin: 0 !important; outline: none !important; }
    .dataframe thead th span[role="button"], .dataframe thead th span[class*="svelte"] { background: transparent !important; border: none !important; box-shadow: none !important; outline: none !important; padding: 0 !important; width: auto !important; }
    /* Also target the SVG icon if it exists to ensure it doesn't have a background */
    .dataframe thead th svg { background: transparent !important; box-shadow: none !important; }
    .dataframe thead th span:hover, .dataframe thead th span[role="button"]:hover, .dataframe thead th span[class*="svelte"]:hover, .dataframe thead th button:hover { background: transparent !important; border: none !important; box-shadow: none !important; color: #3b82f6 !important; }
    .token { background-color: rgba(59, 130, 246, 0.12) !important; border: 1px solid rgba(59, 130, 246, 0.3) !important; color: #1e3a8a !important; border-radius: 6px !important; padding: 2px 8px !important; gap: 4px !important; }
    .token-remove { background-color: rgba(255, 255, 255, 0.4) !important; border: 1px solid rgba(30, 58, 138, 0.5) !important; /* Dark blue outline */ color: #1e3a8a !important; border-radius: 4px !important; margin-left: 6px !important; padding: 1px !important; opacity: 0.9 !important; min-width: 18px !important; min-height: 18px !important; display: flex !important; align-items: center !important; justify-content: center !important; }
    .token-remove svg { width: 12px !important; height: 12px !important; }
    .token-remove:hover { background-color: #1e3a8a !important; color: #ffffff !important; border-color: #1e3a8a !important; }
    .selector-item { border-radius: 6px !important; }
    .gradio-container .token { box-shadow: none !important; font-weight: 500 !important; }
    .gradio-container .token span { color: #1e3a8a !important; }
    .dataframe tbody, .dataframe tbody tr { background: #ffffff !important; }
    .dataframe tbody tr { border-bottom: 1px solid #e5e5e5 !important; }
    .dataframe tbody tr:hover { background: #f9f9f9 !important; }
    .dataframe tbody td { padding: 0.75rem 1rem !important; color: #0a0a0a !important; background: #ffffff !important; border: none !important; border-bottom: 1px solid #e5e5e5 !important; }
    .dataframe tbody td:first-child { font-weight: 700 !important; color: #0a0a0a !important; white-space: normal !important; word-break: break-word !important; max-width: 400px; min-width: 250px; }
    .dataframe tbody td:not(:first-child) { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important; text-align: left !important; white-space: nowrap !important; min-width: 80px !important; }
    .dataframe td:nth-child(2), .dataframe th:nth-child(2) { max-width: 220px; min-width: 140px; }
    .column-selector-dropdown { min-width: 300px; }
    .column-selector-dropdown .wrap { flex-wrap: nowrap !important; overflow-x: auto !important; gap: 0.25rem !important; padding: 0.5rem !important; }
    .column-selector-dropdown .wrap input { width: 100% !important; padding-left: 0.5rem !important; border: none !important; box-shadow: none !important; }
    .heatmap-table { border: 1px solid #e5e5e5 !important; border-radius: 8px !important; overflow: hidden !important; }
    .heatmap-table th { background: #f5f5f5 !important; color: #525252 !important; padding: 0.6rem 0.75rem !important; font-size: 0.72rem !important; border-bottom: 2px solid #e5e5e5 !important; }
    .heatmap-table td { padding: 0.5rem 0.75rem !important; border-bottom: 1px solid #e5e5e5 !important; }
    .heatmap-table td.metric-name { background: #f5f5f5 !important; font-weight: 600 !important; }
    .heatmap-table td.score-cell.best { background: rgba(34, 197, 94, 0.2) !important; color: #15803d !important; }
    .heatmap-table td.score-cell.good { background: rgba(34, 197, 94, 0.1) !important; color: #16a34a !important; }
    .heatmap-table td.score-cell.mid { background: rgba(234, 179, 8, 0.15) !important; color: #a16207 !important; }
    .heatmap-table td.score-cell.low { background: rgba(239, 68, 68, 0.12) !important; color: #dc2626 !important; }
    .heatmap-table td.score-cell.worst { background: rgba(239, 68, 68, 0.2) !important; color: #b91c1c !important; }
    .heatmap-table td.score-cell.na { color: #a1a1a1 !important; font-style: italic !important; }
    .gradio-container footer { display: none !important; }
    ::-webkit-scrollbar { width: 8px; height: 8px; }
    ::-webkit-scrollbar-track { background: #f5f5f5; }
    ::-webkit-scrollbar-thumb { background: #d4d4d4; border-radius: 4px; }
    ::-webkit-scrollbar-thumb:hover { background: #a1a1a1; }
    """


def format_leaderboard_header(selected_leaderboard, metadata):
    """Build the HTML header shown above a leaderboard table.

    Args:
        selected_leaderboard: Name of the chosen leaderboard; falsy when
            nothing is selected.
        metadata: Mapping that may hold ``"evals"`` (eval name -> info) and
            ``"source_info"`` (``"organization"``, ``"url"``); may be falsy.

    Returns:
        An HTML string: a placeholder when nothing is selected, a bare title
        banner when no eval metadata is available, otherwise the full header
        with organization, source link and one tag per eval (sorted by name).
    """
    if not selected_leaderboard:
        return '<div class="no-results">Select a leaderboard to explore</div>'
    if not metadata or not metadata.get("evals"):
        return f'<div class="info-banner"><h3>{selected_leaderboard}</h3></div>'

    source_info = metadata.get("source_info", {})
    org = source_info.get("organization", "Unknown")
    url = source_info.get("url", "#")
    eval_tags = "".join(
        f'<span class="eval-tag">{name}</span>' for name in sorted(metadata["evals"])
    )
    return f'''
<div class="leaderboard-header">
<div class="lb-meta">
<div class="lb-title">{selected_leaderboard}</div>
<div class="lb-by">By {org}</div>
</div>
<a class="source-link" href="{url}" target="_blank">Source →</a>
</div>
<div class="eval-tags">{eval_tags}</div>
'''


def format_metric_details(selected_leaderboard, metadata):
    """Build the collapsible "Metric Reference" section for a leaderboard.

    One card is emitted per entry of ``metadata["evals"]``; each card is
    toggled by a hidden checkbox so several cards can be open at once.

    Args:
        selected_leaderboard: Name of the chosen leaderboard.
        metadata: Mapping with an ``"evals"`` dict whose values may carry
            ``description``, ``score_type``, ``lower_is_better``,
            ``min_score``/``max_score`` and ``level_names``.

    Returns:
        An HTML string, or ``""`` when there is nothing to show.
    """
    if not selected_leaderboard or not metadata or not metadata.get("evals"):
        return ""

    cards = []
    for i, (eval_name, info) in enumerate(metadata.get("evals", {}).items()):
        raw_type = info.get("score_type")
        # `or ""` keeps an explicit None score_type from crashing .upper().
        score_type = (raw_type or "").upper() or "—"
        lower_is_better = info.get("lower_is_better")
        direction = "Lower is better" if lower_is_better else "Higher is better"
        arrow = "↓" if lower_is_better else "↑"

        details = ""
        if raw_type == "continuous" and info.get("min_score") is not None:
            # .get avoids a KeyError when only min_score is provided.
            details = f"Range: [{info['min_score']} – {info.get('max_score')}]"
        elif raw_type == "levels" and info.get("level_names"):
            details = f"Levels: {', '.join(str(level) for level in info['level_names'])}"

        card_id = f"mc{i}"
        cards.append(f'''
<div class="metric-card">
<input type="checkbox" id="{card_id}" class="metric-toggle">
<label for="{card_id}" class="metric-card-header">
<span class="metric-card-name">{eval_name}</span>
<span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>
</label>
<div class="metric-card-body">
<div>{info.get('description', 'No description')}</div>
<div>{details} <span class="metric-type-badge">{score_type}</span></div>
</div>
</div>
''')

    cards_html = "".join(cards)
    return f'''
<div class="metrics-section">
<h3>Metric Reference</h3>
<div class="metrics-grid">
{cards_html}
</div>
</div>
'''


def format_model_card(model_name, model_data):
    """Build an HTML card summarising one model across leaderboards.

    Args:
        model_name: Display name of the model.
        model_data: Mapping leaderboard name -> dict with optional
            ``developer``, ``params``, ``architecture`` and ``results``
            (metric name -> score or ``None``).

    Returns:
        An HTML string; a "no results" placeholder when ``model_data`` is
        empty. Leaderboards without results are skipped. Metrics are listed
        by descending score with missing scores last.
    """
    if not model_data:
        return (
            '<div class="no-results"><h3>No results found</h3>'
            '<p>Try a different model name</p></div>'
        )

    # Model-level fields are read from the first leaderboard entry.
    first = next(iter(model_data.values()))
    developer = first.get("developer", "Unknown")
    params = first.get("params")
    arch = first.get("architecture", "Unknown")
    params_str = f"{params}B" if params else "—"

    parts = [f'''
<div class="model-card">
<h2>{model_name}</h2>
<p>Developer: {developer} · Params: {params_str} · Arch: {arch}</p>
''']

    for leaderboard_name, data in model_data.items():
        results = data.get("results", {})
        if not results:
            continue

        scores = [v for v in results.values() if v is not None]
        avg = sum(scores) / len(scores) if scores else None
        # `is not None` so a genuine 0.00 average is displayed, not hidden.
        avg_str = f"{avg:.2f}" if avg is not None else "—"

        parts.append(
            f'<div class="model-card-section">'
            f'<h3>{leaderboard_name} (avg: {avg_str})</h3>'
        )
        parts.append('<div class="model-card-metrics">')
        ranked = sorted(
            results.items(),
            # Missing scores sort after every real score, including negatives.
            key=lambda item: item[1] if item[1] is not None else float("-inf"),
            reverse=True,
        )
        for metric_name, score in ranked:
            score_display = f"{score:.2f}" if score is not None else "—"
            parts.append(
                f'<div class="model-card-metric">{metric_name}: {score_display}</div>'
            )
        parts.append('</div></div>')

    parts.append('</div>')
    return "".join(parts)


def _score_class(score, min_s, max_s, n_valid):
    """Return the heat-map CSS class for *score* relative to its row's range."""
    if n_valid <= 1:
        # A single value cannot be ranked against anything.
        return ""
    if score == max_s:
        return "best"
    if max_s > min_s:
        pct = (score - min_s) / (max_s - min_s)
        if pct >= 0.75:
            return "good"
        if pct >= 0.5:
            return "mid"
        if pct >= 0.25:
            return "low"
        return "worst"
    return ""


def format_model_comparison(selected_models, all_results):
    """Build heat-map comparison tables, one per leaderboard.

    Args:
        selected_models: Model names to compare, in display order.
        all_results: Mapping model name -> leaderboard name -> dict with a
            ``results`` mapping (metric name -> score or ``None``).

    Returns:
        An HTML string. Each table row is a metric and each column a selected
        model; cells are classed ``best``/``good``/``mid``/``low``/``worst``
        by where the score falls between the row's minimum and maximum
        (higher is treated as better), or ``na`` when the score is missing.
    """
    if not selected_models or not all_results:
        return (
            '<div class="no-results"><h3>Select models to compare</h3>'
            '<p>Choose models from the dropdown</p></div>'
        )

    model_data_dict = {
        name: all_results[name] for name in selected_models if name in all_results
    }
    if not model_data_dict:
        return '<div class="no-results"><h3>No data found</h3></div>'

    all_leaderboards = sorted({lb for md in model_data_dict.values() for lb in md})

    parts = ['<div class="comparison">']
    for leaderboard_name in all_leaderboards:
        metric_names = set()
        for md in model_data_dict.values():
            if leaderboard_name in md:
                metric_names.update(md[leaderboard_name].get("results", {}))
        metrics = sorted(metric_names)
        if not metrics:
            continue

        parts.append(f'<div class="comparison-section"><h3>{leaderboard_name}</h3>')
        parts.append('<table class="heatmap-table"><thead><tr><th>Metric</th>')
        for model_name in selected_models:
            short = model_name[:20] + "…" if len(model_name) > 20 else model_name
            parts.append(f'<th>{short}</th>')
        parts.append('</tr></thead><tbody>')

        for metric_name in metrics:
            parts.append(f'<tr><td class="metric-name">{metric_name}</td>')
            scores = {}
            for m in selected_models:
                if m in model_data_dict and leaderboard_name in model_data_dict[m]:
                    lb_results = model_data_dict[m][leaderboard_name].get("results", {})
                    scores[m] = lb_results.get(metric_name)

            valid = [v for v in scores.values() if v is not None]
            max_s = max(valid) if valid else None
            min_s = min(valid) if valid else None

            for model_name in selected_models:
                score = scores.get(model_name)
                if score is None:
                    parts.append('<td class="score-cell na">—</td>')
                    continue
                # Ranking depends only on how many scores exist, so rows whose
                # minimum or maximum is 0 are still colour-coded.
                cls = _score_class(score, min_s, max_s, len(valid))
                parts.append(f'<td class="score-cell {cls}">{score:.2f}</td>')
            parts.append('</tr>')

        parts.append('</tbody></table></div>')

    parts.append('</div>')
    return "".join(parts)


def _normalization_bounds(meta, observed_max):
    """Return ``(min, max)`` used to scale one radar axis to [0, 1].

    Missing bounds default to 0 and to 1 or 100 depending on whether any
    observed score exceeds 1.
    """
    min_s = meta.get("min_score")
    max_s = meta.get("max_score")
    if min_s is None:
        min_s = 0
    if max_s is None:
        max_s = 100 if observed_max > 1 else 1
    # NOTE(review): applied unconditionally so an observed score above the
    # declared maximum widens the axis; confirm this was not meant to apply
    # only when max_score is missing.
    max_s = max(max_s, observed_max)
    return min_s, max_s


def create_radar_plot(selected_models, all_results):
    """Build a radar chart comparing models on every metric they report.

    Each axis is one "leaderboard: metric" pair. Scores are scaled to
    [0, 1] using the eval metadata bounds where available and clamped to that
    range.

    Args:
        selected_models: Model names to plot, one trace per model.
        all_results: Mapping model name -> leaderboard name -> dict with a
            ``results`` mapping (metric name -> score or ``None``).

    Returns:
        A ``plotly.graph_objects.Figure``, or ``None`` when there is no
        selection or no non-missing score to plot.
    """
    if not selected_models or not all_results:
        return None

    # (leaderboard, metric) -> {model: score}; tuple keys avoid re-parsing a
    # "leaderboard: metric" label whose leaderboard name itself contains ": ".
    metric_data = {}
    leaderboards_involved = set()
    for model in selected_models:
        if model not in all_results:
            continue
        for lb_name, lb_data in all_results[model].items():
            leaderboards_involved.add(lb_name)
            for metric, score in lb_data.get("results", {}).items():
                if score is None:
                    continue
                metric_data.setdefault((lb_name, metric), {})[model] = score

    if not metric_data:
        return None

    # `or {}` tolerates get_eval_metadata returning None for a leaderboard.
    meta_cache = {lb: get_eval_metadata(lb) or {} for lb in leaderboards_involved}

    # Axis order follows the displayed label; bounds are the same for every
    # model, so they are computed once per axis.
    axes = sorted(
        ((f"{lb_name}: {metric}", (lb_name, metric)) for lb_name, metric in metric_data),
        key=lambda axis: axis[0],
    )
    bounds = {}
    for _, key in axes:
        lb_name, metric_name = key
        meta = (meta_cache[lb_name].get("evals") or {}).get(metric_name) or {}
        bounds[key] = _normalization_bounds(meta, max(metric_data[key].values()))

    fig = go.Figure()
    for model in selected_models:
        r_values = []
        theta_values = []
        hover_texts = []
        for label, key in axes:
            val = metric_data[key].get(model)
            theta_values.append(label)
            if val is None:
                r_values.append(None)
                hover_texts.append(f"{label}<br>N/A")
                continue

            min_s, max_s = bounds[key]
            if max_s == min_s:
                norm_val = 1.0
            else:
                norm_val = (val - min_s) / (max_s - min_s)
            norm_val = max(0.0, min(1.0, norm_val))
            r_values.append(norm_val)
            hover_texts.append(f"{label}<br>Score: {val:.2f} (Norm: {norm_val:.2f})")

        if r_values:
            # Repeat the first point so the polygon is drawn closed.
            r_values.append(r_values[0])
            theta_values.append(theta_values[0])
            hover_texts.append(hover_texts[0])

        fig.add_trace(go.Scatterpolar(
            r=r_values,
            theta=theta_values,
            name=model,
            hovertext=hover_texts,
            hoverinfo="text",
            fill='toself',
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1],
            )
        ),
        showlegend=True,
        margin=dict(l=80, r=80, t=20, b=20),
        title="Model Comparison Radar (Normalized Scores)",
    )
    return fig