""" UI Components: Themes, CSS, and HTML formatters for the Gradio interface. Nord color theme with balanced contrast. """ import gradio as gr def get_theme(): """Returns the Nord-themed Gradio theme, locked to dark mode.""" return gr.themes.Base( primary_hue="blue", neutral_hue="slate", font=[gr.themes.GoogleFont("DM Sans"), "system-ui", "sans-serif"], font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"], ).set( body_background_fill="#2E3440", body_background_fill_dark="#2E3440", body_text_color="#ECEFF4", body_text_color_dark="#ECEFF4", body_text_color_subdued="#4C566A", body_text_color_subdued_dark="#4C566A", block_background_fill="#3B4252", block_background_fill_dark="#3B4252", block_border_width="1px", block_border_color="#434C5E", block_border_color_dark="#434C5E", block_label_text_color="#D8DEE9", block_label_text_color_dark="#D8DEE9", block_title_text_color="#ECEFF4", block_title_text_color_dark="#ECEFF4", input_background_fill="#2E3440", input_background_fill_dark="#2E3440", input_border_color="#4C566A", input_border_color_dark="#4C566A", button_primary_background_fill="#88C0D0", button_primary_background_fill_dark="#88C0D0", button_primary_text_color="#2E3440", button_primary_text_color_dark="#2E3440", button_secondary_background_fill="#434C5E", button_secondary_background_fill_dark="#434C5E", button_secondary_text_color="#ECEFF4", button_secondary_text_color_dark="#ECEFF4", ) def get_custom_css(): """Returns custom CSS with Nord colors.""" return """ /* === Nord Theme === Polar Night: #2E3440 (bg), #3B4252 (surface), #434C5E, #4C566A Snow Storm: #D8DEE9, #E5E9F0, #ECEFF4 Frost: #8FBCBB, #88C0D0, #81A1C1, #5E81AC Aurora: #BF616A, #D08770, #EBCB8B, #A3BE8C, #B48EAD */ /* Lock the UI to dark Nord regardless of OS preference */ :root { color-scheme: dark; background-color: #2E3440; } body { background: #2E3440 !important; color: #ECEFF4 !important; } /* === Base === */ .gradio-container { max-width: 100% !important; margin: 0 !important; padding: 1.25rem 2.5rem 2rem !important; background: #2E3440 !important; color: #ECEFF4 !important; font-family: 'DM Sans', system-ui, sans-serif !important; font-size: 16px !important; } /* === Header === */ .app-header { display: flex; align-items: center; gap: 1rem; margin-bottom: 1.5rem; padding: 1.25rem 1.5rem; background: #3B4252; border: 1px solid #434C5E; border-radius: 12px; } .app-header .logo-mark { width: 48px; height: 48px; background: linear-gradient(135deg, #88C0D0 0%, #81A1C1 100%); border-radius: 12px; display: flex; align-items: center; justify-content: center; font-weight: 800; font-size: 1.1rem; color: #2E3440; } .app-header .brand { display: flex; flex-direction: column; gap: 0.125rem; } .app-header h1 { margin: 0; font-size: 1.5rem; font-weight: 700; color: #ECEFF4; letter-spacing: -0.02em; } .app-header .tagline { color: #D8DEE9; font-size: 0.85rem; } .app-header .header-right { margin-left: auto; display: flex; align-items: center; gap: 0.75rem; } .app-header .version-badge { background: rgba(136, 192, 208, 0.2); border: 1px solid rgba(136, 192, 208, 0.4); border-radius: 6px; padding: 0.25rem 0.625rem; font-size: 0.7rem; font-family: 'JetBrains Mono', monospace; color: #88C0D0; } /* === Tabs === */ .tabs { border: none !important; background: transparent !important; } .tab-nav { background: #3B4252 !important; border: 1px solid #434C5E !important; border-radius: 10px !important; padding: 0.25rem !important; gap: 0.25rem !important; margin-bottom: 1.25rem !important; display: inline-flex !important; } .tab-nav button { background: transparent !important; border: none !important; color: #D8DEE9 !important; padding: 0.75rem 1.5rem !important; font-size: 0.95rem !important; font-weight: 500 !important; border-radius: 8px !important; transition: all 0.15s ease !important; } .tab-nav button.selected { color: #2E3440 !important; background: #88C0D0 !important; } .tab-nav button:hover:not(.selected) { background: #434C5E !important; color: #ECEFF4 !important; } .tabitem { background: transparent !important; border: none !important; padding: 0 !important; } /* === Controls bar === */ .controls-bar { background: #3B4252 !important; border: 1px solid #434C5E !important; border-radius: 10px !important; padding: 0.75rem 1.25rem !important; margin-bottom: 1rem !important; gap: 0.75rem !important; } .controls-bar label { font-size: 0.75rem !important; text-transform: uppercase !important; letter-spacing: 0.04em !important; color: #D8DEE9 !important; font-weight: 500 !important; } /* === Info banner === */ .info-banner { background: #3B4252 !important; border: 1px solid #434C5E !important; border-left: 3px solid #88C0D0 !important; border-radius: 0 10px 10px 0 !important; padding: 0.75rem 1rem !important; margin-bottom: 1rem !important; } .info-banner h3 { margin: 0; font-size: 1.1rem; font-weight: 600; color: #ECEFF4; } .info-banner .eval-tags { display: flex; flex-wrap: wrap; gap: 0.375rem; } .info-banner .eval-tag { background: rgba(143, 188, 187, 0.15); border: 1px solid rgba(143, 188, 187, 0.3); border-radius: 4px; padding: 0.3rem 0.6rem; font-size: 0.8rem; font-family: 'JetBrains Mono', monospace; color: #8FBCBB; } /* === Dataframe - seamless styling === */ .dataframe, .dataframe > div, .dataframe > div > div, .dataframe .table-wrap, .dataframe .svelte-1gfkn6j { background: #2E3440 !important; border: none !important; box-shadow: none !important; border-radius: 0 !important; } .dataframe table { width: 100% !important; border-collapse: collapse !important; font-size: 0.95rem !important; table-layout: auto !important; background: #2E3440 !important; } .dataframe thead, .dataframe thead tr { background: #2E3440 !important; position: sticky; top: 0; z-index: 10; } .dataframe thead th { padding: 0.875rem 1rem !important; font-weight: 600 !important; font-size: 0.75rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; color: #81A1C1 !important; border-bottom: 1px solid #434C5E !important; border-top: none !important; text-align: left !important; background: #2E3440 !important; } .dataframe tbody, .dataframe tbody tr { background: #2E3440 !important; } .dataframe tbody tr { border-bottom: 1px solid #3B4252 !important; } .dataframe tbody tr:hover { background: rgba(136, 192, 208, 0.04) !important; } .dataframe tbody td { padding: 0.75rem 1rem !important; color: #E5E9F0 !important; background: #2E3440 !important; overflow: hidden !important; text-overflow: ellipsis !important; border: none !important; } /* === Pagination bar === */ .pagination-bar { margin-top: 1rem !important; padding: 1rem 0 !important; border-top: 1px solid #3B4252 !important; display: flex !important; justify-content: center !important; align-items: center !important; gap: 1rem !important; } .page-info { font-family: 'JetBrains Mono', monospace !important; font-size: 1rem !important; color: #D8DEE9 !important; min-width: 80px !important; text-align: center !important; } /* Model name - white, readable */ .dataframe tbody td:first-child { font-weight: 500 !important; color: #ECEFF4 !important; white-space: nowrap !important; } /* All other columns - use monospace for numbers */ .dataframe tbody td:not(:first-child) { font-family: 'JetBrains Mono', monospace !important; color: #8FBCBB !important; text-align: left !important; } .dataframe tbody td:nth-child(2) { color: #88C0D0 !important; white-space: nowrap !important; } .dataframe tbody td:nth-child(3) { color: #D08770 !important; } .dataframe tbody td:nth-child(4) { font-weight: 600 !important; color: #A3BE8C !important; } .dataframe tbody td:nth-child(n+5) { white-space: nowrap !important; } /* === Status text === */ .status-text { font-size: 0.9rem !important; color: #D8DEE9 !important; padding: 0.5rem 0 !important; font-family: 'JetBrains Mono', monospace !important; } /* === Model Card === */ .model-card-container { display: flex; flex-direction: column; gap: 1.25rem; } .model-card-header { background: #3B4252; border: 1px solid #434C5E; border-radius: 12px; padding: 1.5rem 2rem; } .model-card-header h2 { margin: 0 0 0.5rem 0; font-size: 1.5rem; font-weight: 600; color: #ECEFF4; } .model-card-header .model-meta { display: flex; gap: 1.5rem; color: #D8DEE9; font-size: 0.95rem; } .model-card-header .model-meta strong { color: #8FBCBB; } .leaderboard-section { background: #3B4252; border: 1px solid #434C5E; border-radius: 10px; overflow: hidden; } .leaderboard-section-header { background: #434C5E; padding: 1rem 1.25rem; border-bottom: 1px solid #4C566A; display: flex; justify-content: space-between; align-items: center; } .leaderboard-section-header h3 { margin: 0; font-size: 1rem; font-weight: 600; color: #88C0D0; } .leaderboard-section-header .lb-avg { background: rgba(163, 190, 140, 0.15); border: 1px solid rgba(163, 190, 140, 0.3); border-radius: 8px; padding: 0.5rem 1rem; font-size: 0.85rem; color: #D8DEE9; } .leaderboard-section-header .lb-avg strong { color: #A3BE8C; font-family: 'JetBrains Mono', monospace; font-size: 1.1rem; font-weight: 700; } .scores-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 1px; background: #434C5E; } .score-item { background: #3B4252; padding: 1rem 1.25rem; } .score-item .score-label { font-size: 0.8rem; text-transform: uppercase; letter-spacing: 0.05em; color: #D8DEE9; margin-bottom: 0.375rem; } .score-item .score-value { font-size: 1.5rem; font-weight: 600; font-family: 'JetBrains Mono', monospace; color: #A3BE8C; } .score-item.highlight .score-value { color: #88C0D0; } .no-results { text-align: center; padding: 3rem 1rem; color: #D8DEE9; } .no-results h3 { color: #ECEFF4; margin-bottom: 0.5rem; } /* === New Comparison View === */ .comparison-container { display: flex; flex-direction: column; gap: 1.5rem; } .comparison-summary { background: #3B4252; border: 1px solid #434C5E; border-radius: 12px; padding: 1.5rem; } .comparison-summary h2 { margin: 0 0 1rem 0; color: #ECEFF4; font-size: 1.25rem; } .summary-cards { display: flex; gap: 1rem; flex-wrap: wrap; } .summary-card { flex: 1; min-width: 200px; background: #2E3440; border-radius: 8px; padding: 1rem; } .summary-card-header { display: flex; align-items: center; gap: 0.5rem; margin-bottom: 0.75rem; } .model-dot { width: 10px; height: 10px; border-radius: 50%; } .model-name { font-weight: 600; color: #ECEFF4; font-size: 0.9rem; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } .summary-card-body { display: flex; flex-direction: column; gap: 0.5rem; } .summary-stat { display: flex; justify-content: space-between; align-items: center; } .summary-stat .stat-label { font-size: 0.75rem; color: #D8DEE9; text-transform: uppercase; letter-spacing: 0.05em; } .summary-stat .stat-value { font-family: 'JetBrains Mono', monospace; color: #8FBCBB; } .summary-stat.primary .stat-value.large { font-size: 1.5rem; font-weight: 700; color: #A3BE8C; } .leaderboard-comparison-card { background: #3B4252; border: 1px solid #434C5E; border-radius: 12px; overflow: hidden; } .lb-card-header { background: #434C5E; padding: 0.875rem 1.25rem; } .lb-card-header h3 { margin: 0; color: #88C0D0; font-size: 1rem; font-weight: 600; } .lb-card-body { padding: 1rem 1.25rem; display: flex; flex-direction: column; gap: 0.75rem; } .metric-comparison { display: flex; flex-direction: column; gap: 0.375rem; } .metric-name-row { margin-bottom: 0.25rem; } .metric-title { font-size: 0.85rem; font-weight: 600; color: #ECEFF4; } .metric-title.sub { font-size: 0.75rem; font-weight: 500; color: #D8DEE9; } .model-score-row { display: flex; align-items: center; gap: 0.5rem; padding: 0.375rem 0; } .model-score-row.compact { padding: 0.25rem 0; } .model-score-row.best-score { background: rgba(163, 190, 140, 0.1); border-radius: 4px; padding-left: 0.5rem; margin-left: -0.5rem; } .model-score-row.no-data { opacity: 0.5; } .model-indicator { width: 8px; height: 8px; border-radius: 2px; flex-shrink: 0; } .model-indicator.small { width: 6px; height: 6px; } .score-bar-container { flex: 1; display: flex; align-items: center; gap: 0.75rem; height: 24px; background: #2E3440; border-radius: 4px; padding: 0 0.5rem; position: relative; } .score-bar { position: absolute; left: 0; top: 0; bottom: 0; border-radius: 4px; opacity: 0.3; } .score-bar.thin { opacity: 0.2; } .score-value { position: relative; font-family: 'JetBrains Mono', monospace; font-size: 0.9rem; font-weight: 600; color: #ECEFF4; z-index: 1; } .score-value.small { font-size: 0.8rem; font-weight: 500; } .score-value.dim { color: #4C566A; } /* === Selected Models Chips === */ .selected-models-group label { display: inline-flex !important; align-items: center !important; background: #434C5E; border: 1px solid #4C566A; border-radius: 16px; padding: 0.35rem 0.85rem; font-size: 0.85rem; color: #ECEFF4; gap: 0.4rem; cursor: pointer; margin: 0.15rem 0.3rem 0.15rem 0 !important; } .selected-models-group label span::before { content: "×"; font-size: 0.75rem; color: #EBCB8B; opacity: 0; transition: opacity 0.15s ease; } .selected-models-group label:hover span::before { opacity: 1; } .selected-models-group input[type="checkbox"] { display: none; } /* === Heat Map Table === */ .heatmap-table-wrapper { overflow-x: auto; margin-top: 1rem; } .heatmap-table { width: 100%; border-collapse: collapse; font-size: 0.85rem; } .heatmap-table thead { position: sticky; top: 0; z-index: 10; } .heatmap-table th { background: #434C5E; padding: 0.625rem 0.75rem; font-weight: 600; font-size: 0.7rem; text-transform: uppercase; letter-spacing: 0.05em; color: #81A1C1; text-align: left; border-bottom: 2px solid #4C566A; white-space: nowrap; } .heatmap-table th.metric-header { min-width: 120px; } .heatmap-table th.model-header { text-align: center; max-width: 150px; overflow: hidden; text-overflow: ellipsis; } .heatmap-table td { padding: 0.5rem 0.75rem; border-bottom: 1px solid #3B4252; } .heatmap-table td.metric-name { font-weight: 500; color: #D8DEE9; background: #2E3440; } .heatmap-table td.score-cell { text-align: center; font-family: 'JetBrains Mono', monospace; font-weight: 500; transition: all 0.15s ease; } .heatmap-table td.score-cell.best { background: rgba(163, 190, 140, 0.25); color: #A3BE8C; font-weight: 700; } .heatmap-table td.score-cell.good { background: rgba(163, 190, 140, 0.12); color: #A3BE8C; } .heatmap-table td.score-cell.mid { background: rgba(235, 203, 139, 0.12); color: #EBCB8B; } .heatmap-table td.score-cell.low { background: rgba(208, 135, 112, 0.12); color: #D08770; } .heatmap-table td.score-cell.worst { background: rgba(191, 97, 106, 0.15); color: #BF616A; } .heatmap-table td.score-cell.na { color: #4C566A; font-style: italic; } .heatmap-table tr.avg-row { background: rgba(136, 192, 208, 0.08); } .heatmap-table tr.avg-row td.metric-name { font-weight: 700; color: #88C0D0; background: rgba(136, 192, 208, 0.08); } /* === Buttons === */ button { border-radius: 8px !important; font-weight: 500 !important; font-size: 0.95rem !important; transition: all 0.15s ease !important; } button.primary { background: #88C0D0 !important; color: #2E3440 !important; border: none !important; } button.primary:hover:not(:disabled) { background: #8FBCBB !important; } button.secondary, button[variant="secondary"] { background: #434C5E !important; color: #ECEFF4 !important; border: 1px solid #4C566A !important; } button.secondary:hover:not(:disabled), button[variant="secondary"]:hover:not(:disabled) { background: #4C566A !important; } button:disabled { opacity: 0.35 !important; } /* === Inputs === */ input[type="text"], select { background: #2E3440 !important; border: 1px solid #4C566A !important; border-radius: 8px !important; color: #ECEFF4 !important; font-size: 1rem !important; } input[type="text"]:focus, select:focus { border-color: #88C0D0 !important; box-shadow: 0 0 0 3px rgba(136, 192, 208, 0.15) !important; outline: none !important; } input::placeholder { color: #4C566A !important; } /* === Accordion === */ .accordion { background: #3B4252 !important; border: 1px solid #434C5E !important; border-radius: 10px !important; margin-top: 1.5rem !important; } .accordion > .label-wrap { background: transparent !important; padding: 1rem 1.25rem !important; color: #D8DEE9 !important; font-size: 0.95rem !important; } .accordion > .wrap { padding: 0.5rem 1.25rem 1.25rem !important; color: #D8DEE9 !important; font-size: 0.95rem !important; line-height: 1.6 !important; } .accordion code { background: #434C5E !important; padding: 0.125rem 0.375rem !important; border-radius: 4px !important; font-family: 'JetBrains Mono', monospace !important; font-size: 0.8rem !important; color: #8FBCBB !important; } /* === Metrics section === */ .metrics-section { margin-top: 1.5rem; padding-top: 1.5rem; border-top: 1px solid #434C5E; } .metrics-section h3 { font-size: 0.85rem; font-weight: 600; color: #D8DEE9; margin: 0 0 1rem 0; text-transform: uppercase; letter-spacing: 0.05em; } .metrics-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(300px, 1fr)); gap: 0.75rem; } .metric-card { background: #3B4252; border: 1px solid #434C5E; border-radius: 8px; overflow: hidden; } .metric-card-header { display: flex; justify-content: space-between; align-items: center; padding: 0.75rem 1rem; cursor: pointer; list-style: none; } .metric-card-header::-webkit-details-marker { display: none; } .metric-card-name { font-weight: 500; font-size: 0.95rem; color: #ECEFF4; } .metric-card-direction { font-size: 0.8rem; color: #D8DEE9; } .metric-card-direction .arrow { color: #A3BE8C; font-weight: 600; } .metric-card-body { padding: 0.875rem 1.25rem; border-top: 1px solid #434C5E; font-size: 0.9rem; color: #D8DEE9; line-height: 1.5; } .metric-type-badge { font-size: 0.65rem; text-transform: uppercase; letter-spacing: 0.05em; padding: 0.15rem 0.4rem; background: rgba(180, 142, 173, 0.2); border: 1px solid rgba(180, 142, 173, 0.35); border-radius: 4px; color: #B48EAD; font-family: 'JetBrains Mono', monospace; } /* === Scrollbar === */ ::-webkit-scrollbar { width: 8px; height: 8px; } ::-webkit-scrollbar-track { background: #2E3440; } ::-webkit-scrollbar-thumb { background: #4C566A; border-radius: 4px; } ::-webkit-scrollbar-thumb:hover { background: #5E81AC; } /* === Responsive === */ @media (max-width: 768px) { .gradio-container { padding: 1rem !important; } .scores-grid { grid-template-columns: repeat(2, 1fr); } } /* === Overrides === */ .gradio-container footer { display: none !important; } .block { background: #3B4252 !important; } .gradio-radio label { background: #434C5E !important; border: 1px solid #4C566A !important; color: #ECEFF4 !important; border-radius: 8px !important; font-size: 0.85rem !important; } .gradio-radio label.selected { background: #88C0D0 !important; border-color: #88C0D0 !important; color: #2E3440 !important; } """ def format_leaderboard_header(selected_leaderboard, metadata): """Formats the leaderboard header info section.""" if not selected_leaderboard: return """
Select a leaderboard to explore
""" if not metadata or not metadata.get("evals"): return f"""

{selected_leaderboard}

""" source_info = metadata.get("source_info", {}) org = source_info.get("organization", "Unknown") url = source_info.get("url", "#") eval_names = list(metadata["evals"].keys()) eval_tags = "".join([f'{name}' for name in eval_names]) return f"""

{selected_leaderboard}

by {org}
{eval_tags}
Source →
""" def format_metric_details(selected_leaderboard, metadata): """Formats metric detail cards.""" if not selected_leaderboard or not metadata or not metadata.get("evals"): return "" evals = metadata.get("evals", {}) html = """

Metric Reference

""" for eval_name, info in evals.items(): score_type = info['score_type'].upper() if info.get('score_type') else "—" direction = "Lower is better" if info.get('lower_is_better') else "Higher is better" arrow = "↓" if info.get('lower_is_better') else "↑" details = "" if info.get('score_type') == "continuous" and info.get('min_score') is not None: details = f"Range: [{info['min_score']} – {info['max_score']}]" elif info.get('score_type') == "levels" and info.get('level_names'): details = f"Levels: {', '.join(str(l) for l in info['level_names'])}" html += f"""
{eval_name} {arrow} {direction}
{info.get('description', 'No description')}
{details} {score_type}
""" html += "
" return html def format_model_card(model_name, model_data): """Formats a model card showing all evals across leaderboards.""" if not model_data: return """

No results found

Try searching for a different model name

""" first = list(model_data.values())[0] developer = first.get("developer", "Unknown") params = first.get("params") arch = first.get("architecture", "Unknown") params_str = f"{params}B" if params else "—" html = f"""

{model_name}

Developer: {developer} Parameters: {params_str} Architecture: {arch}
""" for leaderboard_name, data in model_data.items(): results = data.get("results", {}) if not results: continue scores = [v for v in results.values() if v is not None] avg = sum(scores) / len(scores) if scores else None avg_str = f"{avg:.2f}" if avg else "—" html += f"""

{leaderboard_name}

Avg: {avg_str}
""" sorted_results = sorted(results.items(), key=lambda x: x[1] if x[1] is not None else 0, reverse=True) for i, (metric_name, score) in enumerate(sorted_results): score_display = f"{score:.2f}" if score is not None else "—" highlight_class = "highlight" if i == 0 else "" html += f"""
{metric_name}
{score_display}
""" html += "
" html += "
" return html def format_model_comparison(selected_models, all_results): """Formats a comparison view showing multiple models with visual indicators.""" if not selected_models or not all_results: return """

Select models to compare

Choose multiple models from the dropdown to see a side-by-side comparison

""" # Get all unique leaderboards across selected models all_leaderboards = set() model_data_dict = {} for model_name in selected_models: if model_name in all_results: model_data_dict[model_name] = all_results[model_name] for leaderboard_name in all_results[model_name].keys(): all_leaderboards.add(leaderboard_name) if not model_data_dict: return """

No data found for selected models

Try selecting different models

""" all_leaderboards = sorted(all_leaderboards) model_colors = ['#88C0D0', '#A3BE8C', '#EBCB8B', '#D08770', '#B48EAD', '#8FBCBB', '#81A1C1', '#BF616A'] # Calculate overall averages for summary overall_avgs = {} for model_name in selected_models: if model_name in model_data_dict: all_scores = [] for lb_data in model_data_dict[model_name].values(): all_scores.extend([v for v in lb_data.get("results", {}).values() if v is not None]) overall_avgs[model_name] = sum(all_scores) / len(all_scores) if all_scores else None html = """

Model Comparison

""" # Summary cards for each model for i, model_name in enumerate(selected_models): color = model_colors[i % len(model_colors)] avg = overall_avgs.get(model_name) avg_str = f"{avg:.2f}" if avg is not None else "—" # Get model info model_info = list(model_data_dict.get(model_name, {}).values()) developer = model_info[0].get("developer", "Unknown") if model_info else "Unknown" html += f"""
{model_name}
Developer {developer}
Overall Avg {avg_str}
""" html += """
""" # Leaderboard comparison cards for leaderboard_name in all_leaderboards: leaderboard_metrics = set() for model_data in model_data_dict.values(): if leaderboard_name in model_data: results = model_data[leaderboard_name].get("results", {}) leaderboard_metrics.update(results.keys()) leaderboard_metrics = sorted(leaderboard_metrics) if not leaderboard_metrics: continue # Calculate averages for ranking model_avgs = {} for model_name in selected_models: if model_name in model_data_dict and leaderboard_name in model_data_dict[model_name]: results = model_data_dict[model_name][leaderboard_name].get("results", {}) scores = [v for v in results.values() if v is not None] model_avgs[model_name] = sum(scores) / len(scores) if scores else None html += f"""

{leaderboard_name}

""" # Compact heat-map table html += '
' html += '' # Header with model names html += '' for i, model_name in enumerate(selected_models): # Truncate long names short_name = model_name if len(model_name) <= 20 else model_name[:18] + "…" html += f'' html += '' html += '' # Average row first html += '' valid_avgs_list = [model_avgs.get(m) for m in selected_models if model_avgs.get(m) is not None] max_avg_val = max(valid_avgs_list) if valid_avgs_list else None for model_name in selected_models: avg = model_avgs.get(model_name) if avg is not None: cell_class = "best" if avg == max_avg_val and len(valid_avgs_list) > 1 else "" html += f'' else: html += '' html += '' # Individual metric rows for metric_name in leaderboard_metrics: html += f'' # Get all scores for this metric metric_scores = {} for model_name in selected_models: if model_name in model_data_dict and leaderboard_name in model_data_dict[model_name]: results = model_data_dict[model_name][leaderboard_name].get("results", {}) metric_scores[model_name] = results.get(metric_name) valid_scores = [v for v in metric_scores.values() if v is not None] if valid_scores: max_score = max(valid_scores) min_score = min(valid_scores) score_range = max_score - min_score if max_score > min_score else 1 else: max_score = min_score = score_range = None for model_name in selected_models: score = metric_scores.get(model_name) if score is not None and score_range is not None: # Determine color class based on relative position if len(valid_scores) > 1: pct = (score - min_score) / score_range if score_range > 0 else 1 if score == max_score: cell_class = "best" elif pct >= 0.75: cell_class = "good" elif pct >= 0.5: cell_class = "mid" elif pct >= 0.25: cell_class = "low" else: cell_class = "worst" else: cell_class = "" html += f'' else: html += '' html += '' html += '
Metric{short_name}
Average{avg:.2f}
{metric_name}{score:.2f}
' html += """
""" html += "
" return html