Spaces:
Running
Running
| import gradio as gr | |
| import plotly.graph_objects as go | |
| from data_loader import get_eval_metadata | |
def get_theme():
    """Build the light Gradio theme used by the app.

    Starts from ``gr.themes.Base`` with a blue primary hue and a slate
    neutral hue, then overrides the body, block, input and button colours.

    Returns:
        The theme object returned by ``Base(...).set(...)``.
    """
    colour_overrides = {
        # Page
        "body_background_fill": "#f5f5f5",
        "body_text_color": "#0a0a0a",
        "body_text_color_subdued": "#525252",
        # Blocks
        "block_background_fill": "#ffffff",
        "block_border_color": "#e5e5e5",
        "block_label_text_color": "#525252",
        "block_title_text_color": "#0a0a0a",
        # Inputs
        "input_background_fill": "#ffffff",
        "input_border_color": "#e5e5e5",
        # Buttons
        "button_primary_background_fill": "#3b82f6",
        "button_primary_text_color": "#ffffff",
        "button_secondary_background_fill": "#ffffff",
        "button_secondary_text_color": "#0a0a0a",
        "button_secondary_border_color": "#e5e5e5",
    }
    base_theme = gr.themes.Base(primary_hue="blue", neutral_hue="slate")
    return base_theme.set(**colour_overrides)
def get_custom_css():
    """Return the custom stylesheet for the app as a single CSS string.

    The sheet defines brand colour variables, layout for the header, info
    banner, leaderboard header, metric cards and heat-map table, and a set
    of ``!important`` overrides for inputs, buttons, tabs, dropdowns,
    tables and dataframes.

    Several selectors (e.g. ``.heatmap-table``, ``.selected-models-group``,
    ``.gradio-container footer``) are declared twice: once as a base rule
    and again later with ``!important``. The later declarations win; the
    duplication is kept as-is.

    Returns:
        str: The CSS text.
    """
    return """
:root {
    --brand-black: #0a0a0a;
    --brand-dark: #1a1a1a;
    --brand-gray: #2a2a2a;
    --brand-light: #f5f5f5;
    --brand-accent: #3b82f6;
}
body, .gradio-container {
    background: var(--brand-light) !important;
    color: var(--brand-black) !important;
}
.gradio-container {
    max-width: 100%;
    padding: 1.25rem 2.5rem 2rem;
}
.gradio-container *:focus-visible {
    outline: none !important;
    box-shadow: inset 0 0 0 1.5px #3b82f6 !important;
}
.gradio-container .block,
.gradio-container .wrap,
.gradio-container .form,
.gradio-container .container {
    box-shadow: none !important;
}
/* Match pill styling */
.match-pills .wrap,
.match-pills .container {
    display: flex !important;
    flex-wrap: wrap !important;
    gap: 0.35rem !important;
}
.match-pills .wrap > div,
.match-pills .container > div {
    margin: 0 !important;
}
.match-pills input[type="checkbox"] {
    display: none;
}
.match-pills label {
    display: inline-flex;
    align-items: center;
    border: 1px solid #d6d9de;
    background: #f5f7fb;
    border-radius: 999px;
    padding: 0.28rem 0.75rem;
    font-weight: 500;
    color: #0a0a0a;
    transition: all 120ms ease;
    cursor: pointer;
}
.match-pills label:hover {
    border-color: #3b82f6;
    background: #eef4ff;
}
.match-pills input[type="checkbox"]:checked + label {
    border-color: #3b82f6;
    background: rgba(59, 130, 246, 0.12);
    color: #0a0a0a;
    font-weight: 600;
}
.app-header {
    display: flex;
    align-items: center;
    gap: 1rem;
    margin-bottom: 1.5rem;
    padding: 1rem 1.25rem;
    background: #ffffff;
    border: 1px solid #e5e5e5;
    border-radius: 12px;
}
.logo-mark {
    width: 48px;
    height: 48px;
    border-radius: 12px;
    display: flex;
    align-items: center;
    justify-content: center;
    font-weight: 800;
    font-size: 1.1rem;
    color: #ffffff;
}
.brand h1 { margin: 0; font-size: 1.5rem; font-weight: 700; color: #0a0a0a; }
.brand .tagline { color: #525252; font-size: 0.9rem; }
.header-right { margin-left: auto; }
.version-badge {
    background: rgba(59, 130, 246, 0.1);
    border: 1px solid #3b82f6;
    border-radius: 8px;
    padding: 0.35rem 0.6rem;
    font-size: 0.78rem;
    color: #3b82f6;
}
.info-banner {
    background: #ffffff;
    border: 1px solid #e5e5e5;
    border-left: 3px solid #3b82f6;
    border-radius: 10px;
    padding: 1rem 1.25rem;
    margin-bottom: 1rem;
}
.info-banner h3 { margin: 0; font-weight: 600; color: #0a0a0a; }
.leaderboard-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    gap: 1rem;
    flex-wrap: wrap;
    margin-bottom: 0.4rem;
}
.lb-title {
    font-size: 1.2rem;
    font-weight: 700;
    color: #0a0a0a;
    margin: 0;
    line-height: 1.35;
}
.lb-by {
    font-size: 0.9rem;
    color: #525252;
    margin: 0.1rem 0 0 0;
    line-height: 1.35;
}
.lb-meta {
    display: flex;
    flex-direction: column;
    gap: 0.1rem;
}
.eval-tags { display: flex; flex-wrap: wrap; gap: 0.4rem; }
.eval-tags { margin-top: 0.35rem; }
.eval-tag {
    border-radius: 10px;
    padding: 0.3rem 0.65rem;
    font-size: 0.82rem;
    font-weight: 600;
    color: #0a0a0a;
    border: 1px solid #e5e5e5;
    background: #f8fafc;
}
.eval-tag:nth-child(5n + 1) { border-color: #3b82f6; background: rgba(59, 130, 246, 0.12); color: #0a1d4a; }
.eval-tag:nth-child(5n + 2) { border-color: #10b981; background: rgba(16, 185, 129, 0.12); color: #0b3b2b; }
.eval-tag:nth-child(5n + 3) { border-color: #f97316; background: rgba(249, 115, 22, 0.12); color: #4b1f07; }
.eval-tag:nth-child(5n + 4) { border-color: #8b5cf6; background: rgba(139, 92, 246, 0.12); color: #2f0f5a; }
.eval-tag:nth-child(5n) { border-color: #06b6d4; background: rgba(6, 182, 212, 0.12); color: #053f46; }
.source-link {
    font-size: 0.75rem;
    color: #3b82f6;
    text-decoration: none;
    padding: 0.375rem 0.75rem;
    border: 1px solid #3b82f6;
    border-radius: 6px;
}
.source-link:hover { background: rgba(59, 130, 246, 0.1); }
.pagination-bar {
    margin-top: 0.75rem;
    padding: 0.85rem 0 0.25rem;
    display: flex;
    justify-content: center;
    align-items: center;
    gap: 0.85rem;
}
.page-info { font-size: 1rem; min-width: 80px; text-align: center; color: #0a0a0a; }
.metrics-section {
    margin-top: 1.25rem;
    padding: 1.25rem 1rem;
    border-top: 1px solid #e5e5e5;
}
.metrics-section h3 {
    font-size: 0.9rem;
    font-weight: 700;
    color: #525252;
    margin: 0 0 0.9rem 0;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}
.metrics-grid {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
    gap: 0.75rem;
}
@media (max-width: 768px) {
    .metrics-grid {
        grid-template-columns: repeat(auto-fill, minmax(160px, 1fr));
        gap: 0.5rem;
    }
    .metric-card-header {
        padding: 0.65rem 0.8rem;
        flex-direction: column;
        align-items: flex-start;
        gap: 0.25rem;
    }
    .metric-card-body {
        padding: 0.65rem 0.8rem;
        font-size: 0.85rem;
    }
    .metrics-section {
        padding: 1rem 0.5rem;
    }
}
.metrics-grid .metric-card {
    align-self: start;
}
.metric-card {
    background: #ffffff;
    border: 1px solid #e5e5e5;
    border-radius: 10px;
    overflow: hidden;
    position: relative;
}
.metric-card-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    padding: 0.85rem 1rem;
    cursor: pointer;
}
.metric-card-header:hover {
    background: #f9f9f9;
}
.metric-card-name { font-weight: 600; color: #0a0a0a; }
.metric-card-direction { font-size: 0.82rem; color: #525252; }
.metric-card-direction .arrow { color: #22c55e; font-weight: 700; }
.metric-card-body {
    display: none;
    padding: 0.85rem 1rem;
    border-top: 1px solid #e5e5e5;
    color: #0a0a0a;
}
.metric-card input.metric-toggle {
    display: none;
}
.metric-card input.metric-toggle:checked ~ .metric-card-body {
    display: block;
}
.metric-card input.metric-toggle:checked ~ .metric-card-header {
    background: #f9f9f9;
    border-bottom: 1px solid #e5e5e5;
}
.metric-card input.metric-toggle:checked ~ .metric-card-header .metric-card-name,
.metric-card input.metric-toggle:checked ~ .metric-card-header .metric-card-direction {
    color: #0a0a0a;
}
/* Ensure multiple cards can be open at once and are closable */
.metric-card input.metric-toggle:not(:checked) ~ .metric-card-body {
    display: none;
}
.metric-type-badge {
    font-size: 0.68rem;
    text-transform: uppercase;
    padding: 0.2rem 0.45rem;
    background: rgba(59, 130, 246, 0.1);
    border: 1px solid #3b82f6;
    border-radius: 6px;
    color: #3b82f6;
}
.heatmap-table { width: 100%; border-collapse: collapse; font-size: 0.85rem; }
.heatmap-table th { padding: 0.55rem 0.65rem; font-weight: 700; font-size: 0.72rem; text-transform: uppercase; color: #525252; background: #f5f5f5; }
.heatmap-table td { padding: 0.45rem 0.65rem; text-align: center; border-bottom: 1px solid #e5e5e5; }
.heatmap-table td.metric-name { text-align: left; font-weight: 600; color: #0a0a0a; }
.heatmap-table td.score-cell { font-weight: 600; }
.heatmap-table td.score-cell.best { background: rgba(34, 197, 94, 0.15); color: #16a34a; }
.heatmap-table td.score-cell.good { background: rgba(34, 197, 94, 0.08); color: #16a34a; }
.heatmap-table td.score-cell.mid { background: rgba(234, 179, 8, 0.15); color: #ca8a04; }
.heatmap-table td.score-cell.low { background: rgba(239, 68, 68, 0.12); color: #dc2626; }
.heatmap-table td.score-cell.worst { background: rgba(239, 68, 68, 0.18); color: #b91c1c; }
.heatmap-table td.score-cell.na { color: #525252; font-style: italic; }
/* Model chips */
.selected-models-group label {
    display: inline-flex !important;
    background: #ffffff;
    border: 1px solid #e5e5e5;
    border-radius: 16px;
    padding: 0.35rem 0.85rem;
    font-size: 0.88rem;
    color: #0a0a0a;
    cursor: pointer;
    margin: 0.18rem 0.32rem 0.18rem 0 !important;
}
.selected-models-group input[type="checkbox"] { display: none; }
.no-results { text-align: center; padding: 2.5rem 1rem; color: #525252; }
.gradio-container footer { display: none; }
.block, .form, .wrap, .container { background: #ffffff !important; }
body, .gradio-container, p, span, div, h1, h2, h3, h4, h5, h6, label, td, th {
    color: #0a0a0a !important;
}
.label-wrap span, .prose, .markdown, .prose p, .prose li, .markdown p, .markdown li {
    color: #525252 !important;
}
input, textarea, select {
    background: #ffffff !important;
    color: #0a0a0a !important;
    border: 1px solid #e5e5e5 !important;
    border-radius: 8px !important;
}
input::placeholder, textarea::placeholder {
    color: #a1a1a1 !important;
}
input:focus, textarea:focus, select:focus {
    border-color: #3b82f6 !important;
    outline: none !important;
    box-shadow: inset 0 0 0 1.5px #3b82f6 !important;
}
select, .wrap select, .wrap input, input[type="text"], textarea {
    min-height: 44px !important;
    padding: 0.55rem 0.75rem !important;
    font-size: 0.96rem !important;
}
button {
    border-radius: 8px !important;
    font-weight: 500 !important;
    transition: all 0.15s ease !important;
}
button.primary, button[variant="primary"] {
    background: #3b82f6 !important;
    color: #ffffff !important;
    border: none !important;
}
button.primary:hover, button[variant="primary"]:hover {
    background: #2563eb !important;
}
button.secondary, button[variant="secondary"], button:not(.primary):not([variant="primary"]) {
    background: #ffffff !important;
    color: #0a0a0a !important;
    border: 1px solid #e5e5e5 !important;
}
button.secondary:hover, button[variant="secondary"]:hover {
    border-color: #3b82f6 !important;
    background: #f5f5f5 !important;
}
.tab-nav, .tabs {
    border-bottom: 1px solid #e5e5e5 !important;
}
.tab-nav button, .tabs button {
    color: #525252 !important;
    background: transparent !important;
    border: none !important;
    border-bottom: 2px solid transparent !important;
}
.tab-nav button.selected, .tabs button.selected {
    color: #3b82f6 !important;
    border-bottom-color: #3b82f6 !important;
}
.wrap, .secondary-wrap, .primary-wrap {
    background: transparent !important;
    border: none !important;
    border-radius: 0 !important;
    box-shadow: none !important;
    padding: 0 !important;
}
ul[role="listbox"], .dropdown, .options {
    background: #ffffff !important;
    border: 1px solid #e5e5e5 !important;
    border-radius: 8px !important;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
}
ul[role="listbox"] li, .dropdown li, .options li {
    color: #0a0a0a !important;
}
ul[role="listbox"] li:hover, .dropdown li:hover, .options li:hover {
    background: #f5f5f5 !important;
}
ul[role="listbox"] li.active, .dropdown li.active, .options li.active {
    background: #f5f5f5 !important;
    color: #0a0a0a !important;
}
ul[role="listbox"] li.selected, .dropdown li.selected {
    background: rgba(59, 130, 246, 0.1) !important;
    color: #3b82f6 !important;
}
.accordion {
    border: 1px solid #e5e5e5 !important;
    border-radius: 8px !important;
    background: #ffffff !important;
}
.accordion > button {
    color: #0a0a0a !important;
}
.selected-models-group label, .checkbox-group label {
    display: inline-flex !important;
    background: #ffffff;
    border: 1px solid #e5e5e5;
    border-radius: 20px !important;
    padding: 0.4rem 0.9rem !important;
    font-size: 0.88rem !important;
    color: #0a0a0a !important;
    cursor: pointer !important;
    margin: 0.2rem !important;
    transition: all 0.15s ease !important;
}
.selected-models-group label:hover, .checkbox-group label:hover {
    border-color: #3b82f6 !important;
    background: #f5f5f5 !important;
}
.selected-models-group input[type="checkbox"], .checkbox-group input[type="checkbox"] {
    display: none !important;
}
table {
    width: 100% !important;
    border-collapse: collapse !important;
    background: #ffffff !important;
}
table th {
    background: #f5f5f5 !important;
    color: #525252 !important;
    font-weight: 600 !important;
    text-transform: uppercase !important;
    font-size: 0.75rem !important;
    padding: 0.75rem !important;
    border-bottom: 1px solid #e5e5e5 !important;
    text-align: left !important;
}
table td {
    padding: 0.65rem 0.75rem !important;
    border-bottom: 1px solid #e5e5e5 !important;
    color: #0a0a0a !important;
}
table tr:hover td {
    background: #f9f9f9 !important;
}
.dataframe {
    background: #ffffff !important;
    border: 1px solid #e5e5e5 !important;
    box-shadow: none !important;
    border-radius: 8px !important;
    overflow: hidden !important;
}
.dataframe table {
    width: 100% !important;
    border-collapse: collapse !important;
    font-size: 0.75rem !important;
    table-layout: auto !important;
    background: #ffffff !important;
}
.dataframe thead,
.dataframe thead tr {
    background: #ffffff !important;
    position: sticky !important;
    top: 0 !important;
    z-index: 10 !important;
}
.dataframe thead th {
    padding: 0.875rem 1rem !important;
    font-weight: 700 !important;
    font-size: 0.75rem !important;
    text-transform: uppercase !important;
    letter-spacing: 0.05em !important;
    color: #0a0a0a !important;
    border-bottom: 2px solid #e5e5e5 !important;
    border-top: none !important;
    text-align: left !important;
    background: #ffffff !important;
    white-space: nowrap !important;
    border-radius: 0 !important;
}
.dataframe thead th span,
.dataframe thead th div,
.dataframe thead th button {
    background: transparent !important;
    border: none !important;
    border-radius: 0 !important;
    box-shadow: none !important;
    margin: 0 !important;
    outline: none !important;
}
.dataframe thead th span[role="button"],
.dataframe thead th span[class*="svelte"] {
    background: transparent !important;
    border: none !important;
    box-shadow: none !important;
    outline: none !important;
    padding: 0 !important;
    width: auto !important;
}
/* Also target the SVG icon if it exists to ensure it doesn't have a background */
.dataframe thead th svg {
    background: transparent !important;
    box-shadow: none !important;
}
.dataframe thead th span:hover,
.dataframe thead th span[role="button"]:hover,
.dataframe thead th span[class*="svelte"]:hover,
.dataframe thead th button:hover {
    background: transparent !important;
    border: none !important;
    box-shadow: none !important;
    color: #3b82f6 !important;
}
.token {
    background-color: rgba(59, 130, 246, 0.12) !important;
    border: 1px solid rgba(59, 130, 246, 0.3) !important;
    color: #1e3a8a !important;
    border-radius: 6px !important;
    padding: 2px 8px !important;
    gap: 4px !important;
}
.token-remove {
    background-color: rgba(255, 255, 255, 0.4) !important;
    border: 1px solid rgba(30, 58, 138, 0.5) !important; /* Dark blue outline */
    color: #1e3a8a !important;
    border-radius: 4px !important;
    margin-left: 6px !important;
    padding: 1px !important;
    opacity: 0.9 !important;
    min-width: 18px !important;
    min-height: 18px !important;
    display: flex !important;
    align-items: center !important;
    justify-content: center !important;
}
.token-remove svg {
    width: 12px !important;
    height: 12px !important;
}
.token-remove:hover {
    background-color: #1e3a8a !important;
    color: #ffffff !important;
    border-color: #1e3a8a !important;
}
.selector-item {
    border-radius: 6px !important;
}
.gradio-container .token {
    box-shadow: none !important;
    font-weight: 500 !important;
}
.gradio-container .token span {
    color: #1e3a8a !important;
}
.dataframe tbody,
.dataframe tbody tr {
    background: #ffffff !important;
}
.dataframe tbody tr {
    border-bottom: 1px solid #e5e5e5 !important;
}
.dataframe tbody tr:hover {
    background: #f9f9f9 !important;
}
.dataframe tbody td {
    padding: 0.75rem 1rem !important;
    color: #0a0a0a !important;
    background: #ffffff !important;
    border: none !important;
    border-bottom: 1px solid #e5e5e5 !important;
}
.dataframe tbody td:first-child {
    font-weight: 700 !important;
    color: #0a0a0a !important;
    white-space: normal !important;
    word-break: break-word !important;
    max-width: 400px;
    min-width: 250px;
}
.dataframe tbody td:not(:first-child) {
    font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
    text-align: left !important;
    white-space: nowrap !important;
    min-width: 80px !important;
}
.dataframe td:nth-child(2),
.dataframe th:nth-child(2) {
    max-width: 220px;
    min-width: 140px;
}
.column-selector-dropdown {
    min-width: 300px;
}
.column-selector-dropdown .wrap {
    flex-wrap: nowrap !important;
    overflow-x: auto !important;
    gap: 0.25rem !important;
    padding: 0.5rem !important;
}
.column-selector-dropdown .wrap input {
    width: 100% !important;
    padding-left: 0.5rem !important;
    border: none !important;
    box-shadow: none !important;
}
.heatmap-table {
    border: 1px solid #e5e5e5 !important;
    border-radius: 8px !important;
    overflow: hidden !important;
}
.heatmap-table th {
    background: #f5f5f5 !important;
    color: #525252 !important;
    padding: 0.6rem 0.75rem !important;
    font-size: 0.72rem !important;
    border-bottom: 2px solid #e5e5e5 !important;
}
.heatmap-table td {
    padding: 0.5rem 0.75rem !important;
    border-bottom: 1px solid #e5e5e5 !important;
}
.heatmap-table td.metric-name {
    background: #f5f5f5 !important;
    font-weight: 600 !important;
}
.heatmap-table td.score-cell.best { background: rgba(34, 197, 94, 0.2) !important; color: #15803d !important; }
.heatmap-table td.score-cell.good { background: rgba(34, 197, 94, 0.1) !important; color: #16a34a !important; }
.heatmap-table td.score-cell.mid { background: rgba(234, 179, 8, 0.15) !important; color: #a16207 !important; }
.heatmap-table td.score-cell.low { background: rgba(239, 68, 68, 0.12) !important; color: #dc2626 !important; }
.heatmap-table td.score-cell.worst { background: rgba(239, 68, 68, 0.2) !important; color: #b91c1c !important; }
.heatmap-table td.score-cell.na { color: #a1a1a1 !important; font-style: italic !important; }
.gradio-container footer { display: none !important; }
::-webkit-scrollbar { width: 8px; height: 8px; }
::-webkit-scrollbar-track { background: #f5f5f5; }
::-webkit-scrollbar-thumb { background: #d4d4d4; border-radius: 4px; }
::-webkit-scrollbar-thumb:hover { background: #a1a1a1; }
"""
def format_leaderboard_header(selected_leaderboard, metadata):
    """Render the HTML banner shown above a leaderboard.

    Args:
        selected_leaderboard: Name of the chosen leaderboard; falsy when
            nothing is selected.
        metadata: Mapping with an ``"evals"`` mapping keyed by eval name and
            an optional ``"source_info"`` mapping providing
            ``"organization"`` and ``"url"``. May be ``None`` or empty.

    Returns:
        str: A placeholder prompt when nothing is selected, a title-only
        banner when no evals are known, otherwise a full banner with the
        title, organisation, source link and one tag per eval name (sorted).
        All interpolated values are HTML-escaped.
    """
    from html import escape

    if not selected_leaderboard:
        return '<div style="text-align: center; padding: 2rem; color: #525252;">Select a leaderboard to explore</div>'

    title = escape(str(selected_leaderboard))
    if not metadata or not metadata.get("evals"):
        return f'<div class="info-banner"><h3>{title}</h3></div>'

    # ``source_info`` may be absent or explicitly None; treat both as empty.
    source_info = metadata.get("source_info") or {}
    org = escape(str(source_info.get("organization") or "Unknown"))
    url = escape(str(source_info.get("url") or "#"))

    eval_tags = "".join(
        f'<span class="eval-tag">{escape(str(name))}</span>'
        for name in sorted(metadata["evals"])
    )

    return f'''
<div class="info-banner">
    <div class="leaderboard-header">
        <div class="lb-meta">
            <div class="lb-title">{title}</div>
            <div class="lb-by">By {org}</div>
        </div>
        <a href="{url}" target="_blank" rel="noopener noreferrer" class="source-link">Source →</a>
    </div>
    <div class="eval-tags">{eval_tags}</div>
</div>
'''
def format_metric_details(selected_leaderboard, metadata):
    """Render the collapsible "Metric Reference" section as HTML.

    One card is emitted per entry in ``metadata["evals"]``. Each card is a
    hidden checkbox plus a ``<label>`` header, so it can be expanded and
    collapsed with CSS only.

    Args:
        selected_leaderboard: Name of the chosen leaderboard; falsy when
            nothing is selected.
        metadata: Mapping with an ``"evals"`` mapping of eval name to an
            info mapping. The info keys read here are ``score_type``,
            ``lower_is_better``, ``min_score``, ``max_score``,
            ``level_names`` and ``description``.

    Returns:
        str: The section HTML, or ``""`` when there is nothing to show.
        All interpolated values are HTML-escaped.
    """
    from html import escape

    if not selected_leaderboard or not metadata or not metadata.get("evals"):
        return ""

    cards = []
    for index, (eval_name, info) in enumerate(metadata["evals"].items()):
        info = info or {}
        # score_type may be missing or explicitly None.
        score_type = info.get("score_type") or ""
        lower_is_better = bool(info.get("lower_is_better"))
        direction = "Lower is better" if lower_is_better else "Higher is better"
        arrow = "↓" if lower_is_better else "↑"

        details = ""
        min_score = info.get("min_score")
        if score_type == "continuous" and min_score is not None:
            max_score = info.get("max_score")
            if max_score is not None:
                details = f"Range: [{min_score} – {max_score}]"
            else:
                # Only a lower bound is known; avoid a KeyError on max_score.
                details = f"Min: {min_score}"
        elif score_type == "levels" and info.get("level_names"):
            levels = ", ".join(str(level) for level in info["level_names"])
            details = f"Levels: {levels}"

        card_id = f"mc{index}"
        name_html = escape(str(eval_name))
        description_html = escape(str(info.get("description") or "No description"))
        details_html = escape(details)
        badge_html = escape(str(score_type).upper()) or "—"

        cards.append(f'''
<div class="metric-card" id="{card_id}">
    <input type="checkbox" id="toggle-{card_id}" class="metric-toggle" />
    <label class="metric-card-header" for="toggle-{card_id}">
        <span class="metric-card-name">{name_html}</span>
        <span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>
    </label>
    <div class="metric-card-body">
        <div>{description_html}</div>
        <div style="display: flex; justify-content: space-between; align-items: center; margin-top: 0.5rem;">
            <span style="font-size: 0.75rem; color: #525252;">{details_html}</span>
            <span class="metric-type-badge">{badge_html}</span>
        </div>
    </div>
</div>
''')

    cards_html = "".join(cards)
    return f'''
<div class="metrics-section">
    <h3>Metric Reference</h3>
    <div class="metrics-grid">{cards_html}</div>
</div>
'''
def format_model_card(model_name, model_data):
    """Render a summary card for one model across all its leaderboards.

    Args:
        model_name: Display name of the model.
        model_data: Mapping of leaderboard name to a mapping with optional
            ``developer``, ``params``, ``architecture`` and a ``results``
            mapping of metric name to numeric score (or ``None``).

    Returns:
        str: A "no results" placeholder when ``model_data`` is empty;
        otherwise HTML with the model header followed by one section per
        leaderboard that has results. Each section shows the average of the
        non-missing scores and the metrics sorted by score, highest first,
        with missing scores last. All interpolated values are HTML-escaped.
    """
    from html import escape

    if not model_data:
        return '<div class="no-results"><h3>No results found</h3><p>Try a different model name</p></div>'

    # Model-level attributes are taken from the first leaderboard entry.
    first = next(iter(model_data.values()))
    developer = escape(str(first.get("developer") or "Unknown"))
    arch = escape(str(first.get("architecture") or "Unknown"))
    params = first.get("params")
    params_str = escape(f"{params}B") if params else "—"
    name_html = escape(str(model_name))

    parts = [f'''
<div style="padding: 1rem; background: #ffffff; border-radius: 10px; border: 1px solid #e5e5e5;">
    <h2 style="margin: 0 0 0.5rem 0; color: #0a0a0a;">{name_html}</h2>
    <div style="color: #525252; margin-bottom: 1rem;">
        <span>Developer: {developer}</span> ·
        <span>Params: {params_str}</span> ·
        <span>Arch: {arch}</span>
    </div>
''']

    def _rank(item):
        # Used with reverse=True: real scores first (descending), None last.
        score = item[1]
        return (score is not None, score if score is not None else 0)

    for leaderboard_name, data in model_data.items():
        results = data.get("results") or {}
        if not results:
            continue

        scores = [value for value in results.values() if value is not None]
        # An average of exactly 0 is a real value, so test the list, not the mean.
        avg_str = f"{sum(scores) / len(scores):.2f}" if scores else "—"

        lb_html = escape(str(leaderboard_name))
        parts.append(
            f'<div style="margin-bottom: 1rem;"><h4 style="color: #0a0a0a;">{lb_html} '
            f'<span style="color: #525252;">(avg: {avg_str})</span></h4>'
        )
        parts.append('<div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">')
        for metric_name, score in sorted(results.items(), key=_rank, reverse=True):
            score_display = f"{score:.2f}" if score is not None else "—"
            metric_html = escape(str(metric_name))
            parts.append(
                '<div style="padding: 0.4rem 0.8rem; border-radius: 6px; background: #f5f5f5; border: 1px solid #e5e5e5;">'
                f'<span style="color: #525252;">{metric_html}:</span> '
                f'<strong style="color: #0a0a0a;">{score_display}</strong></div>'
            )
        parts.append('</div></div>')

    parts.append('</div>')
    return "".join(parts)
def _heatmap_class(score, low, high, n_valid):
    """Return the heat-map CSS class for ``score`` within ``[low, high]``.

    With fewer than two valid scores there is nothing to compare, so no
    class is assigned. The top score is "best"; the others are bucketed by
    their relative position between ``low`` and ``high``.
    """
    if n_valid < 2:
        return ""
    if score == high:
        return "best"
    if high <= low:
        return ""
    pct = (score - low) / (high - low)
    if pct >= 0.75:
        return "good"
    if pct >= 0.5:
        return "mid"
    if pct >= 0.25:
        return "low"
    return "worst"


def format_model_comparison(selected_models, all_results):
    """Render per-leaderboard heat-map tables comparing the selected models.

    Args:
        selected_models: Model names, in the column order to display.
        all_results: Mapping of model name to a mapping of leaderboard name
            to a mapping holding a ``results`` mapping (metric name to
            numeric score or ``None``).

    Returns:
        str: A placeholder when nothing is selected or no selected model has
        data; otherwise one ``heatmap-table`` per leaderboard (sorted by
        name) with one row per metric (sorted) and one column per selected
        model. Cells are classed best/good/mid/low/worst relative to the
        row's minimum and maximum, with higher always treated as better.
        Missing scores render as "—". All interpolated values are
        HTML-escaped.
    """
    from html import escape

    if not selected_models or not all_results:
        return '<div class="no-results"><h3>Select models to compare</h3><p>Choose models from the dropdown</p></div>'

    model_data_dict = {
        name: all_results[name] for name in selected_models if name in all_results
    }
    if not model_data_dict:
        return '<div class="no-results"><h3>No data found</h3></div>'

    all_leaderboards = sorted({lb for data in model_data_dict.values() for lb in data})

    parts = ['<div style="padding: 1rem; background: #ffffff; border-radius: 10px; border: 1px solid #e5e5e5;">']
    for leaderboard_name in all_leaderboards:
        metric_names = set()
        for data in model_data_dict.values():
            if leaderboard_name in data:
                metric_names.update((data[leaderboard_name].get("results") or {}).keys())
        metrics = sorted(metric_names)
        if not metrics:
            continue

        lb_html = escape(str(leaderboard_name))
        parts.append(f'<h3 style="margin: 1rem 0 0.5rem; color: #0a0a0a;">{lb_html}</h3>')
        parts.append('<div style="overflow-x: auto;"><table class="heatmap-table"><thead><tr><th>Metric</th>')
        for model_name in selected_models:
            short = model_name[:20] + "…" if len(model_name) > 20 else model_name
            parts.append(f'<th title="{escape(str(model_name))}">{escape(str(short))}</th>')
        parts.append('</tr></thead><tbody>')

        for metric_name in metrics:
            parts.append(f'<tr><td class="metric-name">{escape(str(metric_name))}</td>')
            scores = {}
            for model_name in selected_models:
                data = model_data_dict.get(model_name)
                if data is not None and leaderboard_name in data:
                    scores[model_name] = (data[leaderboard_name].get("results") or {}).get(metric_name)
            valid = [value for value in scores.values() if value is not None]
            high = max(valid) if valid else None
            low = min(valid) if valid else None

            for model_name in selected_models:
                score = scores.get(model_name)
                if score is None:
                    parts.append('<td class="score-cell na">—</td>')
                    continue
                # Classification depends on how many scores exist, not on
                # whether min/max are truthy, so 0 is a valid bound.
                cls = _heatmap_class(score, low, high, len(valid))
                parts.append(f'<td class="score-cell {cls}">{score:.2f}</td>')
            parts.append('</tr>')
        parts.append('</tbody></table></div>')

    parts.append('</div>')
    return "".join(parts)
def create_radar_plot(selected_models, all_results):
    """Build a radar (polar) chart comparing models on normalised scores.

    Each axis is one ``"<leaderboard>: <metric>"`` pair for which at least
    one selected model has a score. Scores are scaled to ``[0, 1]`` using
    ``min_score`` / ``max_score`` from ``get_eval_metadata(leaderboard)``
    when present. A missing minimum defaults to 0. A missing maximum is
    guessed as 100 if any observed score exceeds 1, else 1, and is widened
    to the largest observed score.

    Args:
        selected_models: Model names; one trace is added per name.
        all_results: Mapping of model name to a mapping of leaderboard name
            to a mapping holding a ``results`` mapping (metric name to
            numeric score or ``None``).

    Returns:
        A ``plotly.graph_objects.Figure``, or ``None`` when nothing is
        selected or no selected model has any score.
    """
    if not selected_models or not all_results:
        return None

    # (leaderboard, metric) -> {model: score}. Tuple keys avoid re-parsing a
    # joined label, which would break on names that themselves contain ": ".
    metric_data = {}
    leaderboards_involved = set()
    for model in selected_models:
        if model not in all_results:
            continue
        for lb_name, lb_data in all_results[model].items():
            leaderboards_involved.add(lb_name)
            for metric, score in (lb_data.get("results") or {}).items():
                if score is None:
                    continue
                metric_data.setdefault((lb_name, metric), {})[model] = score

    if not metric_data:
        return None

    meta_cache = {lb: get_eval_metadata(lb) for lb in leaderboards_involved}

    def _label(key):
        return f"{key[0]}: {key[1]}"

    # Order axes by their displayed label.
    categories = sorted(metric_data, key=_label)

    # The normalisation range depends only on the category, so compute it
    # once per axis instead of once per (model, axis).
    ranges = {}
    for key in categories:
        lb_name, metric_name = key
        # Metadata (or any level inside it) may be missing or None.
        lb_meta = meta_cache.get(lb_name) or {}
        meta = (lb_meta.get("evals") or {}).get(metric_name) or {}
        min_s = meta.get("min_score")
        max_s = meta.get("max_score")
        observed_max = max(metric_data[key].values())
        if min_s is None:
            min_s = 0
        if max_s is None:
            max_s = 100 if observed_max > 1 else 1
            # NOTE(review): the flattened source does not show whether this
            # widening also applied to a declared max_score; it is assumed
            # here to apply only to the guessed maximum. Confirm.
            max_s = max(max_s, observed_max)
        ranges[key] = (min_s, max_s)

    fig = go.Figure()
    for model in selected_models:
        r_values = []
        theta_values = []
        hover_texts = []
        for key in categories:
            label = _label(key)
            val = metric_data[key].get(model)
            theta_values.append(label)
            if val is None:
                r_values.append(None)
                hover_texts.append(f"{label}<br>N/A")
                continue

            min_s, max_s = ranges[key]
            if max_s == min_s:
                norm_val = 1.0
            else:
                norm_val = (val - min_s) / (max_s - min_s)
            # Clamp so out-of-range scores stay on the [0, 1] radial axis.
            norm_val = max(0.0, min(1.0, norm_val))
            r_values.append(norm_val)
            hover_texts.append(f"{label}<br>Score: {val:.2f} (Norm: {norm_val:.2f})")

        if r_values:
            # Repeat the first point so the polygon is closed.
            r_values.append(r_values[0])
            theta_values.append(theta_values[0])
            hover_texts.append(hover_texts[0])

        fig.add_trace(go.Scatterpolar(
            r=r_values,
            theta=theta_values,
            name=model,
            hovertext=hover_texts,
            hoverinfo="text",
            fill='toself',
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1],
            )
        ),
        showlegend=True,
        margin=dict(l=80, r=80, t=20, b=20),
        title="Model Comparison Radar (Normalized Scores)",
    )
    return fig