| import gradio as gr |
| import pandas as pd |
| import plotly.graph_objects as go |
| import os |
| import base64 |
| import re |
|
|
| from huggingface_hub import HfApi |
|
|
| import aliases |
| from constants import MARK_BY_CHOICES, MARK_BY_DEFAULT |
| from simple_data_loader import SimpleLeaderboardViewer |
| from leaderboard_transformer import ( |
| DataTransformer, |
| transform_raw_dataframe, |
| create_pretty_tag_map, |
| INFORMAL_TO_FORMAL_NAME_MAP, |
| _plot_scatter_plotly, |
| format_cost_column, |
| format_score_column, |
| format_runtime_column, |
| format_date_column, |
| get_pareto_df, |
| clean_llm_base_list, |
| get_company_from_model, |
| COMPANY_LOGO_MAP, |
| ) |
| from config import ( |
| CONFIG_NAME, |
| EXTRACTED_DATA_DIR, |
| IS_INTERNAL, |
| RESULTS_DATASET, |
| ) |
| from content import ( |
| create_gradio_anchor_id, |
| format_error, |
| get_benchmark_description, |
| hf_uri_to_web_url, |
| hyperlink, |
| SCATTER_DISCLAIMER, |
| ) |
|
|
| api = HfApi() |
| os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True) |
|
|
|
|
| def get_company_logo_html(model_name: str) -> str: |
| """ |
| Generates HTML for a company logo based on the model name. |
| """ |
| company_info = get_company_from_model(model_name) |
| uri = get_svg_as_data_uri(company_info["path"]) |
| if uri: |
| return f'<img src="{uri}" alt="{company_info["name"]}" title="{company_info["name"]}" style="width:20px; height:20px; vertical-align: middle;">' |
| return "" |
|
|
|
|
| |
| |
| OPENNESS_ICON_MAP = { |
| aliases.CANONICAL_OPENNESS_OPEN: "assets/ellipse-pink.svg", |
| aliases.CANONICAL_OPENNESS_CLOSED: "assets/ellipse-yellow.svg", |
| } |
|
|
| |
| for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items(): |
| for openness_alias in openness_aliases: |
| OPENNESS_ICON_MAP[openness_alias] = OPENNESS_ICON_MAP[canonical_openness] |
|
|
|
|
| OPENNESS_SVG_MAP = { |
| aliases.CANONICAL_OPENNESS_OPEN: { |
| "path": "assets/ellipse-pink.svg", |
| "description": "Open source model" |
| }, |
| aliases.CANONICAL_OPENNESS_CLOSED: { |
| "path": "assets/ellipse-yellow.svg", |
| "description": "Closed source model" |
| }, |
| } |
|
|
| def get_svg_as_data_uri(path: str) -> str: |
| """Reads an SVG file and returns it as a base64-encoded data URI.""" |
| try: |
| with open(path, "rb") as svg_file: |
| encoded_svg = base64.b64encode(svg_file.read()).decode("utf-8") |
| return f"data:image/svg+xml;base64,{encoded_svg}" |
| except FileNotFoundError: |
| print(f"Warning: SVG file not found at {path}") |
| return "" |
|
|
|
|
| def build_openness_tooltip_content() -> str: |
| """ |
| Generates the inner HTML for the Model Openness tooltip card using custom SVG lock icons. |
| """ |
| open_uri = get_svg_as_data_uri("assets/lock-open.svg") |
| closed_uri = get_svg_as_data_uri("assets/lock-closed.svg") |
| html_items = [ |
| f""" |
| <div class="tooltip-legend-item"> |
| <img src="{open_uri}" alt="Open" style="width: 24px; height: 24px;"> |
| <div> |
| <strong>Open</strong> |
| <span>Open source model</span> |
| </div> |
| </div> |
| """, |
| f""" |
| <div class="tooltip-legend-item"> |
| <img src="{closed_uri}" alt="Closed" style="width: 24px; height: 24px;"> |
| <div> |
| <strong>Closed</strong> |
| <span>Closed source model</span> |
| </div> |
| </div> |
| """ |
| ] |
|
|
| joined_items = "".join(html_items) |
|
|
| return f"""<span class="tooltip-icon-legend"> |
| ⓘ |
| <span class="tooltip-card"> |
| <h3>Model Openness</h3> |
| <p class="tooltip-description">Indicates whether the language model is open source or closed source.</p> |
| <div class="tooltip-items-container">{joined_items}</div> |
| </span> |
| </span>""" |
|
|
|
|
| def build_pareto_tooltip_content() -> str: |
| """Generates the inner HTML for the Pareto tooltip card with final copy.""" |
| trophy_uri = get_svg_as_data_uri("assets/trophy.svg") |
| trophy_icon_html = f'<img src="{trophy_uri}" style="width: 25px; height: 25px; vertical-align: middle;">' |
| return f""" |
| <h3>On Pareto Frontier</h3> |
| <p class="tooltip-description">The Pareto frontier represents the best balance between score and cost.</p> |
| <p class="tooltip-description">Agents on the frontier either:</p> |
| <ul class="tooltip-sub-list"> |
| <li>Offer the lowest cost for a given performance, or</li> |
| <li>Deliver the best performance at a given cost.</li> |
| </ul> |
| <div class="tooltip-description" style="margin-top: 12px; display: flex; align-items: center;"> |
| <span>These agents are marked with this icon:</span> |
| <span>{trophy_icon_html}</span> |
| </div> |
| """ |
|
|
|
|
|
|
|
|
| def build_descriptions_tooltip_content(table) -> str: |
| """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table.""" |
| if table == "Overall": |
| return """ |
| <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands SDK evaluated.</div> |
| <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div> |
| <div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div> |
| <div class="tooltip-description-item"><b>Average Cost:</b> Average cost per instance across all submitted benchmarks, in USD.</div> |
| <div class="tooltip-description-item"><b>Issue Resolution Score:</b> Macro-average score across Issue Resolution benchmarks.</div> |
| <div class="tooltip-description-item"><b>Issue Resolution Cost:</b> Macro-average cost per instance (USD) across Issue Resolution benchmarks.</div> |
| <div class="tooltip-description-item"><b>Frontend Score:</b> Macro-average score across Frontend benchmarks.</div> |
| <div class="tooltip-description-item"><b>Frontend Cost:</b> Macro-average cost per instance (USD) across Frontend benchmarks.</div> |
| <div class="tooltip-description-item"><b>Greenfield Score:</b> Macro-average score across Greenfield benchmarks.</div> |
| <div class="tooltip-description-item"><b>Greenfield Cost:</b> Macro-average cost per instance (USD) across Greenfield benchmarks.</div> |
| <div class="tooltip-description-item"><b>Testing Score:</b> Macro-average score across Testing benchmarks.</div> |
| <div class="tooltip-description-item"><b>Testing Cost:</b> Macro-average cost per instance (USD) across Testing benchmarks.</div> |
| <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div> |
| <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div> |
| <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div> |
| <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div> |
| <div class="tooltip-description-item"><b>Visualization:</b> View interactive evaluation visualization (when available).</div> |
| """ |
| elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]: |
| return f""" |
| <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div> |
| <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div> |
| <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div> |
| <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div> |
| <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div> |
| <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div> |
| <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div> |
| <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div> |
| <div class="tooltip-description-item"><b>📊 Visualization:</b> View interactive evaluation visualization (when available).</div> |
| <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div> |
| """ |
| else: |
| |
| return f""" |
| <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div> |
| <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div> |
| <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div> |
| <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div> |
| <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div> |
| <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div> |
| <div class="tooltip-description-item"><b>📊 Visualization:</b> View interactive evaluation visualization (when available).</div> |
| <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div> |
| """ |
|
|
| |
| open_lock_uri = get_svg_as_data_uri("assets/lock-open.svg") |
| closed_lock_uri = get_svg_as_data_uri("assets/lock-closed.svg") |
| openness_html_items = [ |
| f'<div style="display: flex; align-items: center; white-space: nowrap;">' |
| f'<img src="{open_lock_uri}" alt="Open" style="width:16px; height:16px; margin-right: 4px;">' |
| f'<span>Open</span>' |
| f'</div>', |
| f'<div style="display: flex; align-items: center; white-space: nowrap;">' |
| f'<img src="{closed_lock_uri}" alt="Closed" style="width:16px; height:16px; margin-right: 4px;">' |
| f'<span>Closed</span>' |
| f'</div>' |
| ] |
| openness_html = " ".join(openness_html_items) |
|
|
| pareto_tooltip_content = build_pareto_tooltip_content() |
| openness_tooltip_content = build_openness_tooltip_content() |
|
|
| def create_legend_markdown(which_table: str) -> str: |
| """ |
| Generates the complete HTML for the legend section, including tooltips. |
| This is used in the main leaderboard display. |
| """ |
| descriptions_tooltip_content = build_descriptions_tooltip_content(which_table) |
| trophy_uri = get_svg_as_data_uri("assets/trophy.svg") |
| |
| |
| download_section = "" |
| if which_table not in ["Overall", "Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]: |
| download_section = """ |
| <div> <!-- Container for the Download section --> |
| <b>Download</b> |
| <div class="table-legend-item"> |
| <span style="font-size: 16px; margin-right: 4px;">⬇️</span> |
| <span>Trajectories</span> |
| </div> |
| </div> |
| """ |
| |
| legend_markdown = f""" |
| <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;"> |
| |
| <div> <!-- Container for the Pareto section --> |
| <b>Pareto</b> |
| <span class="tooltip-icon-legend"> |
| ⓘ |
| <span class="tooltip-card">{pareto_tooltip_content}</span> |
| </span> |
| <div class="table-legend-item"> |
| <img src="{trophy_uri}" alt="On Frontier" style="width:20px; height:20px; margin-right: 4px; flex-shrink: 0;"> |
| <span>On frontier</span> |
| </div> |
| </div> |
| |
| <div> <!-- Container for the Openness section --> |
| <b>Model Openness</b> |
| {openness_tooltip_content} |
| <div class="table-legend-item">{openness_html}</div> |
| </div> |
| |
| {download_section} |
| |
| <div><!-- Container for the Column Descriptions section --> |
| <b>Column Descriptions</b> |
| <span class="tooltip-icon-legend"> |
| ⓘ |
| <span class="tooltip-card"> |
| <h3>Column Descriptions</h3> |
| <div class="tooltip-items-container">{descriptions_tooltip_content}</div> |
| </span> |
| </span> |
| </div> |
| </div> |
| """ |
| return legend_markdown |
|
|
| |
| company_legend_items = [] |
| |
| sample_companies = [ |
| ("Anthropic", "assets/logo-anthropic.svg"), |
| ("OpenAI", "assets/logo-openai.svg"), |
| ("Google", "assets/logo-google.svg"), |
| ("Meta", "assets/logo-meta.svg"), |
| ("Mistral", "assets/logo-mistral.svg"), |
| ] |
| for name, path in sample_companies: |
| uri = get_svg_as_data_uri(path) |
| if uri: |
| company_legend_items.append( |
| f'<div class="plot-legend-item">' |
| f'<img class="plot-legend-item-svg" src="{uri}" alt="{name}" title="{name}" style="width: 20px; height: 20px;">' |
| f'<span>{name}</span>' |
| f'</div>' |
| ) |
|
|
| plot_legend_html = f""" |
| <div class="plot-legend-container"> |
| <div id="plot-legend-logo"> |
| <img src="{get_svg_as_data_uri("assets/logo.svg")}"> |
| </div> |
| <div style="margin-bottom: 16px;"> |
| <span class="plot-legend-category-heading">Pareto</span> |
| <div style="margin-top: 8px;"> |
| <div class="plot-legend-item"> |
| <img id="plot-legend-item-pareto-svg" class="plot-legend-item-svg" src="{get_svg_as_data_uri("assets/pareto.svg")}"> |
| <span>On frontier</span> |
| </div> |
| </div> |
| </div> |
| <div> |
| <span class="plot-legend-category-heading">Company Logos</span> |
| <div style="margin-top: 8px;"> |
| {''.join(company_legend_items)} |
| </div> |
| </div> |
| </div> |
| """; |
|
|
| |
| CACHED_VIEWERS = {} |
| CACHED_TAG_MAPS = {} |
| _cache_lock = __import__('threading').Lock() |
| _data_version = 0 |
|
|
|
|
| def get_data_version(): |
| """Get the current data version number.""" |
| global _data_version |
| return _data_version |
|
|
|
|
| def clear_viewer_cache(): |
| """ |
| Clear all cached viewers and tag maps. |
| Called when data is refreshed from the background scheduler. |
| """ |
| global CACHED_VIEWERS, CACHED_TAG_MAPS, _data_version |
| with _cache_lock: |
| CACHED_VIEWERS.clear() |
| CACHED_TAG_MAPS.clear() |
| _data_version += 1 |
| print(f"[CACHE] Viewer cache cleared after data refresh (version: {_data_version})") |
|
|
|
|
| |
| try: |
| from setup_data import register_refresh_callback |
| register_refresh_callback(clear_viewer_cache) |
| except ImportError: |
| pass |
|
|
|
|
| |
| CATEGORY_DEFINITIONS = [ |
| {"name": "Issue Resolution", "score_col": "Issue Resolution Score", "icon": "bug-fixing.svg", "emoji": "🐛"}, |
| {"name": "Greenfield", "score_col": "Greenfield Score", "icon": "app-creation.svg", "emoji": "🌱"}, |
| {"name": "Frontend", "score_col": "Frontend Score", "icon": "frontend-development.svg", "emoji": "🎨"}, |
| {"name": "Testing", "score_col": "Testing Score", "icon": "test-generation.svg", "emoji": "🧪"}, |
| {"name": "Information Gathering", "score_col": "Information Gathering Score", "icon": "information-gathering.svg", "emoji": "🔍"}, |
| ] |
|
|
|
|
| def get_winners_by_category(df: pd.DataFrame, top_n: int = 5) -> dict: |
| """ |
| Extract the top N models for each category based on their score. |
| Deduplicates by model name, keeping only the highest score per model. |
| |
| Args: |
| df: The full leaderboard DataFrame with all scores |
| top_n: Number of top models to return per category (default: 5) |
| |
| Returns: |
| Dictionary mapping category name to list of top models with their scores |
| """ |
| winners = {} |
| |
| for cat_def in CATEGORY_DEFINITIONS: |
| cat_name = cat_def["name"] |
| score_col = cat_def["score_col"] |
| |
| if score_col not in df.columns: |
| winners[cat_name] = [] |
| continue |
| |
| |
| cat_df = df[df[score_col].notna()].copy() |
| |
| if cat_df.empty: |
| winners[cat_name] = [] |
| continue |
| |
| |
| def clean_model_name(model): |
| if isinstance(model, list): |
| model = model[0] if model else "Unknown" |
| return str(model).split('/')[-1] |
| |
| cat_df['_clean_model'] = cat_df['Language Model'].apply(clean_model_name) |
| |
| |
| cat_df = cat_df.sort_values(score_col, ascending=False) |
| cat_df = cat_df.drop_duplicates(subset=['_clean_model'], keep='first') |
| |
| |
| cat_df = cat_df.head(top_n) |
| |
| |
| top_models = [] |
| for rank, (_, row) in enumerate(cat_df.iterrows(), 1): |
| model_info = { |
| "rank": rank, |
| "language_model": row.get("Language Model", "Unknown"), |
| "score": row[score_col], |
| } |
| top_models.append(model_info) |
| |
| winners[cat_name] = top_models |
| |
| return winners |
|
|
|
|
| def create_winners_by_category_html(df: pd.DataFrame, top_n: int = 5) -> str: |
| """ |
| Create a single HTML table displaying the top N winners for each category side by side. |
| Format: | [emoji] Category | Score | [emoji] Category | Score | ... |
| |
| Args: |
| df: The full leaderboard DataFrame |
| top_n: Number of top models to show per category |
| |
| Returns: |
| HTML string with the winners table |
| """ |
| winners = get_winners_by_category(df, top_n) |
| |
| |
| html_parts = ['<div class="winners-by-category-container">'] |
| html_parts.append('<table class="winners-unified-table">') |
| |
| |
| html_parts.append('<thead><tr>') |
| for cat_def in CATEGORY_DEFINITIONS: |
| cat_name = cat_def["name"] |
| cat_emoji = cat_def["emoji"] |
| html_parts.append(f''' |
| <th class="category-header" colspan="2"> |
| {cat_emoji} {cat_name} |
| </th> |
| ''') |
| html_parts.append('</tr></thead>') |
| |
| |
| html_parts.append('<tbody>') |
| for rank in range(1, top_n + 1): |
| html_parts.append('<tr>') |
| |
| for cat_def in CATEGORY_DEFINITIONS: |
| cat_name = cat_def["name"] |
| top_models = winners.get(cat_name, []) |
| |
| |
| model_at_rank = None |
| for m in top_models: |
| if m["rank"] == rank: |
| model_at_rank = m |
| break |
| |
| if model_at_rank: |
| language_model = model_at_rank["language_model"] |
| score = model_at_rank["score"] |
| |
| |
| if isinstance(language_model, list): |
| language_model = language_model[0] if language_model else "Unknown" |
| model_display = str(language_model).split('/')[-1] |
| |
| |
| rank_suffix = "" |
| if rank == 1: |
| rank_suffix = " 🥇" |
| elif rank == 2: |
| rank_suffix = " 🥈" |
| elif rank == 3: |
| rank_suffix = " 🥉" |
| |
| html_parts.append(f''' |
| <td class="score-cell">{score:.1f}</td> |
| <td class="model-cell">{model_display}{rank_suffix}</td> |
| ''') |
| else: |
| html_parts.append('<td class="score-cell">-</td><td class="model-cell">-</td>') |
| |
| html_parts.append('</tr>') |
| |
| html_parts.append('</tbody></table></div>') |
| |
| return ''.join(html_parts) |
|
|
|
|
| class DummyViewer: |
| """A mock viewer to be cached on error. It has a ._load() method |
| to ensure it behaves like the real LeaderboardViewer.""" |
| def __init__(self, error_df): |
| self._error_df = error_df |
|
|
| def _load(self): |
| |
| return self._error_df, {} |
|
|
| def get_leaderboard_viewer_instance( |
| split: str, |
| agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS, |
| ): |
| """ |
| Fetches the LeaderboardViewer for a (split, agent_filter) pair, using a |
| thread-safe cache to avoid re-downloading data. The cache is keyed on |
| both axes so the OpenHands and Alternative Agents pages don't fight |
| over a single slot. On error, returns a stable DummyViewer. |
| """ |
| global CACHED_VIEWERS, CACHED_TAG_MAPS |
|
|
| cache_key = (split, agent_filter) |
|
|
| with _cache_lock: |
| if cache_key in CACHED_VIEWERS: |
| |
| return CACHED_VIEWERS[cache_key], CACHED_TAG_MAPS.get(cache_key, {"Overall": []}) |
|
|
| |
| try: |
| |
| data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results" |
|
|
| print(f"Loading data for split '{split}' (agent_filter={agent_filter}) from: {data_dir}/{CONFIG_NAME}") |
| viewer = SimpleLeaderboardViewer( |
| data_dir=data_dir, |
| config=CONFIG_NAME, |
| split=split, |
| agent_filter=agent_filter, |
| ) |
|
|
| |
| pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP) |
|
|
| |
| with _cache_lock: |
| CACHED_VIEWERS[cache_key] = viewer |
| CACHED_TAG_MAPS[cache_key] = pretty_tag_map |
|
|
| return viewer, pretty_tag_map |
|
|
| except Exception as e: |
| |
| error_message = f"Error loading data for split '{split}' (agent_filter={agent_filter}): {e}" |
| print(format_error(error_message)) |
|
|
| dummy_df = pd.DataFrame({"Message": [error_message]}) |
| dummy_viewer = DummyViewer(dummy_df) |
| dummy_tag_map = {"Overall": []} |
|
|
| |
| with _cache_lock: |
| CACHED_VIEWERS[cache_key] = dummy_viewer |
| CACHED_TAG_MAPS[cache_key] = dummy_tag_map |
|
|
| return dummy_viewer, dummy_tag_map |
|
|
|
|
| def create_leaderboard_display( |
| full_df: pd.DataFrame, |
| tag_map: dict, |
| category_name: str, |
| split_name: str |
| ): |
| """ |
| This UI factory takes pre-loaded data and renders the main DataFrame and Plot |
| for a given category (e.g., "Overall" or "Literature Understanding"). |
| |
| The display includes a timer that periodically checks for data updates and |
| refreshes the UI when new data is available. |
| """ |
| |
| initial_data_version = get_data_version() |
| |
| |
| |
| transformer = DataTransformer(full_df, tag_map) |
| df_view_full, plots_dict = transformer.view(tag=category_name, use_plotly=True) |
| |
| def prepare_df_for_display(df_view): |
| """Prepare a DataFrame for display with all formatting applied.""" |
| df_display = df_view.copy() |
| |
| |
| pareto_df = get_pareto_df(df_display) |
| trophy_uri = get_svg_as_data_uri("assets/trophy.svg") |
| if not pareto_df.empty and 'id' in pareto_df.columns: |
| pareto_agent_names = pareto_df['id'].tolist() |
| else: |
| pareto_agent_names = [] |
|
|
| for col in df_display.columns: |
| if "Cost" in col: |
| df_display = format_cost_column(df_display, col) |
|
|
| for col in df_display.columns: |
| if "Score" in col: |
| df_display = format_score_column(df_display, col) |
| |
| for col in df_display.columns: |
| if "Runtime" in col: |
| df_display = format_runtime_column(df_display, col) |
| |
| |
| if "Date" in df_display.columns: |
| df_display = format_date_column(df_display, "Date") |
| |
| |
| df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list) |
| |
| |
| def format_language_model_with_icons(row): |
| icons_html = '' |
| |
| |
| if row['id'] in pareto_agent_names: |
| icons_html += f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:18px; height:18px;">' |
| |
| |
| openness_val = row.get('Openness', '') |
| if openness_val in [aliases.CANONICAL_OPENNESS_OPEN, 'Open', 'Open Source', 'Open Source + Open Weights']: |
| lock_uri = get_svg_as_data_uri("assets/lock-open.svg") |
| icons_html += f'<img src="{lock_uri}" alt="Open" title="Open source model" style="width:16px; height:16px;">' |
| else: |
| lock_uri = get_svg_as_data_uri("assets/lock-closed.svg") |
| icons_html += f'<img src="{lock_uri}" alt="Closed" title="Closed source model" style="width:16px; height:16px;">' |
| |
| |
| company_html = get_company_logo_html(row['Language Model']) |
| if company_html: |
| icons_html += company_html |
| |
| |
| model_name = row['Language Model'] |
| if isinstance(model_name, list): |
| if len(model_name) > 1: |
| tooltip_text = "\\n".join(map(str, model_name)) |
| model_text = f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{model_name[0]} (+ {len(model_name) - 1}) ⓘ</span>' |
| elif len(model_name) == 1: |
| model_text = model_name[0] |
| else: |
| model_text = str(model_name) |
| else: |
| model_text = str(model_name) |
| |
| |
| return f'<div style="display:flex; align-items:center; gap:4px; flex-wrap:nowrap;">{icons_html}<span>{model_text}</span></div>' |
| |
| df_display['Language Model'] = df_display.apply(format_language_model_with_icons, axis=1) |
| |
| if 'Source' in df_display.columns: |
| df_display['SDK Version'] = df_display.apply( |
| lambda row: f"{row['SDK Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['SDK Version'], |
| axis=1 |
| ) |
| |
| columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source'] |
| df_display = df_display.drop(columns=columns_to_drop, errors='ignore') |
| |
| return df_display |
| |
| |
| |
| |
| if 'Categories Attempted' in df_view_full.columns: |
| df_view_complete = df_view_full[df_view_full['Categories Attempted'] == '5/5'].copy() |
| else: |
| df_view_complete = df_view_full.copy() |
| |
| |
| if 'Openness' in df_view_full.columns: |
| df_view_open = df_view_full[df_view_full['Openness'].str.lower() == 'open'].copy() |
| df_view_complete_open = df_view_complete[df_view_complete['Openness'].str.lower() == 'open'].copy() |
| else: |
| df_view_open = df_view_full.copy() |
| df_view_complete_open = df_view_complete.copy() |
| |
| df_display_complete = prepare_df_for_display(df_view_complete) |
| df_display_all = prepare_df_for_display(df_view_full) |
| df_display_open = prepare_df_for_display(df_view_open) |
| df_display_complete_open = prepare_df_for_display(df_view_complete_open) |
| |
| |
| has_complete_entries = len(df_display_complete) > 0 |
| |
| |
| if category_name == "Overall": |
| primary_score_col = "Average Score" |
| primary_cost_col = "Average Cost" |
| primary_runtime_col = "Average Runtime" |
| else: |
| primary_score_col = f"{category_name} Score" |
| primary_cost_col = f"{category_name} Cost" |
| primary_runtime_col = f"{category_name} Runtime" |
| |
| |
| def create_cost_scatter_plot(df_data, mark_by=MARK_BY_DEFAULT, show_all_labels=False): |
| return _plot_scatter_plotly( |
| data=df_data, |
| x=primary_cost_col if primary_cost_col in df_data.columns else None, |
| y=primary_score_col if primary_score_col in df_data.columns else "Average Score", |
| agent_col="SDK Version", |
| name=category_name, |
| plot_type='cost', |
| mark_by=mark_by, |
| show_all_labels=show_all_labels |
| ) |
| |
| |
| def create_runtime_scatter_plot(df_data, mark_by=MARK_BY_DEFAULT, show_all_labels=False): |
| return _plot_scatter_plotly( |
| data=df_data, |
| x=primary_runtime_col if primary_runtime_col in df_data.columns else None, |
| y=primary_score_col if primary_score_col in df_data.columns else "Average Score", |
| agent_col="SDK Version", |
| name=category_name, |
| plot_type='runtime', |
| mark_by=mark_by, |
| show_all_labels=show_all_labels |
| ) |
| |
| |
| cost_scatter_complete = create_cost_scatter_plot(df_view_complete) if has_complete_entries else go.Figure() |
| cost_scatter_all = create_cost_scatter_plot(df_view_full) |
| cost_scatter_open = create_cost_scatter_plot(df_view_open) if len(df_view_open) > 0 else go.Figure() |
| cost_scatter_complete_open = create_cost_scatter_plot(df_view_complete_open) if len(df_view_complete_open) > 0 else go.Figure() |
| |
| |
| runtime_scatter_complete = create_runtime_scatter_plot(df_view_complete) if has_complete_entries else go.Figure() |
| runtime_scatter_all = create_runtime_scatter_plot(df_view_full) |
| runtime_scatter_open = create_runtime_scatter_plot(df_view_open) if len(df_view_open) > 0 else go.Figure() |
| runtime_scatter_complete_open = create_runtime_scatter_plot(df_view_complete_open) if len(df_view_complete_open) > 0 else go.Figure() |
| |
| |
| df_headers = df_display_all.columns.tolist() |
| df_datatypes = [] |
| for col in df_headers: |
| if col in ["Logs", "Visualization"] or "Cost" in col or "Score" in col or "Runtime" in col: |
| df_datatypes.append("markdown") |
| elif col in ["SDK Version", "Language Model"]: |
| df_datatypes.append("html") |
| else: |
| df_datatypes.append("str") |
| |
| |
| fixed_start_widths = [280, 100, 100, 90] |
| num_score_cost_runtime_cols = 0 |
| remaining_headers = df_headers[len(fixed_start_widths):] |
| for col in remaining_headers: |
| if "Score" in col or "Cost" in col or "Runtime" in col: |
| num_score_cost_runtime_cols += 1 |
| dynamic_widths = [90] * num_score_cost_runtime_cols |
| fixed_end_widths = [90, 100, 120, 120] |
| |
| final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths |
|
|
| |
| num_complete = len(df_display_complete) |
| num_total = len(df_display_all) |
| num_incomplete = num_total - num_complete |
| num_open = len(df_display_open) |
| num_closed = num_total - num_open |
| |
| |
| with gr.Row(): |
| with gr.Column(scale=3): |
| if has_complete_entries: |
| show_incomplete_checkbox = gr.Checkbox( |
| label=f"Show incomplete entries ({num_incomplete} entries with fewer than 5 categories)", |
| value=False, |
| elem_id="show-incomplete-toggle" |
| ) |
| else: |
| show_incomplete_checkbox = None |
| gr.Markdown(f"*No entries with all 5 categories completed yet. Showing all {num_total} entries.*") |
| |
| |
| if num_open > 0 and num_closed > 0: |
| show_open_only_checkbox = gr.Checkbox( |
| label=f"Show only open models ({num_open} open, {num_closed} closed)", |
| value=False, |
| elem_id="show-open-only-toggle" |
| ) |
| else: |
| show_open_only_checkbox = None |
| |
| |
| show_all_labels_checkbox = gr.Checkbox( |
| label="Show all labels on scatter plots", |
| value=False, |
| elem_id="show-all-labels-toggle" |
| ) |
| |
| with gr.Column(scale=1): |
| mark_by_dropdown = gr.Dropdown( |
| choices=MARK_BY_CHOICES, |
| value=MARK_BY_DEFAULT, |
| label="Mark systems by", |
| elem_id="mark-by-dropdown" |
| ) |
|
|
| |
| |
| initial_cost_plot = cost_scatter_complete if has_complete_entries else cost_scatter_all |
| cost_plot_component = gr.Plot( |
| value=initial_cost_plot, |
| show_label=False, |
| ) |
| |
| |
| initial_runtime_plot = runtime_scatter_complete if has_complete_entries else runtime_scatter_all |
| runtime_plot_component = gr.Plot( |
| value=initial_runtime_plot, |
| show_label=False, |
| ) |
| gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer") |
|
|
| |
| with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"): |
| |
| if has_complete_entries: |
| |
| dataframe_component = gr.DataFrame( |
| headers=df_headers, |
| value=df_display_complete, |
| datatype=df_datatypes, |
| interactive=False, |
| wrap=True, |
| column_widths=final_column_widths, |
| elem_classes=["wrap-header-df"], |
| show_search="search", |
| elem_id="main-leaderboard" |
| ) |
| |
| |
| def update_display(show_incomplete, show_open_only, mark_by, show_all_labels): |
| |
| if show_open_only: |
| df_to_show = df_display_open if show_incomplete else df_display_complete_open |
| view_df = df_view_open if show_incomplete else df_view_complete_open |
| else: |
| df_to_show = df_display_all if show_incomplete else df_display_complete |
| view_df = df_view_full if show_incomplete else df_view_complete |
| |
| |
| cost_plot = create_cost_scatter_plot(view_df, mark_by, show_all_labels) |
| runtime_plot = create_runtime_scatter_plot(view_df, mark_by, show_all_labels) |
| return df_to_show, cost_plot, runtime_plot |
| |
| |
| filter_inputs = [show_incomplete_checkbox] |
| if show_open_only_checkbox is not None: |
| filter_inputs.append(show_open_only_checkbox) |
| else: |
| |
| filter_inputs = [show_incomplete_checkbox, gr.State(value=False)] |
| filter_inputs.append(mark_by_dropdown) |
| filter_inputs.append(show_all_labels_checkbox) |
| |
| show_incomplete_checkbox.change( |
| fn=update_display, |
| inputs=filter_inputs, |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| if show_open_only_checkbox is not None: |
| show_open_only_checkbox.change( |
| fn=update_display, |
| inputs=filter_inputs, |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| mark_by_dropdown.change( |
| fn=update_display, |
| inputs=filter_inputs, |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| show_all_labels_checkbox.change( |
| fn=update_display, |
| inputs=filter_inputs, |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| else: |
| dataframe_component = gr.DataFrame( |
| headers=df_headers, |
| value=df_display_all, |
| datatype=df_datatypes, |
| interactive=False, |
| wrap=True, |
| column_widths=final_column_widths, |
| elem_classes=["wrap-header-df"], |
| show_search="search", |
| elem_id="main-leaderboard" |
| ) |
| |
| |
| def update_display_no_complete(show_open_only, mark_by, show_all_labels): |
| if show_open_only: |
| df_to_show = df_display_open |
| view_df = df_view_open |
| else: |
| df_to_show = df_display_all |
| view_df = df_view_full |
| cost_plot = create_cost_scatter_plot(view_df, mark_by, show_all_labels) |
| runtime_plot = create_runtime_scatter_plot(view_df, mark_by, show_all_labels) |
| return df_to_show, cost_plot, runtime_plot |
| |
| filter_inputs_no_complete = [] |
| if show_open_only_checkbox is not None: |
| filter_inputs_no_complete.append(show_open_only_checkbox) |
| else: |
| filter_inputs_no_complete.append(gr.State(value=False)) |
| filter_inputs_no_complete.append(mark_by_dropdown) |
| filter_inputs_no_complete.append(show_all_labels_checkbox) |
| |
| if show_open_only_checkbox is not None: |
| show_open_only_checkbox.change( |
| fn=update_display_no_complete, |
| inputs=filter_inputs_no_complete, |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| mark_by_dropdown.change( |
| fn=update_display_no_complete, |
| inputs=filter_inputs_no_complete, |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| show_all_labels_checkbox.change( |
| fn=update_display_no_complete, |
| inputs=filter_inputs_no_complete, |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| |
| legend_markdown = create_legend_markdown(category_name) |
| gr.HTML(value=legend_markdown, elem_id="legend-markdown") |
| |
| |
| |
| def check_and_refresh_data(show_incomplete, show_open_only=False, mark_by=MARK_BY_DEFAULT, show_all_labels=False): |
| """Check if data has been refreshed and return updated data if so.""" |
| current_version = get_data_version() |
| if current_version > initial_data_version: |
| |
| print(f"[REFRESH] Data version changed from {initial_data_version} to {current_version}, reloading...") |
| new_df, new_tag_map = get_full_leaderboard_data(split_name) |
| if not new_df.empty: |
| new_transformer = DataTransformer(new_df, new_tag_map) |
| new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True) |
| |
| |
| if 'Categories Attempted' in new_df_view_full.columns: |
| new_df_view_complete = new_df_view_full[new_df_view_full['Categories Attempted'] == '5/5'].copy() |
| else: |
| new_df_view_complete = new_df_view_full.copy() |
| |
| |
| if 'Openness' in new_df_view_full.columns: |
| new_df_view_open = new_df_view_full[new_df_view_full['Openness'].str.lower() == 'open'].copy() |
| new_df_view_complete_open = new_df_view_complete[new_df_view_complete['Openness'].str.lower() == 'open'].copy() |
| else: |
| new_df_view_open = new_df_view_full.copy() |
| new_df_view_complete_open = new_df_view_complete.copy() |
| |
| new_df_display_complete = prepare_df_for_display(new_df_view_complete) |
| new_df_display_all = prepare_df_for_display(new_df_view_full) |
| new_df_display_open = prepare_df_for_display(new_df_view_open) |
| new_df_display_complete_open = prepare_df_for_display(new_df_view_complete_open) |
| |
| |
| new_cost_scatter_complete = create_cost_scatter_plot(new_df_view_complete, mark_by, show_all_labels) if len(new_df_display_complete) > 0 else go.Figure() |
| new_cost_scatter_all = create_cost_scatter_plot(new_df_view_full, mark_by, show_all_labels) |
| new_cost_scatter_open = create_cost_scatter_plot(new_df_view_open, mark_by, show_all_labels) if len(new_df_view_open) > 0 else go.Figure() |
| new_cost_scatter_complete_open = create_cost_scatter_plot(new_df_view_complete_open, mark_by, show_all_labels) if len(new_df_view_complete_open) > 0 else go.Figure() |
| |
| new_runtime_scatter_complete = create_runtime_scatter_plot(new_df_view_complete, mark_by, show_all_labels) if len(new_df_display_complete) > 0 else go.Figure() |
| new_runtime_scatter_all = create_runtime_scatter_plot(new_df_view_full, mark_by, show_all_labels) |
| new_runtime_scatter_open = create_runtime_scatter_plot(new_df_view_open, mark_by, show_all_labels) if len(new_df_view_open) > 0 else go.Figure() |
| new_runtime_scatter_complete_open = create_runtime_scatter_plot(new_df_view_complete_open, mark_by, show_all_labels) if len(new_df_view_complete_open) > 0 else go.Figure() |
| |
| |
| if show_open_only: |
| if show_incomplete: |
| return new_df_display_open, new_cost_scatter_open, new_runtime_scatter_open |
| else: |
| return new_df_display_complete_open, new_cost_scatter_complete_open, new_runtime_scatter_complete_open |
| else: |
| if show_incomplete: |
| return new_df_display_all, new_cost_scatter_all, new_runtime_scatter_all |
| else: |
| return new_df_display_complete, new_cost_scatter_complete, new_runtime_scatter_complete |
| |
| |
| if show_open_only: |
| if show_incomplete: |
| return df_display_open, cost_scatter_open, runtime_scatter_open |
| else: |
| return df_display_complete_open, cost_scatter_complete_open, runtime_scatter_complete_open |
| else: |
| if show_incomplete: |
| return df_display_all, cost_scatter_all, runtime_scatter_all |
| else: |
| return df_display_complete, cost_scatter_complete, runtime_scatter_complete |
| |
| |
| refresh_timer = gr.Timer(value=60) |
| |
| |
| if show_incomplete_checkbox is not None: |
| timer_inputs = [show_incomplete_checkbox] |
| if show_open_only_checkbox is not None: |
| timer_inputs.append(show_open_only_checkbox) |
| timer_inputs.append(mark_by_dropdown) |
| timer_inputs.append(show_all_labels_checkbox) |
| refresh_timer.tick( |
| fn=check_and_refresh_data, |
| inputs=timer_inputs, |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| else: |
| |
| def check_and_refresh_all(show_open_only=False, mark_by=MARK_BY_DEFAULT, show_all_labels=False): |
| current_version = get_data_version() |
| if current_version > initial_data_version: |
| print(f"[REFRESH] Data version changed, reloading...") |
| new_df, new_tag_map = get_full_leaderboard_data(split_name) |
| if not new_df.empty: |
| new_transformer = DataTransformer(new_df, new_tag_map) |
| new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True) |
| |
| if show_open_only and 'Openness' in new_df_view_full.columns: |
| new_df_view_full = new_df_view_full[new_df_view_full['Openness'].str.lower() == 'open'].copy() |
| |
| new_df_display_all = prepare_df_for_display(new_df_view_full) |
| new_cost_scatter_all = create_cost_scatter_plot(new_df_view_full, mark_by, show_all_labels) |
| new_runtime_scatter_all = create_runtime_scatter_plot(new_df_view_full, mark_by, show_all_labels) |
| return new_df_display_all, new_cost_scatter_all, new_runtime_scatter_all |
| |
| if show_open_only: |
| return df_display_open, cost_scatter_open, runtime_scatter_open |
| return df_display_all, cost_scatter_all, runtime_scatter_all |
| |
| if show_open_only_checkbox is not None: |
| refresh_timer.tick( |
| fn=check_and_refresh_all, |
| inputs=[show_open_only_checkbox, mark_by_dropdown, show_all_labels_checkbox], |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
| else: |
| def check_and_refresh_simple(mark_by=MARK_BY_DEFAULT, show_all_labels=False): |
| return check_and_refresh_all(False, mark_by, show_all_labels) |
| refresh_timer.tick( |
| fn=check_and_refresh_simple, |
| inputs=[mark_by_dropdown, show_all_labels_checkbox], |
| outputs=[dataframe_component, cost_plot_component, runtime_plot_component] |
| ) |
|
|
| |
| return show_incomplete_checkbox, show_open_only_checkbox, mark_by_dropdown |
|
|
| |
| def create_benchmark_details_display( |
| full_df: pd.DataFrame, |
| tag_map: dict, |
| category_name: str, |
| validation: bool = False, |
| ): |
| """ |
| Generates a detailed breakdown for each benchmark within a given category. |
| For each benchmark, it creates a title, a filtered table, and a scatter plot. |
| Only shows the detailed results if there is more than one benchmark in the category. |
| Args: |
| full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split. |
| tag_map (dict): The "pretty" tag map to find the list of benchmarks. |
| category_name (str): The main category to display details for (e.g., "Literature Understanding"). |
| """ |
| |
| benchmark_names = tag_map.get(category_name, []) |
|
|
| |
| if len(benchmark_names) <= 1: |
| return |
|
|
| gr.HTML(f'<h2 class="benchmark-main-subtitle">{category_name} Detailed Benchmark Results</h2>') |
| gr.Markdown("---") |
| |
| for benchmark_name in benchmark_names: |
| anchor_id = create_gradio_anchor_id(benchmark_name, validation) |
| gr.HTML( |
| f""" |
| <h3 class="benchmark-title" id="{anchor_id}">{benchmark_name} Leaderboard <a href="#{anchor_id}" class="header-link-icon">🔗</a></h3> |
| <div class="benchmark-description">{get_benchmark_description(benchmark_name, validation)}</div> |
| <button onclick="scroll_to_element('page-content-wrapper')" class="primary-link-button">Return to the aggregate {category_name} leaderboard</button> |
| """ |
| ) |
|
|
| |
| benchmark_score_col = f"{benchmark_name} Score" |
| benchmark_cost_col = f"{benchmark_name} Cost" |
| benchmark_runtime_col = f"{benchmark_name} Runtime" |
| benchmark_download_col = f"{benchmark_name} Download" |
| benchmark_visualization_col = f"{benchmark_name} Visualization" |
|
|
| |
| table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_visualization_col, benchmark_download_col, 'id', 'Language Model'] |
|
|
| |
| existing_table_cols = [col for col in table_cols if col in full_df.columns] |
|
|
| if benchmark_score_col not in existing_table_cols: |
| gr.Markdown(f"Score data for {benchmark_name} not available.") |
| continue |
|
|
| |
| benchmark_table_df = full_df[existing_table_cols].copy() |
| pareto_df = get_pareto_df(benchmark_table_df) |
| |
| trophy_uri = get_svg_as_data_uri("assets/trophy.svg") |
| if not pareto_df.empty and 'id' in pareto_df.columns: |
| pareto_agent_names = pareto_df['id'].tolist() |
| else: |
| pareto_agent_names = [] |
|
|
| |
| benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list) |
| |
| |
| def format_language_model_with_icons(row): |
| icons_html = '' |
| |
| |
| if row['id'] in pareto_agent_names: |
| icons_html += f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:18px; height:18px;">' |
| |
| |
| openness_val = row.get('Openness', '') |
| if openness_val in [aliases.CANONICAL_OPENNESS_OPEN, 'Open', 'Open Source', 'Open Source + Open Weights']: |
| lock_uri = get_svg_as_data_uri("assets/lock-open.svg") |
| icons_html += f'<img src="{lock_uri}" alt="Open" title="Open source model" style="width:16px; height:16px;">' |
| else: |
| lock_uri = get_svg_as_data_uri("assets/lock-closed.svg") |
| icons_html += f'<img src="{lock_uri}" alt="Closed" title="Closed source model" style="width:16px; height:16px;">' |
| |
| |
| company_html = get_company_logo_html(row['Language Model']) |
| if company_html: |
| icons_html += company_html |
| |
| |
| model_name = row['Language Model'] |
| if isinstance(model_name, list): |
| if len(model_name) > 1: |
| tooltip_text = "\\n".join(map(str, model_name)) |
| model_text = f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{model_name[0]} (+ {len(model_name) - 1}) ⓘ</span>' |
| elif len(model_name) == 1: |
| model_text = model_name[0] |
| else: |
| model_text = str(model_name) |
| else: |
| model_text = str(model_name) |
| |
| |
| return f'<div style="display:flex; align-items:center; gap:4px; flex-wrap:nowrap;">{icons_html}<span>{model_text}</span></div>' |
| |
| benchmark_table_df['Language Model'] = benchmark_table_df.apply(format_language_model_with_icons, axis=1) |
| |
| |
| if 'Source' in benchmark_table_df.columns: |
| benchmark_table_df['SDK Version'] = benchmark_table_df.apply( |
| lambda row: f"{row['SDK Version']} {row['Source']}" if row['Source'] else row['SDK Version'], |
| axis=1 |
| ) |
|
|
| |
| def check_benchmark_status(row): |
| has_score = pd.notna(row.get(benchmark_score_col)) |
| has_cost = pd.notna(row.get(benchmark_cost_col)) |
| if has_score and has_cost: |
| return "✅" |
| if has_score or has_cost: |
| return "⚠️" |
| return "🚫 " |
|
|
| |
| benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1) |
| |
| if benchmark_score_col in benchmark_table_df.columns: |
| benchmark_table_df = benchmark_table_df.sort_values( |
| by=benchmark_score_col, ascending=False, na_position='last' |
| ) |
| |
| benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col) |
| benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col) |
| |
| |
| if benchmark_download_col in benchmark_table_df.columns: |
| def format_download_link(url): |
| if pd.isna(url) or url == "": |
| return "" |
| return f"[⬇️]({url})" |
| benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link) |
| |
| |
| if benchmark_visualization_col in benchmark_table_df.columns: |
| def format_visualization_link(url): |
| if pd.isna(url) or url == "": |
| return "" |
| return f"[📊]({url})" |
| benchmark_table_df[benchmark_visualization_col] = benchmark_table_df[benchmark_visualization_col].apply(format_visualization_link) |
| |
| desired_cols_in_order = [ |
| 'Language Model', |
| 'SDK Version', |
| 'Attempted Benchmark', |
| benchmark_score_col, |
| benchmark_cost_col, |
| benchmark_runtime_col, |
| 'Date', |
| 'Logs', |
| benchmark_visualization_col, |
| benchmark_download_col |
| ] |
| for col in desired_cols_in_order: |
| if col not in benchmark_table_df.columns: |
| benchmark_table_df[col] = pd.NA |
| benchmark_table_df = benchmark_table_df[desired_cols_in_order] |
| |
| |
| if benchmark_runtime_col in benchmark_table_df.columns: |
| benchmark_table_df = format_runtime_column(benchmark_table_df, benchmark_runtime_col) |
| |
| |
| if "Date" in benchmark_table_df.columns: |
| benchmark_table_df = format_date_column(benchmark_table_df, "Date") |
| |
| |
| benchmark_table_df.rename(columns={ |
| benchmark_score_col: 'Score', |
| benchmark_cost_col: 'Cost', |
| benchmark_runtime_col: 'Runtime', |
| benchmark_visualization_col: '📊', |
| benchmark_download_col: '⬇️', |
| }, inplace=True) |
| |
| |
| df_headers = benchmark_table_df.columns.tolist() |
| df_datatypes = [] |
| for col in df_headers: |
| if col in ["Logs", "📊", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col: |
| df_datatypes.append("markdown") |
| elif col in ["SDK Version", "Language Model"]: |
| df_datatypes.append("html") |
| else: |
| df_datatypes.append("str") |
| |
| |
| cost_benchmark_plot = _plot_scatter_plotly( |
| data=full_df, |
| x=benchmark_cost_col, |
| y=benchmark_score_col, |
| agent_col="SDK Version", |
| name=benchmark_name, |
| plot_type='cost' |
| ) |
| gr.Plot(value=cost_benchmark_plot, show_label=False) |
| |
| |
| runtime_benchmark_plot = _plot_scatter_plotly( |
| data=full_df, |
| x=benchmark_runtime_col, |
| y=benchmark_score_col, |
| agent_col="SDK Version", |
| name=benchmark_name, |
| plot_type='runtime' |
| ) |
| gr.Plot(value=runtime_benchmark_plot, show_label=False) |
| gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer") |
|
|
| |
| with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"): |
| gr.DataFrame( |
| headers=df_headers, |
| value=benchmark_table_df, |
| datatype=df_datatypes, |
| interactive=False, |
| wrap=True, |
| column_widths=[200, 80, 40, 80, 80, 80, 100, 120, 40, 40], |
| show_search="search", |
| elem_classes=["wrap-header-df"] |
| ) |
| legend_markdown = create_legend_markdown(benchmark_name) |
| gr.HTML(value=legend_markdown, elem_id="legend-markdown") |
|
|
| def get_full_leaderboard_data( |
| split: str, |
| agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS, |
| ) -> tuple[pd.DataFrame, dict]: |
| """ |
| Loads and transforms the complete dataset for a (split, agent_filter) |
| pair. ``agent_filter`` defaults to ``"openhands"`` so existing pages |
| that don't pass it stay on the canonical leaderboard. The Alternative |
| Agents page passes ``"alternative"`` to get the third-party harnesses. |
| """ |
| viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split, agent_filter=agent_filter) |
|
|
| if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)): |
| raw_df, _ = viewer_or_data._load() |
| if raw_df.empty: |
| return pd.DataFrame(), {} |
|
|
| pretty_df = transform_raw_dataframe(raw_df) |
| pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP) |
| if "Logs" in pretty_df.columns: |
| |
| benchmarks = ['swe-bench', 'swe-bench-multimodal', 'commit0', 'swt-bench', 'gaia'] |
| def make_download_icons(idx): |
| parts = [] |
| for benchmark in benchmarks: |
| download_col = f"{benchmark} download" |
| if download_col in raw_df.columns: |
| url = raw_df.loc[idx, download_col] if idx in raw_df.index else None |
| if pd.notna(url) and url != "": |
| parts.append(f'<a href="{url}" title="Download {benchmark} results" target="_blank">⬇️</a>') |
| return " ".join(parts) if parts else "" |
| pretty_df["Logs"] = pretty_df.index.map(make_download_icons) |
| |
| if "Visualization" in pretty_df.columns: |
| |
| benchmarks = ['swe-bench', 'swe-bench-multimodal', 'commit0', 'swt-bench', 'gaia'] |
| def make_visualization_icons(idx): |
| parts = [] |
| for benchmark in benchmarks: |
| viz_col = f"{benchmark} visualization" |
| if viz_col in raw_df.columns: |
| url = raw_df.loc[idx, viz_col] if idx in raw_df.index else None |
| if pd.notna(url) and url != "": |
| parts.append(f'<a href="{url}" title="Visualize {benchmark} results" target="_blank">📊</a>') |
| return " ".join(parts) if parts else "" |
| pretty_df["Visualization"] = pretty_df.index.map(make_visualization_icons) |
|
|
| if "Source" in pretty_df.columns: |
| def format_source_url_to_html(raw_url): |
| |
| if pd.isna(raw_url) or raw_url == "": return "" |
| |
| return hyperlink(str(raw_url), "🔗") |
| |
| pretty_df["Source"] = pretty_df["Source"].apply(format_source_url_to_html) |
| return pretty_df, pretty_tag_map |
|
|
| |
| return pd.DataFrame(), {} |
| def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML: |
| """ |
| Builds the entire sub-navigation bar as a single, self-contained HTML component. |
| This bypasses Gradio's layout components, giving us full control. |
| """ |
| benchmark_names = tag_map.get(category_name, []) |
| if not benchmark_names: |
| |
| return gr.HTML() |
|
|
| |
| html_buttons = [] |
| for name in benchmark_names: |
| target_id = create_gradio_anchor_id(name, validation) |
|
|
| |
| |
| |
| button_str = f""" |
| <button |
| class="primary-link-button" |
| onclick="scroll_to_element('{target_id}')" |
| > |
| {name} |
| </button> |
| """ |
| html_buttons.append(button_str) |
|
|
| |
| |
| full_html = f""" |
| <div class="sub-nav-bar-container"> |
| <span class="sub-nav-label">Benchmarks in this category:</span> |
| {' | '.join(html_buttons)} |
| </div> |
| """ |
|
|
| |
| return gr.HTML(full_html) |
|
|