Spaces:
Running
Running
import base64
import os
import re
import threading

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from huggingface_hub import HfApi

import aliases
from simple_data_loader import SimpleLeaderboardViewer
from leaderboard_transformer import (
    DataTransformer,
    transform_raw_dataframe,
    create_pretty_tag_map,
    INFORMAL_TO_FORMAL_NAME_MAP,
    _plot_scatter_plotly,
    format_cost_column,
    format_score_column,
    get_pareto_df,
    clean_llm_base_list,
)
from config import (
    CONFIG_NAME,
    EXTRACTED_DATA_DIR,
    IS_INTERNAL,
    RESULTS_DATASET,
)
from content import (
    create_gradio_anchor_id,
    format_error,
    get_benchmark_description,
    hf_uri_to_web_url,
    hyperlink,
    SCATTER_DISCLAIMER,
)
# Hugging Face Hub API client (its call sites are outside this section).
api = HfApi()
# Make sure the extracted-data directory exists before anything reads from it;
# exist_ok=True makes this a no-op when the directory is already present.
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
# Company logo lookup table.
# Each row is (display name, logo asset path, lowercase substrings that identify
# the company's models). Row order and pattern order are significant: lookups
# walk COMPANY_LOGO_MAP in insertion order and stop at the first substring hit.
_COMPANY_LOGO_ROWS = (
    ("Anthropic", "assets/logo-anthropic.svg", ("anthropic", "claude")),
    ("OpenAI", "assets/logo-openai.svg", ("openai", "gpt", "o1", "o3")),
    ("Google", "assets/logo-google.svg", ("google", "gemini", "gemma")),
    ("Meta", "assets/logo-meta.svg", ("meta", "llama")),
    ("Mistral", "assets/logo-mistral.svg", ("mistral", "mixtral", "codestral")),
    ("DeepSeek", "assets/logo-deepseek.svg", ("deepseek",)),
    ("xAI", "assets/logo-xai.svg", ("xai", "grok")),
    ("Cohere", "assets/logo-cohere.svg", ("cohere", "command")),
    ("Qwen", "assets/logo-qwen.svg", ("qwen", "alibaba")),
    ("Moonshot", "assets/logo-moonshot.svg", ("kimi", "moonshot")),
    ("MiniMax", "assets/logo-minimax.svg", ("minimax",)),
)

# Maps a model-name pattern to {"path": <logo file>, "name": <company name>}.
# Every pattern gets its own dict object.
COMPANY_LOGO_MAP = {
    pattern: {"path": logo_path, "name": company_name}
    for company_name, logo_path, patterns in _COMPANY_LOGO_ROWS
    for pattern in patterns
}
def get_company_from_model(model_name: str) -> dict:
    """
    Resolve a model name to its company info dict ({"path": ..., "name": ...}).

    A list is also accepted, in which case only its first element is inspected.
    The lowercased name is matched against the COMPANY_LOGO_MAP patterns by
    substring, in map order; the first hit wins. Falsy input or no hit yields
    the "Unknown" placeholder entry.
    """
    unknown_company = {"path": "assets/logo-unknown.svg", "name": "Unknown"}
    if not model_name:
        return unknown_company

    candidate = model_name
    if isinstance(candidate, list):
        # Only the first listed model decides the logo.
        candidate = candidate[0] if candidate else ""
    haystack = str(candidate).lower()

    return next(
        (info for needle, info in COMPANY_LOGO_MAP.items() if needle in haystack),
        unknown_company,
    )
def get_company_logo_html(model_name: str) -> str:
    """
    Build the <img> tag showing the company logo for a model name.

    Returns an empty string when the logo file could not be turned into a
    data URI.
    """
    company = get_company_from_model(model_name)
    logo_uri = get_svg_as_data_uri(company["path"])
    if not logo_uri:
        return ""
    company_name = company["name"]
    return f'<img src="{logo_uri}" alt="{company_name}" title="{company_name}" style="width:20px; height:20px; vertical-align: middle;">'
# Openness -> icon path (openness only; no tooling distinction).
# The table no longer renders these icons; the map is kept for possible reuse.
OPENNESS_ICON_MAP = {
    aliases.CANONICAL_OPENNESS_OPEN: "assets/ellipse-pink.svg",
    aliases.CANONICAL_OPENNESS_CLOSED: "assets/ellipse-yellow.svg",
}

# Every alias resolves to the same icon as its canonical openness value.
for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
    OPENNESS_ICON_MAP.update(
        {
            openness_alias: OPENNESS_ICON_MAP[canonical_openness]
            for openness_alias in openness_aliases
        }
    )

# Openness -> icon path plus a human-readable description (canonical keys only).
OPENNESS_SVG_MAP = {
    aliases.CANONICAL_OPENNESS_OPEN: dict(
        path="assets/ellipse-pink.svg",
        description="Open source model",
    ),
    aliases.CANONICAL_OPENNESS_CLOSED: dict(
        path="assets/ellipse-yellow.svg",
        description="Closed source model",
    ),
}
def get_svg_as_data_uri(path: str) -> str:
    """
    Read an SVG file and return it as a base64-encoded data URI.

    Args:
        path: Filesystem path of the SVG file.

    Returns:
        A ``data:image/svg+xml;base64,...`` URI, or an empty string when the
        file cannot be read (missing file, directory, permission problem, ...).
        A warning is printed in that case; no exception is raised, so callers
        can simply skip the icon.
    """
    try:
        with open(path, "rb") as svg_file:
            raw_svg = svg_file.read()
    except FileNotFoundError:
        print(f"Warning: SVG file not found at {path}")
        return ""
    except OSError as exc:
        # Any other read failure should degrade to "no icon" rather than
        # crash module import or row rendering.
        print(f"Warning: could not read SVG file at {path}: {exc}")
        return ""

    encoded_svg = base64.b64encode(raw_svg).decode("utf-8")
    return f"data:image/svg+xml;base64,{encoded_svg}"
def create_svg_html(value, svg_map):
    """
    Render a bare icon <img> tag (no accompanying text) for ``value``.

    ``svg_map`` maps a value either to an icon path string or to a dict with a
    "path" key. An empty string is returned when the value is missing/NA, is
    not in the map, or the icon could not be loaded. The markup is kept
    minimal for use in gr.DataFrame cells.
    """
    if pd.isna(value) or value not in svg_map:
        return ""

    entry = svg_map[value]
    # Entries may be the legacy plain-path string or the newer dict form.
    icon_path = entry["path"] if isinstance(entry, dict) else entry

    data_uri = get_svg_as_data_uri(icon_path)
    if not data_uri:
        return ""
    return f'<img src="{data_uri}" style="width: 16px; height: 16px; vertical-align: middle;" alt="{value}" title="{value}">'
def build_openness_tooltip_content() -> str:
    """
    Build the Model Openness tooltip markup (info icon plus hover card),
    with one legend row per openness level using the SVG lock icons.
    """
    # (icon path, label, description) for each legend row, in display order.
    legend_rows = (
        ("assets/lock-open.svg", "Open", "Open source model"),
        ("assets/lock-closed.svg", "Closed", "Closed source model"),
    )
    legend_uris = [get_svg_as_data_uri(icon_path) for icon_path, _, _ in legend_rows]

    joined_items = "".join(
        f"""
        <div class="tooltip-legend-item">
            <img src="{icon_uri}" alt="{label}" style="width: 24px; height: 24px;">
            <div>
                <strong>{label}</strong>
                <span>{description}</span>
            </div>
        </div>
        """
        for icon_uri, (_, label, description) in zip(legend_uris, legend_rows)
    )

    return f"""<span class="tooltip-icon-legend">
        ⓘ
        <span class="tooltip-card">
            <h3>Model Openness</h3>
            <p class="tooltip-description">Indicates whether the language model is open source or closed source.</p>
            <div class="tooltip-items-container">{joined_items}</div>
        </span>
    </span>"""
def build_pareto_tooltip_content() -> str:
    """Return the inner HTML of the Pareto tooltip card, including the trophy icon."""
    trophy_src = get_svg_as_data_uri("assets/trophy.svg")
    return f"""
        <h3>On Pareto Frontier</h3>
        <p class="tooltip-description">The Pareto frontier represents the best balance between score and cost.</p>
        <p class="tooltip-description">Agents on the frontier either:</p>
        <ul class="tooltip-sub-list">
            <li>Offer the lowest cost for a given performance, or</li>
            <li>Deliver the best performance at a given cost.</li>
        </ul>
        <div class="tooltip-description" style="margin-top: 12px; display: flex; align-items: center;">
            <span>These agents are marked with this icon:</span>
            <span><img src="{trophy_src}" style="width: 25px; height: 25px; vertical-align: middle;"></span>
        </div>
    """
def build_descriptions_tooltip_content(table) -> str:
    """
    Return the inner HTML of the Column Descriptions tooltip card.

    Three variants exist: the "Overall" table, the five category tables, and a
    fallback used for any other table name (e.g. an individual benchmark).
    """
    category_tables = ("Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering")

    if table == "Overall":
        return """
            <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands SDK evaluated.</div>
            <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
            <div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
            <div class="tooltip-description-item"><b>Average Cost:</b> Average cost per instance across all submitted benchmarks, in USD.</div>
            <div class="tooltip-description-item"><b>Issue Resolution Score:</b> Macro-average score across Issue Resolution benchmarks.</div>
            <div class="tooltip-description-item"><b>Issue Resolution Cost:</b> Macro-average cost per instance (USD) across Issue Resolution benchmarks.</div>
            <div class="tooltip-description-item"><b>Frontend Score:</b> Macro-average score across Frontend benchmarks.</div>
            <div class="tooltip-description-item"><b>Frontend Cost:</b> Macro-average cost per instance (USD) across Frontend benchmarks.</div>
            <div class="tooltip-description-item"><b>Greenfield Score:</b> Macro-average score across Greenfield benchmarks.</div>
            <div class="tooltip-description-item"><b>Greenfield Cost:</b> Macro-average cost per instance (USD) across Greenfield benchmarks.</div>
            <div class="tooltip-description-item"><b>Testing Score:</b> Macro-average score across Testing benchmarks.</div>
            <div class="tooltip-description-item"><b>Testing Cost:</b> Macro-average cost per instance (USD) across Testing benchmarks.</div>
            <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
            <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
            <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
            <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
        """

    if table in category_tables:
        return f"""
            <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
            <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
            <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
            <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
            <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
            <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
            <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
            <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
            <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
        """

    # Anything else is treated as a single-benchmark table.
    return f"""
        <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
        <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
        <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
        <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
        <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
        <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
        <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
    """
# Table legend entries for "Openness", rendered once at import time with the
# SVG lock icons.
open_lock_uri = get_svg_as_data_uri("assets/lock-open.svg")
closed_lock_uri = get_svg_as_data_uri("assets/lock-closed.svg")
openness_html_items = [
    (
        '<div style="display: flex; align-items: center; white-space: nowrap;">'
        f'<img src="{lock_uri}" alt="{label}" style="width:16px; height:16px; margin-right: 4px;">'
        f'<span>{label}</span>'
        '</div>'
    )
    for lock_uri, label in ((open_lock_uri, "Open"), (closed_lock_uri, "Closed"))
]
openness_html = " ".join(openness_html_items)

# Tooltip cards are static, so they are built once and reused by every legend.
pareto_tooltip_content = build_pareto_tooltip_content()
openness_tooltip_content = build_openness_tooltip_content()
def create_legend_markdown(which_table: str) -> str:
    """
    Build the legend HTML shown with the main leaderboard table.

    The legend has a Pareto section, a Model Openness section, a Column
    Descriptions tooltip tailored to ``which_table``, and (only for tables
    that are neither "Overall" nor one of the five category tables) a
    Download section.
    """
    aggregate_tables = ("Overall", "Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering")

    descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
    trophy_uri = get_svg_as_data_uri("assets/trophy.svg")

    # Only benchmark-specific tables advertise the trajectories download.
    if which_table in aggregate_tables:
        download_section = ""
    else:
        download_section = """
        <div> <!-- Container for the Download section -->
            <b>Download</b>
            <div class="table-legend-item">
                <span style="font-size: 16px; margin-right: 4px;">⬇️</span>
                <span>Trajectories</span>
            </div>
        </div>
        """

    return f"""
    <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">
        <div> <!-- Container for the Pareto section -->
            <b>Pareto</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">{pareto_tooltip_content}</span>
            </span>
            <div class="table-legend-item">
                <img src="{trophy_uri}" alt="On Frontier" style="width:20px; height:20px; margin-right: 4px; flex-shrink: 0;">
                <span>On frontier</span>
            </div>
        </div>
        <div> <!-- Container for the Openness section -->
            <b>Model Openness</b>
            {openness_tooltip_content}
            <div class="table-legend-item">{openness_html}</div>
        </div>
        {download_section}
        <div><!-- Container for the Column Descriptions section -->
            <b>Column Descriptions</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">
                    <h3>Column Descriptions</h3>
                    <div class="tooltip-items-container">{descriptions_tooltip_content}</div>
                </span>
            </span>
        </div>
    </div>
    """
# --- Plot legend (Pareto marker + a sample of company logos) ---
# Only a sample of companies is listed in the legend.
sample_companies = [
    ("Anthropic", "assets/logo-anthropic.svg"),
    ("OpenAI", "assets/logo-openai.svg"),
    ("Google", "assets/logo-google.svg"),
    ("Meta", "assets/logo-meta.svg"),
    ("Mistral", "assets/logo-mistral.svg"),
]


def _company_legend_item(company_name: str, logo_path: str) -> str:
    """Return one plot-legend entry, or "" when the logo could not be loaded."""
    logo_uri = get_svg_as_data_uri(logo_path)
    if not logo_uri:
        return ""
    return (
        f'<div class="plot-legend-item">'
        f'<img class="plot-legend-item-svg" src="{logo_uri}" alt="{company_name}" title="{company_name}" style="width: 20px; height: 20px;">'
        f'<span>{company_name}</span>'
        f'</div>'
    )


# Built with a helper + comprehension so generic loop names (name/path/uri)
# are not left behind as module globals. Companies without a logo are skipped.
company_legend_items = [
    legend_item
    for legend_item in (
        _company_legend_item(company_name, logo_path)
        for company_name, logo_path in sample_companies
    )
    if legend_item
]

plot_legend_html = f"""
    <div class="plot-legend-container">
        <div id="plot-legend-logo">
            <img src="{get_svg_as_data_uri("assets/logo.svg")}">
        </div>
        <div style="margin-bottom: 16px;">
            <span class="plot-legend-category-heading">Pareto</span>
            <div style="margin-top: 8px;">
                <div class="plot-legend-item">
                    <img id="plot-legend-item-pareto-svg" class="plot-legend-item-svg" src="{get_svg_as_data_uri("assets/pareto.svg")}">
                    <span>On frontier</span>
                </div>
            </div>
        </div>
        <div>
            <span class="plot-legend-category-heading">Company Logos</span>
            <div style="margin-top: 8px;">
                {''.join(company_legend_items)}
            </div>
        </div>
    </div>
"""
# --- Global state for viewers ---
# Per-split cache of loaded viewers and their pretty tag maps. Entries stay
# until clear_viewer_cache() empties both dicts.
CACHED_VIEWERS = {}
CACHED_TAG_MAPS = {}
# Guards reads/writes of the two caches and of _data_version.
_cache_lock = threading.Lock()
# Incremented each time the caches are cleared after a data refresh.
_data_version = 0
def get_data_version():
    """Return the current value of the module-level data version counter."""
    # Read-only access, so no ``global`` declaration is required.
    return _data_version
def clear_viewer_cache():
    """
    Drop every cached viewer and tag map and bump the data version.

    Registered below as the data-refresh callback, so that the next lookup
    reloads from the refreshed data.
    """
    global _data_version
    with _cache_lock:
        for cache in (CACHED_VIEWERS, CACHED_TAG_MAPS):
            cache.clear()
        _data_version = _data_version + 1
        print(f"[CACHE] Viewer cache cleared after data refresh (version: {_data_version})")
# Register the cache-clear callback with the data refresh system.
# Only the import is guarded: setup_data may not be available during import,
# but an error raised by the registration call itself must not be hidden.
try:
    from setup_data import register_refresh_callback
except ImportError:
    print("[CACHE] setup_data is not importable; viewer cache will not be cleared automatically on data refresh")
else:
    register_refresh_callback(clear_viewer_cache)
class DummyViewer:
    """Stand-in viewer that is cached when loading a split fails.

    It exposes a ``_load()`` method so it can be used in place of the real
    leaderboard viewer.
    """

    def __init__(self, error_df):
        # DataFrame describing the load error; handed back verbatim by _load().
        self._error_df = error_df

    def _load(self):
        """Return ``(error_df, {})``: the stored error frame and an empty tag map."""
        empty_tag_map = {}
        return self._error_df, empty_tag_map
def get_leaderboard_viewer_instance(split: str):
    """
    Return ``(viewer, pretty_tag_map)`` for ``split``, served from the cache
    when possible.

    On a cache miss the viewer is built from the extracted data directory
    (or "mock_results" when that directory does not exist) and cached. If
    anything fails, a DummyViewer carrying the error message is cached and
    returned instead, so the failing load is not retried until the cache is
    cleared.
    """
    def remember(viewer_obj, tags):
        # Store the pair under the lock and hand it straight back.
        with _cache_lock:
            CACHED_VIEWERS[split] = viewer_obj
            CACHED_TAG_MAPS[split] = tags
        return viewer_obj, tags

    with _cache_lock:
        if split in CACHED_VIEWERS:
            return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})

    # Cache miss: load outside the lock so other splits are not blocked.
    try:
        if os.path.exists(EXTRACTED_DATA_DIR):
            data_dir = EXTRACTED_DATA_DIR
        else:
            data_dir = "mock_results"
        print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")

        loaded_viewer = SimpleLeaderboardViewer(
            data_dir=data_dir,
            config=CONFIG_NAME,
            split=split,
        )
        pretty_tags = create_pretty_tag_map(loaded_viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
        return remember(loaded_viewer, pretty_tags)
    except Exception as e:
        # Any failure is reported once and replaced by a stable placeholder.
        error_message = f"Error loading data for split '{split}': {e}"
        print(format_error(error_message))
        placeholder = DummyViewer(pd.DataFrame({"Message": [error_message]}))
        return remember(placeholder, {"Overall": []})
def _prepare_leaderboard_df_for_display(df_view):
    """Return a copy of ``df_view`` with all table formatting applied.

    Cost and score columns are formatted, the 'Language Model' cell gets its
    icons (Pareto trophy, openness lock, company logo), 'Source' is appended
    to 'SDK Version', and helper columns are dropped.
    """
    df_display = df_view.copy()

    # Ids of the agents on the Pareto frontier (empty when unavailable).
    pareto_df = get_pareto_df(df_display)
    if not pareto_df.empty and 'id' in pareto_df.columns:
        pareto_agent_names = pareto_df['id'].tolist()
    else:
        pareto_agent_names = []

    # Load each icon once per table rather than once per row.
    trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
    open_lock_uri = get_svg_as_data_uri("assets/lock-open.svg")
    closed_lock_uri = get_svg_as_data_uri("assets/lock-closed.svg")
    open_values = (aliases.CANONICAL_OPENNESS_OPEN, 'Open', 'Open Source', 'Open Source + Open Weights')

    for col in df_display.columns:
        if "Cost" in col:
            df_display = format_cost_column(df_display, col)
    for col in df_display.columns:
        if "Score" in col:
            df_display = format_score_column(df_display, col)

    # Clean the Language Model column before decorating it.
    df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)

    def format_language_model_with_icons(row):
        icons_html = ''
        if row['id'] in pareto_agent_names:
            icons_html += f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:18px; height:18px;">'

        # Anything not recognised as open is shown with the closed lock.
        if row.get('Openness', '') in open_values:
            icons_html += f'<img src="{open_lock_uri}" alt="Open" title="Open source model" style="width:16px; height:16px;">'
        else:
            icons_html += f'<img src="{closed_lock_uri}" alt="Closed" title="Closed source model" style="width:16px; height:16px;">'

        model_name = row['Language Model']
        company_html = get_company_logo_html(model_name)
        if company_html:
            icons_html += company_html

        if isinstance(model_name, list):
            if len(model_name) > 1:
                # Extra models go into the tooltip; the cell shows the first one.
                tooltip_text = "\\n".join(map(str, model_name))
                model_text = f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{model_name[0]} (+ {len(model_name) - 1}) ⓘ</span>'
            elif len(model_name) == 1:
                model_text = model_name[0]
            else:
                model_text = str(model_name)
        else:
            model_text = str(model_name)

        # Flex container keeps the icons on one line with the model name.
        return f'<div style="display:flex; align-items:center; gap:4px; flex-wrap:nowrap;">{icons_html}<span>{model_text}</span></div>'

    df_display['Language Model'] = df_display.apply(format_language_model_with_icons, axis=1)

    if 'Source' in df_display.columns:
        df_display['SDK Version'] = df_display.apply(
            lambda row: f"{row['SDK Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['SDK Version'],
            axis=1
        )

    columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
    return df_display.drop(columns=columns_to_drop, errors='ignore')


def create_leaderboard_display(
    full_df: pd.DataFrame,
    tag_map: dict,
    category_name: str,
    split_name: str
):
    """
    Render the scatter plot, table and legend for one category (e.g. "Overall").

    The data is pre-loaded by the caller; this function only filters and
    formats it. By default only entries with 'Categories Attempted' == '5/5'
    are shown, with a checkbox to include the rest; when no such entries
    exist, everything is shown and no checkbox is created.

    A gr.Timer ticks every 60 seconds. When the module data version has moved
    past the version the current views were built from, the split is reloaded
    via get_full_leaderboard_data(split_name), the views are rebuilt once, and
    both the timer and the checkbox handler serve the rebuilt views from then
    on. An empty reload keeps the previous views and is retried on the next
    tick.

    Must be called inside a Gradio Blocks context.

    Returns:
        (plot_component, dataframe_component)
    """
    if category_name == "Overall":
        primary_score_col = "Average Score"
        primary_cost_col = "Average Cost"
    else:
        primary_score_col = f"{category_name} Score"
        primary_cost_col = f"{category_name} Cost"

    def create_scatter_plot(df_data):
        return _plot_scatter_plotly(
            data=df_data,
            x=primary_cost_col if primary_cost_col in df_data.columns else None,
            y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
            agent_col="SDK Version",
            name=category_name
        )

    def build_views(source_df, source_tag_map):
        """Build the complete/all tables and plots for this category."""
        view_full, _ = DataTransformer(source_df, source_tag_map).view(tag=category_name, use_plotly=True)
        # 'Categories Attempted' is formatted as "X/5"; complete means 5/5.
        if 'Categories Attempted' in view_full.columns:
            view_complete = view_full[view_full['Categories Attempted'] == '5/5'].copy()
        else:
            view_complete = view_full.copy()
        display_complete = _prepare_leaderboard_df_for_display(view_complete)
        display_all = _prepare_leaderboard_df_for_display(view_full)
        plot_complete = create_scatter_plot(view_complete) if len(display_complete) > 0 else go.Figure()
        plot_all = create_scatter_plot(view_full)
        return {
            "display_complete": display_complete,
            "display_all": display_all,
            "plot_complete": plot_complete,
            "plot_all": plot_all,
        }

    # Views currently served to the UI plus the data version they were built
    # from. "views" is replaced as a whole so a reader never sees a table from
    # one refresh paired with a plot from another.
    state = {"version": get_data_version()}
    state["views"] = build_views(full_df, tag_map)

    initial_views = state["views"]
    df_display_complete = initial_views["display_complete"]
    df_display_all = initial_views["display_all"]
    has_complete_entries = len(df_display_complete) > 0

    # Headers come from the all-entries table so they exist even when the
    # complete table is empty.
    df_headers = df_display_all.columns.tolist()
    df_datatypes = []
    for col in df_headers:
        if col == "Logs" or "Cost" in col or "Score" in col:
            df_datatypes.append("markdown")
        elif col in ["SDK Version", "Language Model"]:
            df_datatypes.append("html")
        else:
            df_datatypes.append("str")

    # Column widths: fixed leading columns, one width per remaining
    # score/cost column, then fixed trailing columns.
    fixed_start_widths = [280, 100, 100]  # Language Model (with icons), SDK Version, Average Score
    remaining_headers = df_headers[len(fixed_start_widths):]
    num_score_cost_cols = sum(1 for col in remaining_headers if "Score" in col or "Cost" in col)
    dynamic_widths = [90] * num_score_cost_cols
    fixed_end_widths = [90, 100, 50]  # Categories Attempted, Date, Logs
    final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths

    num_complete = len(df_display_complete)
    num_total = len(df_display_all)
    num_incomplete = num_total - num_complete

    # Toggle for incomplete entries sits above the plot.
    if has_complete_entries:
        show_incomplete_checkbox = gr.Checkbox(
            label=f"Show incomplete entries ({num_incomplete} entries with fewer than 5 categories)",
            value=False,
            elem_id="show-incomplete-toggle"
        )
    else:
        show_incomplete_checkbox = None
        gr.Markdown(f"*No entries with all 5 categories completed yet. Showing all {num_total} entries.*")

    plot_component = gr.Plot(
        value=initial_views["plot_complete"] if has_complete_entries else initial_views["plot_all"],
        show_label=False,
    )
    gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")

    # Table and legend live in an accordion.
    with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
        dataframe_component = gr.DataFrame(
            headers=df_headers,
            value=df_display_complete if has_complete_entries else df_display_all,
            datatype=df_datatypes,
            interactive=False,
            wrap=True,
            column_widths=final_column_widths,
            elem_classes=["wrap-header-df"],
            show_search="search",
            elem_id="main-leaderboard"
        )
        legend_markdown = create_legend_markdown(category_name)
        gr.HTML(value=legend_markdown, elem_id="legend-markdown")

    def select_outputs(show_incomplete):
        """Pick (table, plot) from the current views for the checkbox state."""
        views = state["views"]
        if show_incomplete:
            return views["display_all"], views["plot_all"]
        return views["display_complete"], views["plot_complete"]

    def refresh_views_if_stale():
        """Rebuild the views once when the data version has advanced."""
        current_version = get_data_version()
        if current_version <= state["version"]:
            return
        print(f"[REFRESH] Data version changed from {state['version']} to {current_version}, reloading...")
        new_df, new_tag_map = get_full_leaderboard_data(split_name)
        if new_df.empty:
            # Keep the old views and leave the version untouched so the next
            # tick tries again.
            return
        state["views"] = build_views(new_df, new_tag_map)
        state["version"] = current_version

    # Checks for refreshed data every 60 seconds.
    refresh_timer = gr.Timer(value=60)

    if show_incomplete_checkbox is not None:
        def update_display(show_incomplete):
            return select_outputs(show_incomplete)

        def check_and_refresh_data(current_checkbox_state):
            """Reload if the data changed, then return what the checkbox asks for."""
            refresh_views_if_stale()
            return select_outputs(current_checkbox_state)

        show_incomplete_checkbox.change(
            fn=update_display,
            inputs=[show_incomplete_checkbox],
            outputs=[dataframe_component, plot_component]
        )
        refresh_timer.tick(
            fn=check_and_refresh_data,
            inputs=[show_incomplete_checkbox],
            outputs=[dataframe_component, plot_component]
        )
    else:
        # Without a checkbox the display always shows all entries.
        def check_and_refresh_all():
            refresh_views_if_stale()
            return select_outputs(True)

        refresh_timer.tick(
            fn=check_and_refresh_all,
            inputs=[],
            outputs=[dataframe_component, plot_component]
        )

    # Returned so callers can reference the components elsewhere.
    return plot_component, dataframe_component
| # # --- Detailed Benchmark Display --- | |
def create_benchmark_details_display(
    full_df: pd.DataFrame,
    tag_map: dict,
    category_name: str,
    validation: bool = False,
):
    """
    Generates a detailed breakdown for each benchmark within a given category.
    For each benchmark, it creates a title, a scatter plot, and a filtered table
    (the table and its legend are placed inside an accordion).

    The Gradio components are created for their side effect on the enclosing
    layout; nothing is returned.

    Args:
        full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
        tag_map (dict): The "pretty" tag map to find the list of benchmarks.
        category_name (str): The main category to display details for (e.g., "Literature Understanding").
        validation (bool): Forwarded unchanged to ``create_gradio_anchor_id`` and
            ``get_benchmark_description``.
    """
    # Local stdlib import: only needed to escape the tooltip attribute below.
    from html import escape

    # 1. Get the list of benchmarks for the selected category
    benchmark_names = tag_map.get(category_name, [])
    if not benchmark_names:
        gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
        return

    gr.HTML(f'<h2 class="benchmark-main-subtitle">{category_name} Detailed Benchmark Results</h2>')
    gr.Markdown("---")

    # Each icon is converted to a data URI at most once per call instead of once
    # per table row; loading stays lazy so an icon that is never needed is never read.
    icon_uri_cache = {}

    def icon_uri(asset_path):
        if asset_path not in icon_uri_cache:
            icon_uri_cache[asset_path] = get_svg_as_data_uri(asset_path)
        return icon_uri_cache[asset_path]

    # Openness labels that get the "open" lock icon; anything else is shown as closed.
    open_openness_values = [aliases.CANONICAL_OPENNESS_OPEN, 'Open', 'Open Source', 'Open Source + Open Weights']

    # 2. Loop through each benchmark and create its UI components
    for benchmark_name in benchmark_names:
        anchor_id = create_gradio_anchor_id(benchmark_name, validation)
        gr.HTML(
            "\n"
            f'<h3 class="benchmark-title" id="{anchor_id}">{benchmark_name} Leaderboard <a href="#{anchor_id}" class="header-link-icon">🔗</a></h3>\n'
            f'<div class="benchmark-description">{get_benchmark_description(benchmark_name, validation)}</div>\n'
            f'<button onclick="scroll_to_element(\'page-content-wrapper\')" class="primary-link-button">Return to the aggregate {category_name} leaderboard</button>\n'
        )

        # 3. Prepare the data for this specific benchmark's table and plot
        benchmark_score_col = f"{benchmark_name} Score"
        benchmark_cost_col = f"{benchmark_name} Cost"
        benchmark_download_col = f"{benchmark_name} Download"

        # Define the columns needed for the detailed table
        table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs', benchmark_download_col, 'id', 'Language Model']
        # Filter to only columns that actually exist in the full dataframe
        existing_table_cols = [col for col in table_cols if col in full_df.columns]
        if benchmark_score_col not in existing_table_cols:
            gr.Markdown(f"Score data for {benchmark_name} not available.")
            continue  # Skip to the next benchmark if score is missing

        # Create a specific DataFrame for the table view
        benchmark_table_df = full_df[existing_table_cols].copy()
        pareto_df = get_pareto_df(benchmark_table_df)

        # Ids of the entries on the frontier, as a set for O(1) membership tests per row.
        if not pareto_df.empty and 'id' in pareto_df.columns:
            pareto_agent_ids = set(pareto_df['id'].tolist())
        else:
            pareto_agent_ids = set()

        # Clean the Language Model column first
        benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)

        # Combine icons with Language Model column
        def format_language_model_with_icons(row):
            icons_html = ''
            # Add Pareto trophy if on frontier ('id' is optional, hence .get)
            if row.get('id') in pareto_agent_ids:
                icons_html += f'<img src="{icon_uri("assets/trophy.svg")}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:18px; height:18px;">'

            # Add openness lock icon
            openness_val = row.get('Openness', '')
            if openness_val in open_openness_values:
                lock_uri = icon_uri("assets/lock-open.svg")
                icons_html += f'<img src="{lock_uri}" alt="Open" title="Open source model" style="width:16px; height:16px;">'
            else:
                lock_uri = icon_uri("assets/lock-closed.svg")
                icons_html += f'<img src="{lock_uri}" alt="Closed" title="Closed source model" style="width:16px; height:16px;">'

            # Add company logo
            company_html = get_company_logo_html(row['Language Model'])
            if company_html:
                icons_html += company_html

            # Format the model name
            model_name = row['Language Model']
            if isinstance(model_name, list):
                if len(model_name) > 1:
                    # Escaped so a quote or ampersand in a model name cannot
                    # terminate or corrupt the data-tooltip attribute.
                    tooltip_text = escape("\\n".join(map(str, model_name)), quote=True)
                    model_text = f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{model_name[0]} (+ {len(model_name) - 1}) ⓘ</span>'
                elif len(model_name) == 1:
                    model_text = model_name[0]
                else:
                    model_text = str(model_name)
            else:
                model_text = str(model_name)

            # Wrap in a flex container to keep icons horizontal
            return f'<div style="display:flex; align-items:center; gap:4px; flex-wrap:nowrap;">{icons_html}<span>{model_text}</span></div>'

        benchmark_table_df['Language Model'] = benchmark_table_df.apply(format_language_model_with_icons, axis=1)

        # Append the repro url to the end of the SDK Version (both columns are optional)
        if 'Source' in benchmark_table_df.columns and 'SDK Version' in benchmark_table_df.columns:
            benchmark_table_df['SDK Version'] = benchmark_table_df.apply(
                lambda row: f"{row['SDK Version']} {row['Source']}" if row['Source'] else row['SDK Version'],
                axis=1
            )

        # Calculate and add the "Attempted Benchmark" column
        def check_benchmark_status(row):
            has_score = pd.notna(row.get(benchmark_score_col))
            has_cost = pd.notna(row.get(benchmark_cost_col))
            if has_score and has_cost:
                return "✅"
            if has_score or has_cost:
                return "⚠️"
            return "🚫 "

        benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)

        # Sort by score, best first. The score column is guaranteed to exist here
        # because benchmarks without it were skipped above.
        benchmark_table_df = benchmark_table_df.sort_values(
            by=benchmark_score_col, ascending=False, na_position='last'
        )

        # Format the cost and score columns
        benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
        benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)

        # Format download column as clickable icon
        if benchmark_download_col in benchmark_table_df.columns:
            def format_download_link(url):
                if pd.isna(url) or url == "":
                    return ""
                return f"[⬇️]({url})"

            benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)

        desired_cols_in_order = [
            'Language Model',
            'SDK Version',
            'Attempted Benchmark',
            benchmark_score_col,
            benchmark_cost_col,
            'Date',
            'Logs',
            benchmark_download_col
        ]
        for col in desired_cols_in_order:
            if col not in benchmark_table_df.columns:
                benchmark_table_df[col] = pd.NA  # Add as an empty column

        # Select the display columns and rename them for a cleaner table display
        benchmark_table_df = benchmark_table_df[desired_cols_in_order].rename(columns={
            benchmark_score_col: 'Score',
            benchmark_cost_col: 'Cost',
            benchmark_download_col: '⬇️',  # Empty-ish header with icon hint
        })

        # Headers come from the renamed dataframe; pick a render type per column
        df_headers = benchmark_table_df.columns.tolist()
        df_datatypes = []
        for col in df_headers:
            if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col:
                df_datatypes.append("markdown")
            elif col in ["SDK Version", "Language Model"]:
                df_datatypes.append("html")
            else:
                df_datatypes.append("str")

        # The plot is built from the unformatted full dataframe, not the display table
        benchmark_plot = _plot_scatter_plotly(
            data=full_df,
            x=benchmark_cost_col,
            y=benchmark_score_col,
            agent_col="SDK Version",
            name=benchmark_name
        )
        gr.Plot(value=benchmark_plot, show_label=False)
        gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")

        # Put table and key into an accordion
        with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
            gr.DataFrame(
                headers=df_headers,
                value=benchmark_table_df,
                datatype=df_datatypes,
                interactive=False,
                wrap=True,
                column_widths=[200, 80, 40, 80, 80, 150, 40, 40],  # Language Model, SDK Version, Attempted, Score, Cost, Date, Logs, Download
                show_search="search",
                elem_classes=["wrap-header-df"]
            )
            legend_markdown = create_legend_markdown(benchmark_name)
            gr.HTML(value=legend_markdown, elem_id="legend-markdown")
def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
    """
    Load the dataset for ``split`` and return it in display-ready form.

    The viewer obtained from ``get_leaderboard_viewer_instance`` is loaded,
    run through ``transform_raw_dataframe``, and its "Logs" and "Source"
    columns (when present) are turned into 🔗 hyperlinks.

    Returns:
        A ``(pretty_df, pretty_tag_map)`` tuple, or ``(empty DataFrame, {})``
        when the viewer is of an unexpected type or yields no rows.
    """
    viewer, raw_tags = get_leaderboard_viewer_instance(split)

    # Anything other than a known viewer type is treated as "no data".
    if not isinstance(viewer, (SimpleLeaderboardViewer, DummyViewer)):
        return pd.DataFrame(), {}

    loaded_df, _ = viewer._load()
    if loaded_df.empty:
        return pd.DataFrame(), {}

    pretty_df = transform_raw_dataframe(loaded_df)
    pretty_tag_map = create_pretty_tag_map(raw_tags, INFORMAL_TO_FORMAL_NAME_MAP)

    def _is_blank(cell):
        # NaN/None and the empty string both render as an empty cell.
        return pd.isna(cell) or cell == ""

    def _logs_cell(raw_uri):
        if _is_blank(raw_uri):
            return ""
        web_url = hf_uri_to_web_url(str(raw_uri))
        return hyperlink(web_url, "🔗") if web_url else ""

    def _source_cell(raw_url):
        # The source value is linked as-is, without any URI conversion.
        return "" if _is_blank(raw_url) else hyperlink(str(raw_url), "🔗")

    for column, render_cell in (("Logs", _logs_cell), ("Source", _source_cell)):
        if column in pretty_df.columns:
            pretty_df[column] = pretty_df[column].apply(render_cell)

    return pretty_df, pretty_tag_map
def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML:
    """
    Render the per-category sub-navigation bar as one raw HTML component.

    One button is emitted per benchmark listed under ``category_name`` in
    ``tag_map``; each button's ``onclick`` calls the page's
    ``scroll_to_element`` JS function with that benchmark's anchor id.
    An empty ``gr.HTML`` is returned when the category has no benchmarks.
    """
    names_in_category = tag_map.get(category_name, [])
    if not names_in_category:
        # Nothing to link to: hand back an empty component.
        return gr.HTML()

    def _render_button(label):
        anchor = create_gradio_anchor_id(label, validation)
        # Double quotes delimit the attribute, single quotes the JS string argument.
        return (
            "\n"
            "<button\n"
            'class="primary-link-button"\n'
            f'onclick="scroll_to_element(\'{anchor}\')"\n'
            ">\n"
            f"{label}\n"
            "</button>\n"
        )

    buttons_markup = ' | '.join(_render_button(label) for label in names_in_category)

    # A single container div holds the label and every button.
    nav_markup = (
        "\n"
        '<div class="sub-nav-bar-container">\n'
        '<span class="sub-nav-label">Benchmarks in this category:</span>\n'
        f"{buttons_markup}\n"
        "</div>\n"
    )
    return gr.HTML(nav_markup)
def format_llm_base_with_html(value):
    """
    Formats the 'Models Used' cell value.

    - A list with more than one element becomes an HTML <span> that shows the
      first element plus a "(+ N)" count, with the full newline-separated list
      in its ``data-tooltip`` attribute.
    - A single-element list yields just that element, unchanged.
    - Anything else (a non-list, or an empty list) is returned as-is.
    """
    # Local stdlib import: only needed for the attribute escaping below.
    from html import escape

    if not isinstance(value, list) or not value:
        return value
    if len(value) == 1:
        return value[0]

    # Escape the joined names so a quote, ampersand or angle bracket in a model
    # name cannot terminate or corrupt the double-quoted data-tooltip attribute.
    tooltip_text = escape("\n".join(map(str, value)), quote=True)
    return f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{value[0]} (+ {len(value) - 1}) ⓘ</span>'