import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import os
import re
import base64

from agenteval.leaderboard.view import LeaderboardViewer
from huggingface_hub import HfApi

import aliases
from leaderboard_transformer import (
    DataTransformer,
    transform_raw_dataframe,
    create_pretty_tag_map,
    INFORMAL_TO_FORMAL_NAME_MAP,
    _plot_scatter_plotly,
    format_cost_column,
    format_score_column,
    get_pareto_df,
    clean_llm_base_list,
)
from config import (
    CONFIG_NAME,
    EXTRACTED_DATA_DIR,
    IS_INTERNAL,
    RESULTS_DATASET,
)
from content import (
    scatter_disclaimer_html,
    format_error,
    format_log,
    format_warning,
    hf_uri_to_web_url,
    hyperlink,
)

api = HfApi()

MAX_UPLOAD_BYTES = 100 * 1024**2
AGENTEVAL_MANIFEST_NAME = "agenteval.json"

os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)

# Global variables
COMBINED_ICON_MAP = {
    aliases.CANONICAL_OPENNESS_OPEN_OPEN_WEIGHTS: {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-ow-standard.svg",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-ow-equivalent.svg",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-ow-custom.svg",
    },
    aliases.CANONICAL_OPENNESS_OPEN_CLOSED_WEIGHTS: {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-standard.svg",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-equivalent.svg",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-custom.svg",
    },
    aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/api-standard.svg",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/api-equivalent.svg",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/api-custom.svg",
    },
    aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/c-standard.svg",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/c-equivalent.svg",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/c-custom.svg",
    },
}

# It's important to expand the tool-usage aliases first, so that when we
# expand the openness aliases below, the tool-usage changes get picked up.
for openness in COMBINED_ICON_MAP:
    for canonical_tool_usage, tool_usage_aliases in aliases.TOOL_USAGE_ALIASES.items():
        for tool_usage_alias in tool_usage_aliases:
            COMBINED_ICON_MAP[openness][tool_usage_alias] = COMBINED_ICON_MAP[openness][canonical_tool_usage]

for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
    for openness_alias in openness_aliases:
        COMBINED_ICON_MAP[openness_alias] = COMBINED_ICON_MAP[canonical_openness]

OPENNESS_SVG_MAP = {
    "Open Source + Open Weights": "assets/os-ow-legend.svg",
    "Open Source": "assets/os-legend.svg",
    "API Available": "assets/api-legend.svg",
    "Closed": "assets/c-legend.svg",
}
TOOLING_SVG_MAP = {
    "Standard": "assets/standard-legend.svg",
    "Custom with Standard Search": "assets/equivalent-legend.svg",
    "Fully Custom": "assets/custom-legend.svg",
}


def get_svg_as_data_uri(path: str) -> str:
    """Reads an SVG file and returns it as a base64-encoded data URI."""
    try:
        with open(path, "rb") as svg_file:
            encoded_svg = base64.b64encode(svg_file.read()).decode("utf-8")
        return f"data:image/svg+xml;base64,{encoded_svg}"
    except FileNotFoundError:
        print(f"Warning: SVG file not found at {path}")
        return ""
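# Usage sketch (illustrative; assumes the assets/ directory ships with the
# app): the returned base64 data URI can be embedded directly as an <img>
# source, which is how the legend and table icons below avoid serving
# separate static files.
#
#   uri = get_svg_as_data_uri("assets/os-ow-standard.svg")
#   icon_html = f'<img src="{uri}" alt="Open Source + Open Weights, Standard">'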
# Create a pre-loaded version of our map. This should be run ONCE when the app starts.
PRELOADED_URI_MAP = {
    openness: {
        tooling: get_svg_as_data_uri(path)
        for tooling, path in tooling_map.items()
    }
    for openness, tooling_map in COMBINED_ICON_MAP.items()
}


def get_combined_icon_html(row, uri_map):
    """
    Looks up the correct icon URI from the pre-loaded map based on the row's
    'Openness' and 'Agent Tooling' values and returns an HTML <img> tag.
    """
    openness_val = row['Openness']
    tooling_val = row['Agent Tooling']
    uri = uri_map.get(openness_val, {}).get(tooling_val, "")

    # The tooltip will show the exact combination for clarity.
    tooltip = f"Openness: {openness_val}, Tooling: {tooling_val}"

    # Return the HTML string that Gradio will render in the DataFrame.
    # (The original tag markup was lost; this is a minimal reconstruction.)
    return f'<img src="{uri}" title="{tooltip}" alt="{tooltip}">'
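# Lookup sketch (row values are illustrative): any alias string expanded into
# PRELOADED_URI_MAP above resolves to the same pre-encoded icon, so raw
# leaderboard rows don't need to carry canonical labels.
#
#   row = {
#       "Openness": aliases.CANONICAL_OPENNESS_OPEN_OPEN_WEIGHTS,
#       "Agent Tooling": aliases.CANONICAL_TOOL_USAGE_STANDARD,
#   }
#   get_combined_icon_html(row, PRELOADED_URI_MAP)
#   # -> '<img src="data:image/svg+xml;base64,..." title="Openness: ..., Tooling: ..." ...>'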
def create_svg_html(value, svg_map):
    """
    Generates the absolute simplest HTML for an icon, without any extra text.
    This version is compatible with gr.DataFrame.
    """
    if pd.isna(value) or value not in svg_map:
        return ""

    path_info = svg_map[value]
    src = get_svg_as_data_uri(path_info)

    # Generate the HTML for the single icon, with NO text.
    # (The original tag markup was lost; this is a minimal reconstruction.)
    if src:
        return f'<img src="{src}" title="{value}" alt="{value}">'
    return ""


def build_openness_tooltip_content() -> str:
    """Generates the inner HTML for the Agent Openness tooltip card."""
    descriptions = {
        "Open Source + Open Weights": "Both code and ML models are open",
        "Open Source": "Code is open but uses an ML model with closed-weights",
        "API Available": "No access to code; API access only",
        "Closed": "No access to code or API; UI access only",
    }
    html_items = []
    for name, path in OPENNESS_SVG_MAP.items():
        uri = get_svg_as_data_uri(path)
        desc = descriptions.get(name, "")
        # Create the HTML for a single row in the tooltip legend.
        # (Original markup lost; minimal reconstruction: icon, then name and description.)
        html_items.append(f"""
<div class="tooltip-legend-row">
    <img src="{uri}" alt="{name}">
    <span><b>{name}</b>: {desc}</span>
</div>
""")
    return "".join(html_items)


def build_pareto_tooltip_content() -> str:
    """Generates the inner HTML for the Pareto tooltip card with final copy."""
    # Original markup lost; minimal reconstruction. The two bullets restate the
    # standard Pareto-optimality conditions implied by the surrounding copy.
    return """
<div class="tooltip-card">
    <b>On Pareto Frontier</b>
    <p>The Pareto frontier represents the best balance between score and cost.</p>
    <p>Agents on the frontier either:</p>
    <ul>
        <li>have the highest score for a given cost, or</li>
        <li>have the lowest cost for a given score.</li>
    </ul>
    <p>These agents are marked with this icon: 🏆</p>
</div>
"""


def build_tooling_tooltip_content() -> str:
    """Generates the inner HTML for the Agent Tooling tooltip card."""
    descriptions = {
        "Standard": "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).",
        "Custom with Standard Search": "Custom tools for accessing an equivalent underlying environment:",
        "Fully Custom": "Uses tools beyond constraints of Standard or Custom interface",
    }
    # The original sub-list items for "Custom with Standard Search" were lost;
    # left empty rather than guessed.
    custom_interface_sub_list = """ """
    html_items = []
    for name, path in TOOLING_SVG_MAP.items():
        uri = get_svg_as_data_uri(path)
        desc = descriptions.get(name, "")
        # Check if this is the special case that needs a sub-list.
        sub_list_html = custom_interface_sub_list if name == "Custom with Standard Search" else ""
        html_items.append(f"""
<div class="tooltip-legend-row">
    <img src="{uri}" alt="{name}">
    <span><b>{name}</b>: {desc} {sub_list_html}</span>
</div>
""")
    return "".join(html_items)


def build_descriptions_tooltip_content(table) -> str:
    """Generates the inner HTML for the Column Descriptions tooltip card, depending on which kind of table."""
    # The list markup below is a minimal reconstruction of the lost original.
    if table == "Overall":
        return """
<ul class="tooltip-column-list">
    <li><b>Agent:</b> Name of the evaluated agent.</li>
    <li><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</li>
    <li><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</li>
    <li><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</li>
    <li><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Based on submission-time values. Each category contributes equally.</li>
    <li><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</li>
    <li><b>Literature Understanding Cost:</b> Macro-average cost per problem (USD) across Literature Understanding benchmarks.</li>
    <li><b>Code Execution Score:</b> Macro-average score across Code & Execution benchmarks.</li>
    <li><b>Code Execution Cost:</b> Macro-average cost per problem (USD) across Code & Execution benchmarks.</li>
    <li><b>Data Analysis Score:</b> Macro-average score across Data Analysis benchmarks.</li>
    <li><b>Data Analysis Cost:</b> Macro-average cost per problem (USD) across Data Analysis benchmarks.</li>
    <li><b>End-to-End Discovery Score:</b> Macro-average score across End-to-End Discovery benchmarks.</li>
    <li><b>End-to-End Discovery Cost:</b> Macro-average cost per problem (USD) across End-to-End Discovery benchmarks.</li>
    <li><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 4).</li>
    <li><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</li>
</ul>
"""
    elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]:
        return f"""
<ul class="tooltip-column-list">
    <li><b>Agent:</b> Name of the evaluated agent.</li>
    <li><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</li>
    <li><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</li>
    <li><b>{table} Score:</b> Macro-average score across {table} benchmarks.</li>
    <li><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</li>
    <li><b>Benchmark Score:</b> Average (mean) score on the benchmark.</li>
    <li><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</li>
    <li><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</li>
    <li><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</li>
</ul>
"""
    else:
        # Fallback for any other table type, e.g., individual benchmarks.
        return f"""
<ul class="tooltip-column-list">
    <li><b>Agent:</b> Name of the evaluated agent.</li>
    <li><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</li>
    <li><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</li>
    <li><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</li>
    <li><b>{table} Score:</b> Score achieved by the agent on this benchmark.</li>
    <li><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</li>
    <li><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</li>
</ul>
"""


# Dynamically generate the correct HTML for the legend parts.
# Create HTML for the "Openness" legend items.
openness_html_items = []
for name, path in OPENNESS_SVG_MAP.items():
    uri = get_svg_as_data_uri(path)
    # Each item is its own flexbox container to guarantee alignment.
    # (Original markup lost; minimal reconstruction.)
    openness_html_items.append(
        f'<div style="display: flex; align-items: center; gap: 4px;">'
        f'<img src="{uri}" alt="{name}">'
        f'<span>{name}</span>'
        f'</div>'
    )
openness_html = " ".join(openness_html_items)

# Create HTML for the "Tooling" legend items.
tooling_html_items = []
for name, path in TOOLING_SVG_MAP.items():
    uri = get_svg_as_data_uri(path)
    tooling_html_items.append(
        f'<div style="display: flex; align-items: center; gap: 4px;">'
        f'<img src="{uri}" alt="{name}">'
        f'<span>{name}</span>'
        f'</div>'
    )
tooling_html = " ".join(tooling_html_items)

pareto_tooltip_content = build_pareto_tooltip_content()
openness_tooltip_content = build_openness_tooltip_content()
tooling_tooltip_content = build_tooling_tooltip_content()


def create_legend_markdown(which_table: str) -> str:
    """
    Generates the complete HTML for the legend section, including tooltips.
    This is used in the main leaderboard display.
    """
    descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
    # Original legend markup lost; minimal reconstruction. Class names are
    # placeholders; the actual styling lives in the app's CSS.
    legend_markdown = f"""
<div class="legend-container">
    <div class="legend-section">
        <span class="legend-title">Pareto
            <span class="tooltip-card">{pareto_tooltip_content}</span>
        </span>
        <span>🏆 On frontier</span>
    </div>
    <div class="legend-section">
        <span class="legend-title">Agent Openness
            <span class="tooltip-card">
                <b>Agent Openness</b>
                <p>Indicates how transparent and reproducible an agent is.</p>
                {openness_tooltip_content}
            </span>
        </span>
        {openness_html}
    </div>
    <div class="legend-section">
        <span class="legend-title">Agent Tooling
            <span class="tooltip-card">
                <b>Agent Tooling</b>
                <p>Describes the tool usage and execution environment of the agent during evaluation.</p>
                {tooling_tooltip_content}
            </span>
        </span>
        {tooling_html}
    </div>
    <div class="legend-section">
        <span class="legend-title">Column Descriptions
            <span class="tooltip-card">
                <b>Column Descriptions</b>
                {descriptions_tooltip_content}
            </span>
        </span>
    </div>
</div>
""" return legend_markdown # --- Global State for Viewers (simple caching) --- CACHED_VIEWERS = {} CACHED_TAG_MAPS = {} class DummyViewer: """A mock viewer to be cached on error. It has a ._load() method to ensure it behaves like the real LeaderboardViewer.""" def __init__(self, error_df): self._error_df = error_df def _load(self): # The _load method returns the error DataFrame and an empty tag map return self._error_df, {} def get_leaderboard_viewer_instance(split: str): """ Fetches the LeaderboardViewer for a split, using a cache to avoid re-downloading data. On error, returns a stable DummyViewer object. """ global CACHED_VIEWERS, CACHED_TAG_MAPS if split in CACHED_VIEWERS: # Cache hit: return the cached viewer and tag map return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []}) # --- Cache miss: try to load data from the source --- try: print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}") viewer = LeaderboardViewer( repo_id=RESULTS_DATASET, config=CONFIG_NAME, split=split, is_internal=IS_INTERNAL ) # Simplify tag map creation pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP) # Cache the results for next time CACHED_VIEWERS[split] = viewer CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly return viewer, pretty_tag_map except Exception as e: # On ANY error, create a consistent error message and cache a DummyViewer error_message = f"Error loading data for split '{split}': {e}" print(format_error(error_message)) dummy_df = pd.DataFrame({"Message": [error_message]}) dummy_viewer = DummyViewer(dummy_df) dummy_tag_map = {"Overall": []} # Cache the dummy objects so we don't try to fetch again on this run CACHED_VIEWERS[split] = dummy_viewer CACHED_TAG_MAPS[split] = dummy_tag_map return dummy_viewer, dummy_tag_map def create_leaderboard_display( full_df: pd.DataFrame, tag_map: dict, category_name: str, split_name: str ): """ This UI factory takes pre-loaded data and renders the main DataFrame and Plot for a given category (e.g., "Overall" or "Literature Understanding"). """ # 1. Instantiate the transformer and get the specific view for this category. # The function no longer loads data itself; it filters the data it receives. transformer = DataTransformer(full_df, tag_map) df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True) pareto_df = get_pareto_df(df_view) # Get the list of agents on the frontier. We'll use this list later. 
def create_leaderboard_display(
    full_df: pd.DataFrame, tag_map: dict, category_name: str, split_name: str
):
    """
    This UI factory takes pre-loaded data and renders the main DataFrame and Plot
    for a given category (e.g., "Overall" or "Literature Understanding").
    """
    # 1. Instantiate the transformer and get the specific view for this category.
    # The function no longer loads data itself; it filters the data it receives.
    transformer = DataTransformer(full_df, tag_map)
    df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)

    # 2. Get the list of agents on the Pareto frontier and mark them.
    pareto_df = get_pareto_df(df_view)
    if not pareto_df.empty and 'id' in pareto_df.columns:
        pareto_agent_names = pareto_df['id'].tolist()
    else:
        pareto_agent_names = []
    df_view['Pareto'] = df_view.apply(
        lambda row: '🏆' if row['id'] in pareto_agent_names else '', axis=1
    )

    # 3. Build the combined Openness / Tooling icon column.
    df_view['Icon'] = df_view.apply(
        lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
        axis=1,  # IMPORTANT: axis=1 tells pandas to process row-by-row.
    )

    # 4. Format cost columns and fill NaN scores.
    for col in df_view.columns:
        if "Cost" in col:
            df_view = format_cost_column(df_view, col)
    for col in df_view.columns:
        if "Score" in col:
            df_view = format_score_column(df_view, col)

    scatter_plot = plots_dict.get('scatter_plot', go.Figure())

    # 5. Prettify and format the LLM Base column.
    df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
    df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)

    # 6. Append the repro URL to the end of the agent name.
    if 'Source' in df_view.columns:
        df_view['Agent'] = df_view.apply(
            lambda row: f"{row['Agent']} {row['Source']}" if row['Source'] else row['Agent'],
            axis=1,
        )

    # 7. Move the Pareto and Icon columns to the front.
    all_cols = df_view.columns.tolist()
    all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
    all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
    df_view = df_view[all_cols]

    # 8. Drop internally used columns that are not needed in the display.
    columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
    df_view = df_view.drop(columns=columns_to_drop, errors='ignore')

    # 9. Choose a Gradio datatype per column so links and icons render.
    df_headers = df_view.columns.tolist()
    df_datatypes = []
    for col in df_headers:
        if col == "Logs" or "Cost" in col or "Score" in col:
            df_datatypes.append("markdown")
        elif col in ["Agent", "Icon", "LLM Base"]:
            df_datatypes.append("html")
        else:
            df_datatypes.append("str")

    # 10. Blank out the Pareto and Icon headers for display.
    header_rename_map = {
        "Pareto": "",
        "Icon": "",
    }
    df_view = df_view.rename(columns=header_rename_map)

    # 11. Dynamically set widths for the DataFrame columns: fixed widths for the
    # leading columns, one shared width per Score/Cost column, then fixed
    # trailing widths. Combine them into the final, fully dynamic list.
    fixed_start_widths = [40, 40, 200, 100, 200]
    num_score_cost_cols = 0
    remaining_headers = df_headers[len(fixed_start_widths):]
    for col in remaining_headers:
        if "Score" in col or "Cost" in col:
            num_score_cost_cols += 1
    dynamic_widths = [90] * num_score_cost_cols
    fixed_end_widths = [90, 50]
    final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths

    plot_component = gr.Plot(
        value=scatter_plot,
        show_label=False,
    )
    gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")

    # Put table and key into an accordion.
    with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
        dataframe_component = gr.DataFrame(
            headers=df_headers,
            value=df_view,
            datatype=df_datatypes,
            interactive=False,
            wrap=True,
            column_widths=final_column_widths,
            elem_classes=["wrap-header-df"],
            show_search="search",
        )
        legend_markdown = create_legend_markdown(category_name)
        gr.HTML(value=legend_markdown, elem_id="legend-markdown")

    # Return the components so they can be referenced elsewhere.
    return plot_component, dataframe_component
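# Minimal wiring sketch (an assumption about the surrounding app, not code from
# this module; the split name "validation" is illustrative): the factory is
# meant to be called inside a gr.Blocks/gr.Tab context, with data pre-loaded by
# get_full_leaderboard_data (defined below).
#
#   with gr.Blocks() as demo:
#       full_df, tag_map = get_full_leaderboard_data("validation")
#       with gr.Tab("Overall"):
#           plot, table = create_leaderboard_display(full_df, tag_map, "Overall", "validation")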
# --- Detailed Benchmark Display ---
def create_benchmark_details_display(
    full_df: pd.DataFrame, tag_map: dict, category_name: str
):
    """
    Generates a detailed breakdown for each benchmark within a given category.
    For each benchmark, it creates a title, a filtered table, and a scatter plot.

    Args:
        full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
        tag_map (dict): The "pretty" tag map used to find the list of benchmarks.
        category_name (str): The main category to display details for
            (e.g., "Literature Understanding").
    """
    # 1. Get the list of benchmarks for the selected category.
    benchmark_names = tag_map.get(category_name, [])
    if not benchmark_names:
        gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
        return

    gr.Markdown("---")
    gr.Markdown("## Detailed Benchmark Results")

    # 2. Loop through each benchmark and create its UI components.
    for benchmark_name in benchmark_names:
        with gr.Row(elem_classes=["benchmark-header"]):
            gr.Markdown(f"### {benchmark_name} Leaderboard", header_links=True)
            # Original button markup lost; assumed shape: a small button that
            # scrolls back to the top of the tab.
            button_str = """
            <button class="scroll-up-button"
                    onclick="window.scrollTo({top: 0, behavior: 'smooth'})">↑ Back to top</button>
            """
            gr.HTML(button_str, elem_classes="scroll-up-container")

        # 3. Prepare the data for this specific benchmark's table and plot.
        benchmark_score_col = f"{benchmark_name} Score"
        benchmark_cost_col = f"{benchmark_name} Cost"

        # Define the columns needed for the detailed table.
        table_cols = [
            'Agent', 'Source', 'Openness', 'Agent Tooling', 'Submitter', 'Date',
            benchmark_score_col, benchmark_cost_col, 'Logs', 'id', 'LLM Base',
        ]
        # Filter to only columns that actually exist in the full dataframe.
        existing_table_cols = [col for col in table_cols if col in full_df.columns]
        if benchmark_score_col not in existing_table_cols:
            gr.Markdown(f"Score data for {benchmark_name} not available.")
            continue  # Skip to the next benchmark if score is missing.

        # Create a specific DataFrame for the table view.
        benchmark_table_df = full_df[existing_table_cols].copy()

        # Get the list of agents on the Pareto frontier and mark them.
        pareto_df = get_pareto_df(benchmark_table_df)
        if not pareto_df.empty and 'id' in pareto_df.columns:
            pareto_agent_names = pareto_df['id'].tolist()
        else:
            pareto_agent_names = []
        benchmark_table_df['Pareto'] = benchmark_table_df.apply(
            lambda row: ' 🏆' if row['id'] in pareto_agent_names else '', axis=1
        )
        benchmark_table_df['Icon'] = benchmark_table_df.apply(
            lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
            axis=1,  # IMPORTANT: axis=1 tells pandas to process row-by-row.
        )

        # Prettify and format the LLM Base column.
        benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
        benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)

        # Append the repro URL to the end of the agent name.
        if 'Source' in benchmark_table_df.columns:
            benchmark_table_df['Agent'] = benchmark_table_df.apply(
                lambda row: f"{row['Agent']} {row['Source']}" if row['Source'] else row['Agent'],
                axis=1,
            )

        # Calculate and add the "Attempted Benchmark" column:
        # ✅ both score and cost reported, ⚠️ only one of the two, 🚫 neither.
        def check_benchmark_status(row):
            has_score = pd.notna(row.get(benchmark_score_col))
            has_cost = pd.notna(row.get(benchmark_cost_col))
            if has_score and has_cost:
                return "✅"
            if has_score or has_cost:
                return "⚠️"
            return "🚫"

        benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)

        # Sort the DataFrame by score, best first.
        if benchmark_score_col in benchmark_table_df.columns:
            benchmark_table_df = benchmark_table_df.sort_values(
                by=benchmark_score_col, ascending=False, na_position='last'
            )

        # 4. Format the cost and score columns.
        benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
        benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)

        # 5. Select and order the display columns, adding any that are missing.
        desired_cols_in_order = [
            'Pareto', 'Icon', 'Agent', 'Submitter', 'LLM Base',
            'Attempted Benchmark', benchmark_score_col, benchmark_cost_col, 'Logs',
        ]
        for col in desired_cols_in_order:
            if col not in benchmark_table_df.columns:
                benchmark_table_df[col] = pd.NA  # Add as an empty column.
        benchmark_table_df = benchmark_table_df[desired_cols_in_order]

        # 6. Rename columns for a cleaner table display. (Note: columns= is
        # required here; without it, rename targets the index.)
        benchmark_table_df.rename(columns={
            benchmark_score_col: 'Score',
            benchmark_cost_col: 'Cost',
        }, inplace=True)

        # 7. Choose a Gradio datatype per column so the Logs links and icons render.
        df_headers = benchmark_table_df.columns.tolist()
        df_datatypes = []
        for col in df_headers:
            if "Logs" in col or "Cost" in col or "Score" in col:
                df_datatypes.append("markdown")
            elif col in ["Agent", "Icon", "LLM Base"]:
                df_datatypes.append("html")
            else:
                df_datatypes.append("str")

        # 8. Blank out the Pareto and Icon headers for display.
        header_rename_map = {
            "Pareto": "",
            "Icon": "",
        }
        benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)

        benchmark_plot = _plot_scatter_plotly(
            data=full_df,
            x=benchmark_cost_col,
            y=benchmark_score_col,
            agent_col="Agent",
            name=benchmark_name,
        )
        gr.Plot(value=benchmark_plot, show_label=False)
        gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")

        # Put table and key into an accordion.
        with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
            gr.DataFrame(
                headers=df_headers,
                value=benchmark_table_df,
                datatype=df_datatypes,
                interactive=False,
                wrap=True,
                column_widths=[40, 40, 200, 150, 175, 85, 100, 100, 40],
                elem_classes=["wrap-header-df"],
            )
            legend_markdown = create_legend_markdown(benchmark_name)
            gr.HTML(value=legend_markdown, elem_id="legend-markdown")
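# Anchor linkage sketch: each "### <benchmark> Leaderboard" heading above is
# rendered with header_links=True, so Gradio assigns it an element id that
# create_gradio_anchor_id (defined below) reproduces, letting the sub-nav
# buttons jump straight to a benchmark's section. For example:
#
#   create_gradio_anchor_id("Paper Finder Validation", validation=False)
#   # -> "h-paper-finder-validation-leaderboard"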
def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
    """
    Loads and transforms the complete dataset for a given split.
    This function handles caching and returns the final "pretty" DataFrame and tag map.
    """
    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)

    if isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)):
        raw_df, _ = viewer_or_data._load()
        if raw_df.empty:
            return pd.DataFrame(), {}

        pretty_df = transform_raw_dataframe(raw_df)
        pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP)

        if "Logs" in pretty_df.columns:
            def format_log_entry_to_html(raw_uri):
                if pd.isna(raw_uri) or raw_uri == "":
                    return ""
                web_url = hf_uri_to_web_url(str(raw_uri))
                return hyperlink(web_url, "🔗") if web_url else ""

            # Apply the function to the "Logs" column.
            pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)

        if "Source" in pretty_df.columns:
            def format_source_url_to_html(raw_url):
                # Handle empty or NaN values, returning a blank string.
                if pd.isna(raw_url) or raw_url == "":
                    return ""
                # Assume the value is already a valid web URL and doesn't need conversion.
                return hyperlink(str(raw_url), "🔗")

            # Apply the function to the "Source" column.
            pretty_df["Source"] = pretty_df["Source"].apply(format_source_url_to_html)

        return pretty_df, pretty_tag_map

    # Fallback for unexpected types.
    return pd.DataFrame(), {}
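# Shape sketch of the returned tag map (benchmark names other than
# "Paper Finder Validation" are illustrative; the split name is too):
#
#   pretty_df, tag_map = get_full_leaderboard_data("validation")
#   # tag_map ≈ {
#   #     "Overall": [...],
#   #     "Literature Understanding": ["Paper Finder Validation", ...],
#   #     ...
#   # }
#
# Each category key maps to the benchmark display names consumed by
# create_benchmark_details_display and create_sub_navigation_bar.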
Example: "Paper Finder Validation" -> "h-paper-finder-validation" """ text = text.lower() text = re.sub(r'\s+', '-', text) # Replace spaces with hyphens text = re.sub(r'[^\w-]', '', text) # Remove non-word characters if validation: return f"h-{text}-leaderboard-1" return f"h-{text}-leaderboard" def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML: """ Builds the entire sub-navigation bar as a single, self-contained HTML component. This bypasses Gradio's layout components, giving us full control. """ benchmark_names = tag_map.get(category_name, []) if not benchmark_names: # Return an empty HTML component to prevent errors return gr.HTML() # Start building the list of HTML button elements as strings html_buttons = [] for name in benchmark_names: target_id = create_gradio_anchor_id(name, validation) # Create a standard HTML button. # The onclick attribute calls our global JS function directly. # Note the mix of double and single quotes. button_str = f""" """ html_buttons.append(button_str) # Join the button strings and wrap them in a single div container # This container will be our flexbox row. full_html = f""" """ # Return the entire navigation bar as one single Gradio HTML component return gr.HTML(full_html) def format_llm_base_with_html(value): """ Formats the 'LLM Base' cell value. If the value is a list with more than 1 element, it returns an HTML with the full list in a hover-over tooltip. If it's a single-element list, it returns just that element. Otherwise, it returns the original value. """ if isinstance(value, list): if len(value) > 1: # Join the list items with a newline character for a clean tooltip tooltip_text = "\n".join(map(str, value)) # Return an HTML span with the title attribute for the tooltip return f'{value[0]} (+ {len(value) - 1}) ⓘ' if len(value) == 1: # If only one item, just return that item return value[0] # Return the value as-is if it's not a list or is an empty list return value