Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import json | |
| from src.about import ( | |
| REPRODUCIBILITY_TEXT, | |
| INTRODUCTION_TEXT, | |
| ABOUT_TEXT, | |
| TITLE, | |
| ) | |
| from src.display.css_html_js import custom_css, custom_js | |
| from src.display.formatting import make_clickable_field | |
def _select_metric_value(metrics, metric_key):
    """Return the raw score to display for one task of one model.

    The metric named in tasks.json is preferred, so the displayed value is the
    same one the agentic null-check in ``build_leaderboard`` inspects. When
    that key is absent, fall back to the first entry that is not the
    ``log_url`` link. An empty dict yields ``None`` rather than raising
    ``StopIteration``.
    """
    if metric_key is not None and metric_key in metrics:
        return metrics[metric_key]
    return next((v for k, v in metrics.items() if k != 'log_url'), None)


def build_leaderboard(type):
    """Build the Gradio leaderboard table for one category of tasks.

    Reads ``data/results.json`` and ``data/tasks.json``, keeps the tasks whose
    ``type`` field equals *type*, and emits one row per model with one column
    per task, labelled with the task's ``display_name``. Scores are multiplied
    by 100 and rounded to two decimals; when a result carries a ``log_url``
    the score is wrapped with ``make_clickable_field``. Missing scores are
    shown as ``"--"``.

    Args:
        type: Task category to show. For ``"agentic"``, models whose agentic
            metrics are all null are left out and an ``Agent`` column is
            inserted right after ``Model``. (The name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        A ``gr.components.Dataframe`` whose columns are all rendered as HTML.

    Raises:
        OSError: If either JSON file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
        KeyError: If an entry lacks a field that is read unconditionally
            (``type``, ``display_name``, ``results``, ``config``, ...).
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open('data/results.json', 'r', encoding='utf-8') as f:
        results = json.load(f)
    with open('data/tasks.json', 'r', encoding='utf-8') as f:
        tasks = json.load(f)

    # Filter tasks based on type
    filtered_tasks = {k: v for k, v in tasks.items() if v['type'] == type}

    data = []
    for model_data in results.values():
        model_results = model_data['results']

        # For agentic type, skip models that have all null values for agentic tasks
        if type == "agentic":
            has_agentic_results = any(
                model_results.get(task, {}).get(spec['metric']) is not None
                for task, spec in filtered_tasks.items()
            )
            if not has_agentic_results:
                continue

        model_sha = model_data["config"]["model_sha"]
        model_name = model_data["config"]["model_name"]
        row = {'Model': make_clickable_field(model_name, model_sha)}

        for dataset, metrics in model_results.items():
            # Only include metrics for tasks of the specified type
            task = filtered_tasks.get(dataset)
            if task is None:
                continue

            value = _select_metric_value(metrics, task.get('metric'))
            log_url = metrics.get('log_url')

            # Convert to a percentage with 2 decimals; link to the log if there is one
            if value is not None:
                value = round(value * 100, 2)
                if log_url:
                    value = make_clickable_field(value, log_url)

            # Use display name from tasks.json instead of raw dataset name
            row[task['display_name']] = value
        data.append(row)

    # With no rows, pd.DataFrame([]) has no columns at all and the insert at
    # position 1 below would raise IndexError; keep at least the Model column.
    results_df = pd.DataFrame(data) if data else pd.DataFrame(columns=['Model'])

    # Round all numeric columns to 2 decimal places
    numeric_cols = results_df.select_dtypes(include=['float64', 'float32']).columns
    if len(numeric_cols):
        results_df[numeric_cols] = results_df[numeric_cols].round(2)

    # Fill null values with "--"
    results_df = results_df.fillna("--")

    if type == "agentic":
        # Include agent column as second column after Model
        results_df.insert(1, 'Agent', make_clickable_field('Basic Agent', 'https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent'))

    return gr.components.Dataframe(
        value=results_df,
        datatype=["html" for _ in results_df.columns],
        column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
        wrap=False,
    )
# Logo images for the page header. The header <img> tags reference them via
# /gradio_api/file=..., and the same paths are handed to launch() below.
black_logo_path = "src/assets/logo-icon-black.png"
white_logo_path = "src/assets/logo-icon-white.png"
assets = [black_logo_path, white_logo_path]

_pink_theme = gr.themes.Default(primary_hue=gr.themes.colors.pink)

demo = gr.Blocks(
    theme=_pink_theme,
    css=custom_css,
    js=custom_js,
    fill_width=True,
    fill_height=True,
)

# Header markup: both logo variants on the left, title and tagline in the
# centre, and an empty container on the right.
_header_html = f"""
<div id="page-header">
<div id="header-container">
<div id="left-container">
<img id="black-logo" src="/gradio_api/file={black_logo_path}">
<img id="white-logo" src="/gradio_api/file={white_logo_path}">
</div>
<div id="centre-container">
<h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
<p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results & Traces</p>
</div>
<div id="right-container">
</div>
</div>
</div>
"""

_TAB_CLASS = "llm-benchmark-tab-table"

# One (label, renderer) pair per tab, in display order; the position in this
# list is used as the tab id. Each renderer is called inside its TabItem so
# the component it creates is attached to that tab.
_tab_specs = [
    ("Base Benchmarks", lambda: build_leaderboard("base")),
    ("Agentic Benchmarks", lambda: build_leaderboard("agentic")),
    ("About", lambda: gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)),
    ("Reproducibility", lambda: gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)),
]

with demo:
    gr.HTML(_header_html)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)

    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
        for _tab_id, (_tab_label, _render_tab) in enumerate(_tab_specs):
            with gr.TabItem(_tab_label, elem_classes=_TAB_CLASS, id=_tab_id):
                _render_tab()

demo.launch(allowed_paths=assets)