Spaces:

AutoBench
/

AutoBench-Leaderboard

Running

App Files Files Community

PeterKruger committed on Apr 16

Commit

c00108e

verified ·

1 Parent(s): 3d84e78

Upload app.py

Browse files

Formatting fixes for new benchmarks and some descriptions

Files changed (1) hide show

app.py +70 -27

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import pandas as pd
 import plotly.express as px
 import os
 import json
-from typing import Dict, List
 # --- Configuration ---
 RUNS_DIR = "runs"
@@ -11,6 +11,14 @@ DATA_DIR = "."  # For backward compatibility
 COST_COLUMN_SUMMARY = 'Costs (USD)'
 NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)'
 # --- Multi-Run Support Functions ---
 def discover_available_runs() -> List[Dict]:
     """Scan runs directory and return sorted list of available runs with metadata."""
@@ -91,6 +99,47 @@ def format_correlations_text(correlations_data: Dict) -> str:
         return f"**Benchmark Correlations:** AutoBench features " + ", ".join(correlation_parts) + "."
     return ""
 def load_run_data(run_id: str) -> Dict[str, pd.DataFrame]:
     """Load all CSV data for a specific run."""
     runs = discover_available_runs()
@@ -163,27 +212,17 @@ def process_run_data(data: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
         summary_cols_display = [col for col in base_cols if col in df_summary.columns]
         df_summary_display = df_summary[summary_cols_display].copy()
-        # Benchmark display table - handle both old and new column names
-        benchmark_cols = ['Model', 'AutoBench']
-        # Handle different column name variations
-        chatbot_col = None
-        mmlu_col = None
         for col in df_summary.columns:
-            if col in ['Chatbot Ar.', 'LMArena']:
-                chatbot_col = col
-            elif col in ['MMLU Index', 'MMLU-Pro']:
-                mmlu_col = col
-        if chatbot_col:
-            benchmark_cols.append(chatbot_col)
-        if 'AAI Index' in df_summary.columns:
-            benchmark_cols.append('AAI Index')
-        if mmlu_col:
-            benchmark_cols.append(mmlu_col)
-        benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns]
         df_benchmark_display = df_summary[benchmark_cols].copy()
         # Sort by AutoBench score
@@ -354,7 +393,7 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
         # Return empty/default values for all outputs
         empty_df = pd.DataFrame()
         return (
-            empty_df, empty_df, empty_df, empty_df, empty_df, empty_df,  # DataFrames
             None, "", None, "", None, "",  # Plots and messages
             "No run selected", ""  # Info message, correlations text
         )
@@ -366,7 +405,7 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
     if not data:
         empty_df = pd.DataFrame()
         return (
-            empty_df, empty_df, empty_df, empty_df, empty_df, empty_df,
             None, "Error loading data", None, "Error loading data", None, "Error loading data",
             f"Error loading run: {selected_run_id}", ""
         )
@@ -413,11 +452,12 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
     if 'model_count' in run_metadata:
         info_msg += f" - {run_metadata['model_count']} models"
-    # Get correlation text
     correlations_text = format_correlations_text(data.get("correlations", {}))
     return (
-        overall_rank_display, benchmark_display, cost_display, avg_latency_display, p99_latency_display, domain_display,
         cost_plot, cost_msg, avg_latency_plot, avg_latency_msg, p99_latency_plot, p99_latency_msg,
         info_msg, correlations_text
     )
@@ -468,7 +508,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     # --- Tab 2: Benchmark Comparison ---
     with gr.Tab("Benchmark Comparison"):
         gr.Markdown("## Benchmark Comparison")
-        gr.Markdown("Comparison of AutoBench scores with other popular benchmarks. AutoBench features 82.51% correlation with Chatbot Arena, 83.74% with Artificial Analysis Intelligence Index, and 71.51% with MMLU. Models sorted by AutoBench score.")
         benchmark_comparison_table = gr.DataFrame(
             current_data.get("benchmark_display", pd.DataFrame()),
@@ -600,7 +642,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
         inputs=[run_selector],
         outputs=[
             overall_ranking_table,
-            benchmark_comparison_table,
             cost_breakdown_table,
             avg_latency_breakdown_table,
             p99_latency_breakdown_table,

 import plotly.express as px
 import os
 import json
+from typing import Dict, List, Optional
 # --- Configuration ---
 RUNS_DIR = "runs"
 COST_COLUMN_SUMMARY = 'Costs (USD)'
 NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)'
+# Columns in summary_data.csv that are operational metrics, not third-party reference benchmarks
+SUMMARY_NON_REFERENCE_COLUMNS = frozenset({
+    'Model', 'Model Name', 'Iterations', 'AutoBench',
+    COST_COLUMN_SUMMARY,
+    NEW_COST_COLUMN_SUMMARY,
+    'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)', 'Fail Rate %',
+})
 # --- Multi-Run Support Functions ---
 def discover_available_runs() -> List[Dict]:
     """Scan runs directory and return sorted list of available runs with metadata."""
         return f"**Benchmark Correlations:** AutoBench features " + ", ".join(correlation_parts) + "."
     return ""
+def _oxford_comma(names: List[str]) -> str:
+    """Join names into an English list that uses a serial (Oxford) comma.
+
+    Returns "" for an empty list, the lone item for a single name,
+    "A and B" for two names, and "A, B, and C" for three or more.
+    """
+    if not names:
+        return ""
+    if len(names) == 1:
+        return names[0]
+    if len(names) == 2:
+        # Two items take no comma at all.
+        return f"{names[0]} and {names[1]}"
+    # Three or more: comma-separate all but the last, then ", and <last>".
+    return ", ".join(names[:-1]) + f", and {names[-1]}"
+def format_benchmark_comparison_intro(data: Dict, run_metadata: Optional[Dict] = None) -> str:
+    """Build Benchmark Comparison tab intro from table columns, correlations, and optional metadata.
+
+    Precedence:
+      1. If run_metadata["benchmark_comparison_intro_override"] is a non-blank
+         string, it is returned (stripped) and nothing else is generated.
+      2. Otherwise an automatic block is built: a sentence naming every column
+         of data["benchmark_display"] other than "Model" and "AutoBench"
+         (or a generic sentence when there are none), followed by the text from
+         format_correlations_text(data["correlations"]) when that is non-empty.
+      3. If run_metadata["benchmark_comparison_note"] is a non-blank string, it
+         is placed before the automatic block, separated by a blank line.
+
+    Args:
+        data: Run data mapping; the "benchmark_display" and "correlations"
+            keys are read, both optional.
+        run_metadata: Optional metadata mapping for the run; None is treated
+            as an empty mapping.
+
+    Returns:
+        The intro text as a single string.
+    """
+    meta = run_metadata or {}
+    # An explicit override wins over everything generated below.
+    override = meta.get("benchmark_comparison_intro_override")
+    if isinstance(override, str) and override.strip():
+        return override.strip()
+    # NOTE(review): assumes the "benchmark_display" value exposes `.columns`
+    # (a DataFrame); confirm against the caller that builds `data`.
+    bd = data.get("benchmark_display", pd.DataFrame())
+    # Everything besides the identity and AutoBench columns is listed as a reference benchmark.
+    ref_names = [c for c in bd.columns if c not in ("Model", "AutoBench")]
+    if ref_names:
+        listed = _oxford_comma(ref_names)
+        body = (
+            f"Comparison of AutoBench scores with reference benchmarks: {listed}. "
+            "Models sorted by AutoBench score (higher is better)."
+        )
+    else:
+        # No reference columns to name: fall back to a generic sentence.
+        body = (
+            "Comparison of AutoBench scores with available reference benchmarks. "
+            "Models sorted by AutoBench score (higher is better)."
+        )
+    corr = format_correlations_text(data.get("correlations", {}))
+    # Append the correlations text only when there is some.
+    auto_block = f"{body}\n\n{corr}" if corr else body
+    note = meta.get("benchmark_comparison_note")
+    if isinstance(note, str) and note.strip():
+        # The optional note goes before the generated text.
+        return f"{note.strip()}\n\n{auto_block}"
+    return auto_block
 def load_run_data(run_id: str) -> Dict[str, pd.DataFrame]:
     """Load all CSV data for a specific run."""
     runs = discover_available_runs()
         summary_cols_display = [col for col in base_cols if col in df_summary.columns]
         df_summary_display = df_summary[summary_cols_display].copy()
+        # Benchmark display: Model, AutoBench, then every other column not in the reserved set (CSV order)
+        benchmark_cols: List[str] = []
+        if 'Model' in df_summary.columns:
+            benchmark_cols.append('Model')
+        if 'AutoBench' in df_summary.columns:
+            benchmark_cols.append('AutoBench')
         for col in df_summary.columns:
+            if col in SUMMARY_NON_REFERENCE_COLUMNS or col in ('Model', 'AutoBench'):
+                continue
+            benchmark_cols.append(col)
+        benchmark_cols = [c for c in benchmark_cols if c in df_summary.columns]
         df_benchmark_display = df_summary[benchmark_cols].copy()
         # Sort by AutoBench score
         # Return empty/default values for all outputs
         empty_df = pd.DataFrame()
         return (
+            empty_df, empty_df, "", empty_df, empty_df, empty_df, empty_df,  # DataFrames + benchmark intro
             None, "", None, "", None, "",  # Plots and messages
             "No run selected", ""  # Info message, correlations text
         )
     if not data:
         empty_df = pd.DataFrame()
         return (
+            empty_df, empty_df, "", empty_df, empty_df, empty_df, empty_df,
             None, "Error loading data", None, "Error loading data", None, "Error loading data",
             f"Error loading run: {selected_run_id}", ""
         )
     if 'model_count' in run_metadata:
         info_msg += f" - {run_metadata['model_count']} models"
+    # Get correlation text and benchmark comparison intro
     correlations_text = format_correlations_text(data.get("correlations", {}))
+    benchmark_intro_text = format_benchmark_comparison_intro(data, run_metadata)
     return (
+        overall_rank_display, benchmark_display, benchmark_intro_text, cost_display, avg_latency_display, p99_latency_display, domain_display,
         cost_plot, cost_msg, avg_latency_plot, avg_latency_msg, p99_latency_plot, p99_latency_msg,
         info_msg, correlations_text
     )
     # --- Tab 2: Benchmark Comparison ---
     with gr.Tab("Benchmark Comparison"):
         gr.Markdown("## Benchmark Comparison")
+        benchmark_comparison_intro = gr.Markdown(
+            value=format_benchmark_comparison_intro(current_data, latest_run)
+        )
         benchmark_comparison_table = gr.DataFrame(
             current_data.get("benchmark_display", pd.DataFrame()),
         inputs=[run_selector],
         outputs=[
             overall_ranking_table,
+            benchmark_comparison_table,
+            benchmark_comparison_intro,
             cost_breakdown_table,
             avg_latency_breakdown_table,
             p99_latency_breakdown_table,