Upload app.py
Browse files
Formatting fixes for new benchmarks and some descriptions
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import pandas as pd
|
|
| 3 |
import plotly.express as px
|
| 4 |
import os
|
| 5 |
import json
|
| 6 |
-
from typing import Dict, List
|
| 7 |
|
| 8 |
# --- Configuration ---
|
| 9 |
RUNS_DIR = "runs"
|
|
@@ -11,6 +11,14 @@ DATA_DIR = "." # For backward compatibility
|
|
| 11 |
COST_COLUMN_SUMMARY = 'Costs (USD)'
|
| 12 |
NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)'
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# --- Multi-Run Support Functions ---
|
| 15 |
def discover_available_runs() -> List[Dict]:
|
| 16 |
"""Scan runs directory and return sorted list of available runs with metadata."""
|
|
@@ -91,6 +99,47 @@ def format_correlations_text(correlations_data: Dict) -> str:
|
|
| 91 |
return f"**Benchmark Correlations:** AutoBench features " + ", ".join(correlation_parts) + "."
|
| 92 |
return ""
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
def load_run_data(run_id: str) -> Dict[str, pd.DataFrame]:
|
| 95 |
"""Load all CSV data for a specific run."""
|
| 96 |
runs = discover_available_runs()
|
|
@@ -163,27 +212,17 @@ def process_run_data(data: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
|
|
| 163 |
summary_cols_display = [col for col in base_cols if col in df_summary.columns]
|
| 164 |
df_summary_display = df_summary[summary_cols_display].copy()
|
| 165 |
|
| 166 |
-
# Benchmark display
|
| 167 |
-
benchmark_cols = [
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
for col in df_summary.columns:
|
| 174 |
-
if col in
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
if chatbot_col:
|
| 180 |
-
benchmark_cols.append(chatbot_col)
|
| 181 |
-
if 'AAI Index' in df_summary.columns:
|
| 182 |
-
benchmark_cols.append('AAI Index')
|
| 183 |
-
if mmlu_col:
|
| 184 |
-
benchmark_cols.append(mmlu_col)
|
| 185 |
-
|
| 186 |
-
benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns]
|
| 187 |
df_benchmark_display = df_summary[benchmark_cols].copy()
|
| 188 |
|
| 189 |
# Sort by AutoBench score
|
|
@@ -354,7 +393,7 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
|
|
| 354 |
# Return empty/default values for all outputs
|
| 355 |
empty_df = pd.DataFrame()
|
| 356 |
return (
|
| 357 |
-
empty_df, empty_df, empty_df, empty_df, empty_df, empty_df, # DataFrames
|
| 358 |
None, "", None, "", None, "", # Plots and messages
|
| 359 |
"No run selected", "" # Info message, correlations text
|
| 360 |
)
|
|
@@ -366,7 +405,7 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
|
|
| 366 |
if not data:
|
| 367 |
empty_df = pd.DataFrame()
|
| 368 |
return (
|
| 369 |
-
empty_df, empty_df, empty_df, empty_df, empty_df, empty_df,
|
| 370 |
None, "Error loading data", None, "Error loading data", None, "Error loading data",
|
| 371 |
f"Error loading run: {selected_run_id}", ""
|
| 372 |
)
|
|
@@ -413,11 +452,12 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
|
|
| 413 |
if 'model_count' in run_metadata:
|
| 414 |
info_msg += f" - {run_metadata['model_count']} models"
|
| 415 |
|
| 416 |
-
# Get correlation text
|
| 417 |
correlations_text = format_correlations_text(data.get("correlations", {}))
|
|
|
|
| 418 |
|
| 419 |
return (
|
| 420 |
-
overall_rank_display, benchmark_display, cost_display, avg_latency_display, p99_latency_display, domain_display,
|
| 421 |
cost_plot, cost_msg, avg_latency_plot, avg_latency_msg, p99_latency_plot, p99_latency_msg,
|
| 422 |
info_msg, correlations_text
|
| 423 |
)
|
|
@@ -468,7 +508,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
|
|
| 468 |
# --- Tab 2: Benchmark Comparison ---
|
| 469 |
with gr.Tab("Benchmark Comparison"):
|
| 470 |
gr.Markdown("## Benchmark Comparison")
|
| 471 |
-
gr.Markdown(
|
|
|
|
|
|
|
| 472 |
|
| 473 |
benchmark_comparison_table = gr.DataFrame(
|
| 474 |
current_data.get("benchmark_display", pd.DataFrame()),
|
|
@@ -600,7 +642,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
|
|
| 600 |
inputs=[run_selector],
|
| 601 |
outputs=[
|
| 602 |
overall_ranking_table,
|
| 603 |
-
benchmark_comparison_table,
|
|
|
|
| 604 |
cost_breakdown_table,
|
| 605 |
avg_latency_breakdown_table,
|
| 606 |
p99_latency_breakdown_table,
|
|
|
|
| 3 |
import plotly.express as px
|
| 4 |
import os
|
| 5 |
import json
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
|
| 8 |
# --- Configuration ---
|
| 9 |
RUNS_DIR = "runs"
|
|
|
|
| 11 |
COST_COLUMN_SUMMARY = 'Costs (USD)'
|
| 12 |
NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)'
|
| 13 |
|
| 14 |
+
# Columns in summary_data.csv that are operational metrics, not third-party reference benchmarks
|
| 15 |
+
SUMMARY_NON_REFERENCE_COLUMNS = frozenset({
|
| 16 |
+
'Model', 'Model Name', 'Iterations', 'AutoBench',
|
| 17 |
+
COST_COLUMN_SUMMARY,
|
| 18 |
+
NEW_COST_COLUMN_SUMMARY,
|
| 19 |
+
'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)', 'Fail Rate %',
|
| 20 |
+
})
|
| 21 |
+
|
| 22 |
# --- Multi-Run Support Functions ---
|
| 23 |
def discover_available_runs() -> List[Dict]:
|
| 24 |
"""Scan runs directory and return sorted list of available runs with metadata."""
|
|
|
|
| 99 |
return f"**Benchmark Correlations:** AutoBench features " + ", ".join(correlation_parts) + "."
|
| 100 |
return ""
|
| 101 |
|
| 102 |
+
|
| 103 |
+
def _oxford_comma(names: List[str]) -> str:
|
| 104 |
+
if not names:
|
| 105 |
+
return ""
|
| 106 |
+
if len(names) == 1:
|
| 107 |
+
return names[0]
|
| 108 |
+
if len(names) == 2:
|
| 109 |
+
return f"{names[0]} and {names[1]}"
|
| 110 |
+
return ", ".join(names[:-1]) + f", and {names[-1]}"
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def format_benchmark_comparison_intro(data: Dict, run_metadata: Optional[Dict] = None) -> str:
|
| 114 |
+
"""Build Benchmark Comparison tab intro from table columns, correlations, and optional metadata."""
|
| 115 |
+
meta = run_metadata or {}
|
| 116 |
+
override = meta.get("benchmark_comparison_intro_override")
|
| 117 |
+
if isinstance(override, str) and override.strip():
|
| 118 |
+
return override.strip()
|
| 119 |
+
|
| 120 |
+
bd = data.get("benchmark_display", pd.DataFrame())
|
| 121 |
+
ref_names = [c for c in bd.columns if c not in ("Model", "AutoBench")]
|
| 122 |
+
if ref_names:
|
| 123 |
+
listed = _oxford_comma(ref_names)
|
| 124 |
+
body = (
|
| 125 |
+
f"Comparison of AutoBench scores with reference benchmarks: {listed}. "
|
| 126 |
+
"Models sorted by AutoBench score (higher is better)."
|
| 127 |
+
)
|
| 128 |
+
else:
|
| 129 |
+
body = (
|
| 130 |
+
"Comparison of AutoBench scores with available reference benchmarks. "
|
| 131 |
+
"Models sorted by AutoBench score (higher is better)."
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
corr = format_correlations_text(data.get("correlations", {}))
|
| 135 |
+
auto_block = f"{body}\n\n{corr}" if corr else body
|
| 136 |
+
|
| 137 |
+
note = meta.get("benchmark_comparison_note")
|
| 138 |
+
if isinstance(note, str) and note.strip():
|
| 139 |
+
return f"{note.strip()}\n\n{auto_block}"
|
| 140 |
+
return auto_block
|
| 141 |
+
|
| 142 |
+
|
| 143 |
def load_run_data(run_id: str) -> Dict[str, pd.DataFrame]:
|
| 144 |
"""Load all CSV data for a specific run."""
|
| 145 |
runs = discover_available_runs()
|
|
|
|
| 212 |
summary_cols_display = [col for col in base_cols if col in df_summary.columns]
|
| 213 |
df_summary_display = df_summary[summary_cols_display].copy()
|
| 214 |
|
| 215 |
+
# Benchmark display: Model, AutoBench, then every other column not in the reserved set (CSV order)
|
| 216 |
+
benchmark_cols: List[str] = []
|
| 217 |
+
if 'Model' in df_summary.columns:
|
| 218 |
+
benchmark_cols.append('Model')
|
| 219 |
+
if 'AutoBench' in df_summary.columns:
|
| 220 |
+
benchmark_cols.append('AutoBench')
|
|
|
|
| 221 |
for col in df_summary.columns:
|
| 222 |
+
if col in SUMMARY_NON_REFERENCE_COLUMNS or col in ('Model', 'AutoBench'):
|
| 223 |
+
continue
|
| 224 |
+
benchmark_cols.append(col)
|
| 225 |
+
benchmark_cols = [c for c in benchmark_cols if c in df_summary.columns]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
df_benchmark_display = df_summary[benchmark_cols].copy()
|
| 227 |
|
| 228 |
# Sort by AutoBench score
|
|
|
|
| 393 |
# Return empty/default values for all outputs
|
| 394 |
empty_df = pd.DataFrame()
|
| 395 |
return (
|
| 396 |
+
empty_df, empty_df, "", empty_df, empty_df, empty_df, empty_df, # DataFrames + benchmark intro
|
| 397 |
None, "", None, "", None, "", # Plots and messages
|
| 398 |
"No run selected", "" # Info message, correlations text
|
| 399 |
)
|
|
|
|
| 405 |
if not data:
|
| 406 |
empty_df = pd.DataFrame()
|
| 407 |
return (
|
| 408 |
+
empty_df, empty_df, "", empty_df, empty_df, empty_df, empty_df,
|
| 409 |
None, "Error loading data", None, "Error loading data", None, "Error loading data",
|
| 410 |
f"Error loading run: {selected_run_id}", ""
|
| 411 |
)
|
|
|
|
| 452 |
if 'model_count' in run_metadata:
|
| 453 |
info_msg += f" - {run_metadata['model_count']} models"
|
| 454 |
|
| 455 |
+
# Get correlation text and benchmark comparison intro
|
| 456 |
correlations_text = format_correlations_text(data.get("correlations", {}))
|
| 457 |
+
benchmark_intro_text = format_benchmark_comparison_intro(data, run_metadata)
|
| 458 |
|
| 459 |
return (
|
| 460 |
+
overall_rank_display, benchmark_display, benchmark_intro_text, cost_display, avg_latency_display, p99_latency_display, domain_display,
|
| 461 |
cost_plot, cost_msg, avg_latency_plot, avg_latency_msg, p99_latency_plot, p99_latency_msg,
|
| 462 |
info_msg, correlations_text
|
| 463 |
)
|
|
|
|
| 508 |
# --- Tab 2: Benchmark Comparison ---
|
| 509 |
with gr.Tab("Benchmark Comparison"):
|
| 510 |
gr.Markdown("## Benchmark Comparison")
|
| 511 |
+
benchmark_comparison_intro = gr.Markdown(
|
| 512 |
+
value=format_benchmark_comparison_intro(current_data, latest_run)
|
| 513 |
+
)
|
| 514 |
|
| 515 |
benchmark_comparison_table = gr.DataFrame(
|
| 516 |
current_data.get("benchmark_display", pd.DataFrame()),
|
|
|
|
| 642 |
inputs=[run_selector],
|
| 643 |
outputs=[
|
| 644 |
overall_ranking_table,
|
| 645 |
+
benchmark_comparison_table,
|
| 646 |
+
benchmark_comparison_intro,
|
| 647 |
cost_breakdown_table,
|
| 648 |
avg_latency_breakdown_table,
|
| 649 |
p99_latency_breakdown_table,
|