PeterKruger committed on
Commit
c00108e
·
verified ·
1 Parent(s): 3d84e78

Upload app.py

Browse files

Formatting fixes for new benchmarks and some descriptions

Files changed (1) hide show
  1. app.py +70 -27
app.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
3
  import plotly.express as px
4
  import os
5
  import json
6
- from typing import Dict, List
7
 
8
  # --- Configuration ---
9
  RUNS_DIR = "runs"
@@ -11,6 +11,14 @@ DATA_DIR = "." # For backward compatibility
11
  COST_COLUMN_SUMMARY = 'Costs (USD)'
12
  NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)'
13
 
 
 
 
 
 
 
 
 
14
  # --- Multi-Run Support Functions ---
15
  def discover_available_runs() -> List[Dict]:
16
  """Scan runs directory and return sorted list of available runs with metadata."""
@@ -91,6 +99,47 @@ def format_correlations_text(correlations_data: Dict) -> str:
91
  return f"**Benchmark Correlations:** AutoBench features " + ", ".join(correlation_parts) + "."
92
  return ""
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def load_run_data(run_id: str) -> Dict[str, pd.DataFrame]:
95
  """Load all CSV data for a specific run."""
96
  runs = discover_available_runs()
@@ -163,27 +212,17 @@ def process_run_data(data: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
163
  summary_cols_display = [col for col in base_cols if col in df_summary.columns]
164
  df_summary_display = df_summary[summary_cols_display].copy()
165
 
166
- # Benchmark display table - handle both old and new column names
167
- benchmark_cols = ['Model', 'AutoBench']
168
-
169
- # Handle different column name variations
170
- chatbot_col = None
171
- mmlu_col = None
172
-
173
  for col in df_summary.columns:
174
- if col in ['Chatbot Ar.', 'LMArena']:
175
- chatbot_col = col
176
- elif col in ['MMLU Index', 'MMLU-Pro']:
177
- mmlu_col = col
178
-
179
- if chatbot_col:
180
- benchmark_cols.append(chatbot_col)
181
- if 'AAI Index' in df_summary.columns:
182
- benchmark_cols.append('AAI Index')
183
- if mmlu_col:
184
- benchmark_cols.append(mmlu_col)
185
-
186
- benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns]
187
  df_benchmark_display = df_summary[benchmark_cols].copy()
188
 
189
  # Sort by AutoBench score
@@ -354,7 +393,7 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
354
  # Return empty/default values for all outputs
355
  empty_df = pd.DataFrame()
356
  return (
357
- empty_df, empty_df, empty_df, empty_df, empty_df, empty_df, # DataFrames
358
  None, "", None, "", None, "", # Plots and messages
359
  "No run selected", "" # Info message, correlations text
360
  )
@@ -366,7 +405,7 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
366
  if not data:
367
  empty_df = pd.DataFrame()
368
  return (
369
- empty_df, empty_df, empty_df, empty_df, empty_df, empty_df,
370
  None, "Error loading data", None, "Error loading data", None, "Error loading data",
371
  f"Error loading run: {selected_run_id}", ""
372
  )
@@ -413,11 +452,12 @@ def update_leaderboard_data(selected_run_id: str) -> tuple:
413
  if 'model_count' in run_metadata:
414
  info_msg += f" - {run_metadata['model_count']} models"
415
 
416
- # Get correlation text
417
  correlations_text = format_correlations_text(data.get("correlations", {}))
 
418
 
419
  return (
420
- overall_rank_display, benchmark_display, cost_display, avg_latency_display, p99_latency_display, domain_display,
421
  cost_plot, cost_msg, avg_latency_plot, avg_latency_msg, p99_latency_plot, p99_latency_msg,
422
  info_msg, correlations_text
423
  )
@@ -468,7 +508,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
468
  # --- Tab 2: Benchmark Comparison ---
469
  with gr.Tab("Benchmark Comparison"):
470
  gr.Markdown("## Benchmark Comparison")
471
- gr.Markdown("Comparison of AutoBench scores with other popular benchmarks. AutoBench features 82.51% correlation with Chatbot Arena, 83.74% with Artificial Analysis Intelligence Index, and 71.51% with MMLU. Models sorted by AutoBench score.")
 
 
472
 
473
  benchmark_comparison_table = gr.DataFrame(
474
  current_data.get("benchmark_display", pd.DataFrame()),
@@ -600,7 +642,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
600
  inputs=[run_selector],
601
  outputs=[
602
  overall_ranking_table,
603
- benchmark_comparison_table,
 
604
  cost_breakdown_table,
605
  avg_latency_breakdown_table,
606
  p99_latency_breakdown_table,
 
3
  import plotly.express as px
4
  import os
5
  import json
6
+ from typing import Dict, List, Optional
7
 
8
  # --- Configuration ---
9
  RUNS_DIR = "runs"
 
11
  COST_COLUMN_SUMMARY = 'Costs (USD)'
12
  NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)'
13
 
14
+ # Columns in summary_data.csv that are operational metrics, not third-party reference benchmarks
15
+ SUMMARY_NON_REFERENCE_COLUMNS = frozenset({
16
+ 'Model', 'Model Name', 'Iterations', 'AutoBench',
17
+ COST_COLUMN_SUMMARY,
18
+ NEW_COST_COLUMN_SUMMARY,
19
+ 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)', 'Fail Rate %',
20
+ })
21
+
22
  # --- Multi-Run Support Functions ---
23
  def discover_available_runs() -> List[Dict]:
24
  """Scan runs directory and return sorted list of available runs with metadata."""
 
99
  return f"**Benchmark Correlations:** AutoBench features " + ", ".join(correlation_parts) + "."
100
  return ""
101
 
102
+
103
+ def _oxford_comma(names: List[str]) -> str:
104
+ if not names:
105
+ return ""
106
+ if len(names) == 1:
107
+ return names[0]
108
+ if len(names) == 2:
109
+ return f"{names[0]} and {names[1]}"
110
+ return ", ".join(names[:-1]) + f", and {names[-1]}"
111
+
112
+
113
def format_benchmark_comparison_intro(data: Dict, run_metadata: Optional[Dict] = None) -> str:
    """Build the Markdown intro text for the Benchmark Comparison tab.

    Args:
        data: Run data mapping. Reads ``"benchmark_display"`` (an object with a
            ``columns`` attribute, e.g. a DataFrame) and ``"correlations"``.
            A missing or ``None`` mapping/entry is treated as empty.
        run_metadata: Optional run metadata. Two string keys are honoured:
            ``"benchmark_comparison_intro_override"`` replaces the whole intro,
            and ``"benchmark_comparison_note"`` is prepended to the generated
            text. Blank or non-string values are ignored.

    Returns:
        The intro text: the override if given, otherwise the generated
        description (optionally followed by the correlations sentence and
        optionally preceded by the note), with parts separated by blank lines.
    """
    meta = run_metadata or {}
    data = data or {}

    override = meta.get("benchmark_comparison_intro_override")
    if isinstance(override, str) and override.strip():
        return override.strip()

    # The key may be present but hold None; getattr keeps that case from
    # raising AttributeError and simply yields no reference columns.
    columns = getattr(data.get("benchmark_display"), "columns", ())
    ref_names = [str(col) for col in columns if col not in ("Model", "AutoBench")]

    sort_note = "Models sorted by AutoBench score (higher is better)."
    if ref_names:
        listed = _oxford_comma(ref_names)
        body = f"Comparison of AutoBench scores with reference benchmarks: {listed}. {sort_note}"
    else:
        body = f"Comparison of AutoBench scores with available reference benchmarks. {sort_note}"

    # `or {}` also covers a "correlations" entry that is explicitly None.
    corr = format_correlations_text(data.get("correlations") or {})
    auto_block = f"{body}\n\n{corr}" if corr else body

    note = meta.get("benchmark_comparison_note")
    if isinstance(note, str) and note.strip():
        return f"{note.strip()}\n\n{auto_block}"
    return auto_block
141
+
142
+
143
  def load_run_data(run_id: str) -> Dict[str, pd.DataFrame]:
144
  """Load all CSV data for a specific run."""
145
  runs = discover_available_runs()
 
212
  summary_cols_display = [col for col in base_cols if col in df_summary.columns]
213
  df_summary_display = df_summary[summary_cols_display].copy()
214
 
215
+ # Benchmark display: Model, AutoBench, then every other column not in the reserved set (CSV order)
216
+ benchmark_cols: List[str] = []
217
+ if 'Model' in df_summary.columns:
218
+ benchmark_cols.append('Model')
219
+ if 'AutoBench' in df_summary.columns:
220
+ benchmark_cols.append('AutoBench')
 
221
  for col in df_summary.columns:
222
+ if col in SUMMARY_NON_REFERENCE_COLUMNS or col in ('Model', 'AutoBench'):
223
+ continue
224
+ benchmark_cols.append(col)
225
+ benchmark_cols = [c for c in benchmark_cols if c in df_summary.columns]
 
 
 
 
 
 
 
 
 
226
  df_benchmark_display = df_summary[benchmark_cols].copy()
227
 
228
  # Sort by AutoBench score
 
393
  # Return empty/default values for all outputs
394
  empty_df = pd.DataFrame()
395
  return (
396
+ empty_df, empty_df, "", empty_df, empty_df, empty_df, empty_df, # DataFrames + benchmark intro
397
  None, "", None, "", None, "", # Plots and messages
398
  "No run selected", "" # Info message, correlations text
399
  )
 
405
  if not data:
406
  empty_df = pd.DataFrame()
407
  return (
408
+ empty_df, empty_df, "", empty_df, empty_df, empty_df, empty_df,
409
  None, "Error loading data", None, "Error loading data", None, "Error loading data",
410
  f"Error loading run: {selected_run_id}", ""
411
  )
 
452
  if 'model_count' in run_metadata:
453
  info_msg += f" - {run_metadata['model_count']} models"
454
 
455
+ # Get correlation text and benchmark comparison intro
456
  correlations_text = format_correlations_text(data.get("correlations", {}))
457
+ benchmark_intro_text = format_benchmark_comparison_intro(data, run_metadata)
458
 
459
  return (
460
+ overall_rank_display, benchmark_display, benchmark_intro_text, cost_display, avg_latency_display, p99_latency_display, domain_display,
461
  cost_plot, cost_msg, avg_latency_plot, avg_latency_msg, p99_latency_plot, p99_latency_msg,
462
  info_msg, correlations_text
463
  )
 
508
  # --- Tab 2: Benchmark Comparison ---
509
  with gr.Tab("Benchmark Comparison"):
510
  gr.Markdown("## Benchmark Comparison")
511
+ benchmark_comparison_intro = gr.Markdown(
512
+ value=format_benchmark_comparison_intro(current_data, latest_run)
513
+ )
514
 
515
  benchmark_comparison_table = gr.DataFrame(
516
  current_data.get("benchmark_display", pd.DataFrame()),
 
642
  inputs=[run_selector],
643
  outputs=[
644
  overall_ranking_table,
645
+ benchmark_comparison_table,
646
+ benchmark_comparison_intro,
647
  cost_breakdown_table,
648
  avg_latency_breakdown_table,
649
  p99_latency_breakdown_table,