Spaces:
Running
Running
openhands openhands committed on
Commit ·
2854ddd
1
Parent(s): 16307c3
Add runtime column and Cost/Performance + Runtime/Performance charts to all pages
Browse files
- Add runtime tracking to data loader (simple_data_loader.py)
- Add format_runtime_column function (leaderboard_transformer.py)
- Update _plot_scatter_plotly with plot_type parameter ('cost' or 'runtime')
- Update chart titles to 'OpenHands Index XXX Cost/Performance' and 'OpenHands Index XXX Runtime/Performance'
- Add Runtime column to main leaderboard and benchmark details tables
- Display both cost/performance and runtime/performance scatter plots on all pages
Co-authored-by: openhands <openhands@all-hands.dev>
- leaderboard_transformer.py +66 -10
- simple_data_loader.py +15 -3
- ui_components.py +97 -41
leaderboard_transformer.py
CHANGED
|
@@ -164,6 +164,7 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 164 |
'average cost': 'Average Cost',
|
| 165 |
'total cost': 'Average Cost', # Legacy support
|
| 166 |
'Overall cost': 'Average Cost', # Legacy support
|
|
|
|
| 167 |
'categories_completed': 'Categories Completed',
|
| 168 |
'Logs': 'Logs',
|
| 169 |
'Openness': 'Openness',
|
|
@@ -395,7 +396,8 @@ def _plot_scatter_plotly(
|
|
| 395 |
x: Optional[str],
|
| 396 |
y: str,
|
| 397 |
agent_col: str = 'Agent',
|
| 398 |
-
name: Optional[str] = None
|
|
|
|
| 399 |
) -> go.Figure:
|
| 400 |
|
| 401 |
# --- Section 1: Define Mappings ---
|
|
@@ -427,7 +429,11 @@ def _plot_scatter_plotly(
|
|
| 427 |
data_plot = data.copy()
|
| 428 |
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
| 429 |
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
max_reported_cost = 0
|
| 432 |
divider_line_x = 0
|
| 433 |
|
|
@@ -500,12 +506,12 @@ def _plot_scatter_plotly(
|
|
| 500 |
))
|
| 501 |
|
| 502 |
# --- Section 5: Prepare for Marker Plotting ---
|
| 503 |
-
def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
|
| 504 |
"""
|
| 505 |
Builds the complete HTML string for the plot's hover tooltip.
|
| 506 |
Format: {lm_name} (SDK {version})
|
| 507 |
Average Score: {score}
|
| 508 |
-
Average Cost: {
|
| 509 |
Openness: {openness}
|
| 510 |
"""
|
| 511 |
h_pad = " "
|
|
@@ -528,11 +534,17 @@ def _plot_scatter_plotly(
|
|
| 528 |
# Average Score
|
| 529 |
parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
|
| 530 |
|
| 531 |
-
# Average Cost
|
| 532 |
-
if
|
| 533 |
-
|
|
|
|
|
|
|
|
|
|
| 534 |
else:
|
| 535 |
-
|
|
|
|
|
|
|
|
|
|
| 536 |
|
| 537 |
# Openness
|
| 538 |
parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
|
|
@@ -548,7 +560,8 @@ def _plot_scatter_plotly(
|
|
| 548 |
x_axis_label=x_axis_label,
|
| 549 |
x_col=x_col_to_use,
|
| 550 |
y_col=y_col_to_use,
|
| 551 |
-
divider_line_x=divider_line_x
|
|
|
|
| 552 |
),
|
| 553 |
axis=1
|
| 554 |
)
|
|
@@ -695,10 +708,16 @@ def _plot_scatter_plotly(
|
|
| 695 |
range=[x_min_log, x_max_log] # Match domain coordinate calculation
|
| 696 |
)
|
| 697 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
# Build layout configuration
|
| 699 |
layout_config = dict(
|
| 700 |
template="plotly_white",
|
| 701 |
-
title=
|
| 702 |
xaxis=xaxis_config,
|
| 703 |
yaxis=dict(title="Average (mean) score", range=[y_min, y_max]), # Match domain calculation
|
| 704 |
legend=dict(
|
|
@@ -789,6 +808,43 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
|
|
| 789 |
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
| 790 |
|
| 791 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
def get_pareto_df(data, cost_col=None, score_col=None):
|
| 793 |
"""
|
| 794 |
Calculate the Pareto frontier for the given data.
|
|
|
|
| 164 |
'average cost': 'Average Cost',
|
| 165 |
'total cost': 'Average Cost', # Legacy support
|
| 166 |
'Overall cost': 'Average Cost', # Legacy support
|
| 167 |
+
'average runtime': 'Average Runtime',
|
| 168 |
'categories_completed': 'Categories Completed',
|
| 169 |
'Logs': 'Logs',
|
| 170 |
'Openness': 'Openness',
|
|
|
|
| 396 |
x: Optional[str],
|
| 397 |
y: str,
|
| 398 |
agent_col: str = 'Agent',
|
| 399 |
+
name: Optional[str] = None,
|
| 400 |
+
plot_type: str = 'cost' # 'cost' or 'runtime'
|
| 401 |
) -> go.Figure:
|
| 402 |
|
| 403 |
# --- Section 1: Define Mappings ---
|
|
|
|
| 429 |
data_plot = data.copy()
|
| 430 |
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
| 431 |
|
| 432 |
+
# Set axis labels based on plot type
|
| 433 |
+
if plot_type == 'runtime':
|
| 434 |
+
x_axis_label = f"Average (mean) runtime per problem (seconds)" if x else "Runtime (Data N/A)"
|
| 435 |
+
else:
|
| 436 |
+
x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
|
| 437 |
max_reported_cost = 0
|
| 438 |
divider_line_x = 0
|
| 439 |
|
|
|
|
| 506 |
))
|
| 507 |
|
| 508 |
# --- Section 5: Prepare for Marker Plotting ---
|
| 509 |
+
def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x, is_runtime=False):
|
| 510 |
"""
|
| 511 |
Builds the complete HTML string for the plot's hover tooltip.
|
| 512 |
Format: {lm_name} (SDK {version})
|
| 513 |
Average Score: {score}
|
| 514 |
+
Average Cost/Runtime: {value}
|
| 515 |
Openness: {openness}
|
| 516 |
"""
|
| 517 |
h_pad = " "
|
|
|
|
| 534 |
# Average Score
|
| 535 |
parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
|
| 536 |
|
| 537 |
+
# Average Cost or Runtime
|
| 538 |
+
if is_runtime:
|
| 539 |
+
if divider_line_x > 0 and row[x_col] >= divider_line_x:
|
| 540 |
+
parts.append(f"{h_pad}Average Runtime: <b>Missing</b>{h_pad}<br>")
|
| 541 |
+
else:
|
| 542 |
+
parts.append(f"{h_pad}Average Runtime: <b>{row[x_col]:.0f}s</b>{h_pad}<br>")
|
| 543 |
else:
|
| 544 |
+
if divider_line_x > 0 and row[x_col] >= divider_line_x:
|
| 545 |
+
parts.append(f"{h_pad}Average Cost: <b>Missing</b>{h_pad}<br>")
|
| 546 |
+
else:
|
| 547 |
+
parts.append(f"{h_pad}Average Cost: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
|
| 548 |
|
| 549 |
# Openness
|
| 550 |
parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
|
|
|
|
| 560 |
x_axis_label=x_axis_label,
|
| 561 |
x_col=x_col_to_use,
|
| 562 |
y_col=y_col_to_use,
|
| 563 |
+
divider_line_x=divider_line_x,
|
| 564 |
+
is_runtime=(plot_type == 'runtime')
|
| 565 |
),
|
| 566 |
axis=1
|
| 567 |
)
|
|
|
|
| 708 |
range=[x_min_log, x_max_log] # Match domain coordinate calculation
|
| 709 |
)
|
| 710 |
|
| 711 |
+
# Set title based on plot type
|
| 712 |
+
if plot_type == 'runtime':
|
| 713 |
+
plot_title = f"OpenHands Index {name} Runtime/Performance"
|
| 714 |
+
else:
|
| 715 |
+
plot_title = f"OpenHands Index {name} Cost/Performance"
|
| 716 |
+
|
| 717 |
# Build layout configuration
|
| 718 |
layout_config = dict(
|
| 719 |
template="plotly_white",
|
| 720 |
+
title=plot_title,
|
| 721 |
xaxis=xaxis_config,
|
| 722 |
yaxis=dict(title="Average (mean) score", range=[y_min, y_max]), # Match domain calculation
|
| 723 |
legend=dict(
|
|
|
|
| 808 |
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
| 809 |
|
| 810 |
|
| 811 |
+
def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
    """
    Format a runtime column for display, using its matching score column for status.

    For each row:
    - If the runtime is a non-null number, it is rendered as whole seconds
      with an 's' suffix (e.g. ``"42s"``).
    - If the runtime is missing but the score is present, the cell becomes a
      coloured "Missing" marker.
    - If both runtime and score are missing, the cell becomes a coloured
      "Not Submitted" marker.

    Args:
        df: The DataFrame holding the runtime and score columns. It is not
            modified; a new DataFrame is returned.
        runtime_col_name: The name of the runtime column to format
            (e.g., "Average Runtime"). The score column name is derived by
            replacing "Runtime" with "Score".

    Returns:
        A new DataFrame with the formatted runtime column, or ``df`` itself
        (unchanged) when no matching score column exists.
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    # Find the corresponding score column by replacing "Runtime" with "Score".
    score_col_name = runtime_col_name.replace("Runtime", "Score")

    # Without a matching score column the status cannot be determined.
    if score_col_name not in df.columns:
        return df

    status_color = "#ec4899"

    def _format_cell(runtime_value, score_value) -> str:
        # numbers.Real accepts Python ints/floats and NumPy integer/floating
        # scalars alike; a plain (int, float) check rejects np.int64 and
        # would mislabel valid integer runtimes as "Missing".
        if isinstance(runtime_value, numbers.Real) and pd.notna(runtime_value):
            return f"{float(runtime_value):.0f}s"
        if pd.notna(score_value):
            # Score exists, but runtime is missing.
            return f'<span style="color: {status_color};">Missing</span>'
        # Neither score nor runtime exists.
        return f'<span style="color: {status_color};">Not Submitted</span>'

    # Column-wise iteration also works on an empty frame, where a row-wise
    # DataFrame.apply(axis=1) returns a DataFrame rather than a Series.
    formatted = [
        _format_cell(runtime_value, score_value)
        for runtime_value, score_value in zip(df[runtime_col_name], df[score_col_name])
    ]

    # assign() returns a new frame, so the caller's DataFrame (possibly a
    # slice of another frame) is never written to in place.
    return df.assign(**{runtime_col_name: formatted})
|
| 846 |
+
|
| 847 |
+
|
| 848 |
def get_pareto_df(data, cost_col=None, score_col=None):
|
| 849 |
"""
|
| 850 |
Calculate the Pareto frontier for the given data.
|
simple_data_loader.py
CHANGED
|
@@ -269,7 +269,7 @@ class SimpleLeaderboardViewer:
|
|
| 269 |
dataset_costs = []
|
| 270 |
|
| 271 |
# Track category-level data for aggregation
|
| 272 |
-
category_data = {} # {category: {'scores': [...], 'costs': [
|
| 273 |
|
| 274 |
for _, row in agent_records.iterrows():
|
| 275 |
tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
|
|
@@ -277,6 +277,7 @@ class SimpleLeaderboardViewer:
|
|
| 277 |
# Add columns for this specific dataset/benchmark
|
| 278 |
record[f'{tag} score'] = row['score']
|
| 279 |
record[f'{tag} cost'] = row['cost_per_instance']
|
|
|
|
| 280 |
dataset_scores.append(row['score'])
|
| 281 |
dataset_costs.append(row['cost_per_instance'])
|
| 282 |
|
|
@@ -289,12 +290,14 @@ class SimpleLeaderboardViewer:
|
|
| 289 |
if tag in self.benchmark_to_categories:
|
| 290 |
for category in self.benchmark_to_categories[tag]:
|
| 291 |
if category not in category_data:
|
| 292 |
-
category_data[category] = {'scores': [], 'costs': []}
|
| 293 |
category_data[category]['scores'].append(row['score'])
|
| 294 |
category_data[category]['costs'].append(row['cost_per_instance'])
|
|
|
|
| 295 |
|
| 296 |
-
# Calculate category-level aggregates and track average cost
|
| 297 |
all_costs = []
|
|
|
|
| 298 |
categories_with_scores = 0
|
| 299 |
for category in ALL_CATEGORIES:
|
| 300 |
if category in category_data and category_data[category]['scores']:
|
|
@@ -308,6 +311,12 @@ class SimpleLeaderboardViewer:
|
|
| 308 |
avg_cost = sum(valid_costs) / len(valid_costs)
|
| 309 |
record[f'{category} cost'] = avg_cost
|
| 310 |
all_costs.extend(valid_costs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
else:
|
| 312 |
# Category not submitted - will show as NA
|
| 313 |
pass
|
|
@@ -323,6 +332,9 @@ class SimpleLeaderboardViewer:
|
|
| 323 |
# Average cost per instance across all benchmarks
|
| 324 |
record['average cost'] = sum(all_costs) / len(all_costs) if all_costs else None
|
| 325 |
|
|
|
|
|
|
|
|
|
|
| 326 |
# Track how many categories were completed
|
| 327 |
record['categories_completed'] = categories_with_scores
|
| 328 |
|
|
|
|
| 269 |
dataset_costs = []
|
| 270 |
|
| 271 |
# Track category-level data for aggregation
|
| 272 |
+
category_data = {} # {category: {'scores': [...], 'costs': [], 'runtimes': []}}
|
| 273 |
|
| 274 |
for _, row in agent_records.iterrows():
|
| 275 |
tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
|
|
|
|
| 277 |
# Add columns for this specific dataset/benchmark
|
| 278 |
record[f'{tag} score'] = row['score']
|
| 279 |
record[f'{tag} cost'] = row['cost_per_instance']
|
| 280 |
+
record[f'{tag} runtime'] = row.get('average_runtime')
|
| 281 |
dataset_scores.append(row['score'])
|
| 282 |
dataset_costs.append(row['cost_per_instance'])
|
| 283 |
|
|
|
|
| 290 |
if tag in self.benchmark_to_categories:
|
| 291 |
for category in self.benchmark_to_categories[tag]:
|
| 292 |
if category not in category_data:
|
| 293 |
+
category_data[category] = {'scores': [], 'costs': [], 'runtimes': []}
|
| 294 |
category_data[category]['scores'].append(row['score'])
|
| 295 |
category_data[category]['costs'].append(row['cost_per_instance'])
|
| 296 |
+
category_data[category]['runtimes'].append(row.get('average_runtime'))
|
| 297 |
|
| 298 |
+
# Calculate category-level aggregates and track average cost/runtime
|
| 299 |
all_costs = []
|
| 300 |
+
all_runtimes = []
|
| 301 |
categories_with_scores = 0
|
| 302 |
for category in ALL_CATEGORIES:
|
| 303 |
if category in category_data and category_data[category]['scores']:
|
|
|
|
| 311 |
avg_cost = sum(valid_costs) / len(valid_costs)
|
| 312 |
record[f'{category} cost'] = avg_cost
|
| 313 |
all_costs.extend(valid_costs)
|
| 314 |
+
if data['runtimes']:
|
| 315 |
+
valid_runtimes = [r for r in data['runtimes'] if r is not None]
|
| 316 |
+
if valid_runtimes:
|
| 317 |
+
avg_runtime = sum(valid_runtimes) / len(valid_runtimes)
|
| 318 |
+
record[f'{category} runtime'] = avg_runtime
|
| 319 |
+
all_runtimes.extend(valid_runtimes)
|
| 320 |
else:
|
| 321 |
# Category not submitted - will show as NA
|
| 322 |
pass
|
|
|
|
| 332 |
# Average cost per instance across all benchmarks
|
| 333 |
record['average cost'] = sum(all_costs) / len(all_costs) if all_costs else None
|
| 334 |
|
| 335 |
+
# Average runtime per instance across all benchmarks
|
| 336 |
+
record['average runtime'] = sum(all_runtimes) / len(all_runtimes) if all_runtimes else None
|
| 337 |
+
|
| 338 |
# Track how many categories were completed
|
| 339 |
record['categories_completed'] = categories_with_scores
|
| 340 |
|
ui_components.py
CHANGED
|
@@ -17,6 +17,7 @@ from leaderboard_transformer import (
|
|
| 17 |
_plot_scatter_plotly,
|
| 18 |
format_cost_column,
|
| 19 |
format_score_column,
|
|
|
|
| 20 |
get_pareto_df,
|
| 21 |
clean_llm_base_list,
|
| 22 |
)
|
|
@@ -515,6 +516,10 @@ def create_leaderboard_display(
|
|
| 515 |
if "Score" in col:
|
| 516 |
df_display = format_score_column(df_display, col)
|
| 517 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
# Clean the Language Model column first
|
| 519 |
df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
|
| 520 |
|
|
@@ -583,47 +588,65 @@ def create_leaderboard_display(
|
|
| 583 |
# If no complete entries exist, show all entries by default
|
| 584 |
has_complete_entries = len(df_display_complete) > 0
|
| 585 |
|
| 586 |
-
# Determine primary score/cost columns for scatter plot
|
| 587 |
if category_name == "Overall":
|
| 588 |
primary_score_col = "Average Score"
|
| 589 |
primary_cost_col = "Average Cost"
|
|
|
|
| 590 |
else:
|
| 591 |
primary_score_col = f"{category_name} Score"
|
| 592 |
primary_cost_col = f"{category_name} Cost"
|
|
|
|
| 593 |
|
| 594 |
-
# Function to create scatter plot from data
|
| 595 |
-
def
|
| 596 |
return _plot_scatter_plotly(
|
| 597 |
data=df_data,
|
| 598 |
x=primary_cost_col if primary_cost_col in df_data.columns else None,
|
| 599 |
y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
|
| 600 |
agent_col="SDK Version",
|
| 601 |
-
name=category_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
)
|
| 603 |
|
| 604 |
-
# Create initial scatter plots for both complete and all data
|
| 605 |
-
|
| 606 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
|
| 608 |
# Now get headers from the renamed dataframe (use all entries to ensure headers are present)
|
| 609 |
df_headers = df_display_all.columns.tolist()
|
| 610 |
df_datatypes = []
|
| 611 |
for col in df_headers:
|
| 612 |
-
if col == "Logs" or "Cost" in col or "Score" in col:
|
| 613 |
df_datatypes.append("markdown")
|
| 614 |
elif col in ["SDK Version", "Language Model"]:
|
| 615 |
df_datatypes.append("html")
|
| 616 |
else:
|
| 617 |
df_datatypes.append("str")
|
| 618 |
# Dynamically set widths for the DataFrame columns
|
| 619 |
-
# Order: Language Model, SDK Version, Average Score, Average Cost, ...
|
| 620 |
-
fixed_start_widths = [280, 100, 100] # Language Model (with icons), SDK Version, Average Score
|
| 621 |
-
|
| 622 |
remaining_headers = df_headers[len(fixed_start_widths):]
|
| 623 |
for col in remaining_headers:
|
| 624 |
-
if "Score" in col or "Cost" in col:
|
| 625 |
-
|
| 626 |
-
dynamic_widths = [90] *
|
| 627 |
fixed_end_widths = [90, 100, 50] # Categories Attempted, Date, Logs
|
| 628 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 629 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
|
@@ -644,10 +667,18 @@ def create_leaderboard_display(
|
|
| 644 |
show_incomplete_checkbox = None
|
| 645 |
gr.Markdown(f"*No entries with all 5 categories completed yet. Showing all {num_total} entries.*")
|
| 646 |
|
| 647 |
-
# Plot
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
show_label=False,
|
| 652 |
)
|
| 653 |
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
|
@@ -669,17 +700,17 @@ def create_leaderboard_display(
|
|
| 669 |
elem_id="main-leaderboard"
|
| 670 |
)
|
| 671 |
|
| 672 |
-
# Update function for the toggle - updates both table and
|
| 673 |
def update_display(show_incomplete):
|
| 674 |
if show_incomplete:
|
| 675 |
-
return df_display_all,
|
| 676 |
else:
|
| 677 |
-
return df_display_complete,
|
| 678 |
|
| 679 |
show_incomplete_checkbox.change(
|
| 680 |
fn=update_display,
|
| 681 |
inputs=[show_incomplete_checkbox],
|
| 682 |
-
outputs=[dataframe_component,
|
| 683 |
)
|
| 684 |
else:
|
| 685 |
dataframe_component = gr.DataFrame(
|
|
@@ -719,21 +750,23 @@ def create_leaderboard_display(
|
|
| 719 |
new_df_display_complete = prepare_df_for_display(new_df_view_complete)
|
| 720 |
new_df_display_all = prepare_df_for_display(new_df_view_full)
|
| 721 |
|
| 722 |
-
# Create new scatter plots
|
| 723 |
-
|
| 724 |
-
|
|
|
|
|
|
|
| 725 |
|
| 726 |
# Return the appropriate data based on checkbox state
|
| 727 |
if current_checkbox_state:
|
| 728 |
-
return new_df_display_all,
|
| 729 |
else:
|
| 730 |
-
return new_df_display_complete,
|
| 731 |
|
| 732 |
# No change, return current values
|
| 733 |
if current_checkbox_state:
|
| 734 |
-
return df_display_all,
|
| 735 |
else:
|
| 736 |
-
return df_display_complete,
|
| 737 |
|
| 738 |
# Create a timer that checks for updates every 60 seconds
|
| 739 |
refresh_timer = gr.Timer(value=60)
|
|
@@ -743,7 +776,7 @@ def create_leaderboard_display(
|
|
| 743 |
refresh_timer.tick(
|
| 744 |
fn=check_and_refresh_data,
|
| 745 |
inputs=[show_incomplete_checkbox],
|
| 746 |
-
outputs=[dataframe_component,
|
| 747 |
)
|
| 748 |
else:
|
| 749 |
# If no checkbox, always show all data
|
|
@@ -756,18 +789,19 @@ def create_leaderboard_display(
|
|
| 756 |
new_transformer = DataTransformer(new_df, new_tag_map)
|
| 757 |
new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True)
|
| 758 |
new_df_display_all = prepare_df_for_display(new_df_view_full)
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
|
|
|
| 762 |
|
| 763 |
refresh_timer.tick(
|
| 764 |
fn=check_and_refresh_all,
|
| 765 |
inputs=[],
|
| 766 |
-
outputs=[dataframe_component,
|
| 767 |
)
|
| 768 |
|
| 769 |
# Return the components so they can be referenced elsewhere.
|
| 770 |
-
return
|
| 771 |
|
| 772 |
# # --- Detailed Benchmark Display ---
|
| 773 |
def create_benchmark_details_display(
|
|
@@ -807,10 +841,11 @@ def create_benchmark_details_display(
|
|
| 807 |
# 3. Prepare the data for this specific benchmark's table and plot
|
| 808 |
benchmark_score_col = f"{benchmark_name} Score"
|
| 809 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
|
|
|
| 810 |
benchmark_download_col = f"{benchmark_name} Download"
|
| 811 |
|
| 812 |
# Define the columns needed for the detailed table
|
| 813 |
-
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs', benchmark_download_col, 'id', 'Language Model']
|
| 814 |
|
| 815 |
# Filter to only columns that actually exist in the full dataframe
|
| 816 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
@@ -914,6 +949,7 @@ def create_benchmark_details_display(
|
|
| 914 |
'Attempted Benchmark',
|
| 915 |
benchmark_score_col,
|
| 916 |
benchmark_cost_col,
|
|
|
|
| 917 |
'Date',
|
| 918 |
'Logs',
|
| 919 |
benchmark_download_col
|
|
@@ -922,10 +958,16 @@ def create_benchmark_details_display(
|
|
| 922 |
if col not in benchmark_table_df.columns:
|
| 923 |
benchmark_table_df[col] = pd.NA # Add as an empty column
|
| 924 |
benchmark_table_df = benchmark_table_df[desired_cols_in_order]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 925 |
# Rename columns for a cleaner table display, as requested
|
| 926 |
benchmark_table_df.rename(columns={
|
| 927 |
benchmark_score_col: 'Score',
|
| 928 |
benchmark_cost_col: 'Cost',
|
|
|
|
| 929 |
benchmark_download_col: '⬇️', # Empty-ish header with icon hint
|
| 930 |
}, inplace=True)
|
| 931 |
|
|
@@ -933,20 +975,34 @@ def create_benchmark_details_display(
|
|
| 933 |
df_headers = benchmark_table_df.columns.tolist()
|
| 934 |
df_datatypes = []
|
| 935 |
for col in df_headers:
|
| 936 |
-
if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col:
|
| 937 |
df_datatypes.append("markdown")
|
| 938 |
elif col in ["SDK Version", "Language Model"]:
|
| 939 |
df_datatypes.append("html")
|
| 940 |
else:
|
| 941 |
df_datatypes.append("str")
|
| 942 |
-
|
|
|
|
|
|
|
| 943 |
data=full_df,
|
| 944 |
x=benchmark_cost_col,
|
| 945 |
y=benchmark_score_col,
|
| 946 |
agent_col="SDK Version",
|
| 947 |
-
name=benchmark_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 948 |
)
|
| 949 |
-
gr.Plot(value=
|
| 950 |
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
| 951 |
|
| 952 |
# Put table and key into an accordion
|
|
@@ -957,7 +1013,7 @@ def create_benchmark_details_display(
|
|
| 957 |
datatype=df_datatypes,
|
| 958 |
interactive=False,
|
| 959 |
wrap=True,
|
| 960 |
-
column_widths=[200, 80, 40, 80, 80, 150, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Date, Logs, Download
|
| 961 |
show_search="search",
|
| 962 |
elem_classes=["wrap-header-df"]
|
| 963 |
)
|
|
|
|
| 17 |
_plot_scatter_plotly,
|
| 18 |
format_cost_column,
|
| 19 |
format_score_column,
|
| 20 |
+
format_runtime_column,
|
| 21 |
get_pareto_df,
|
| 22 |
clean_llm_base_list,
|
| 23 |
)
|
|
|
|
| 516 |
if "Score" in col:
|
| 517 |
df_display = format_score_column(df_display, col)
|
| 518 |
|
| 519 |
+
for col in df_display.columns:
|
| 520 |
+
if "Runtime" in col:
|
| 521 |
+
df_display = format_runtime_column(df_display, col)
|
| 522 |
+
|
| 523 |
# Clean the Language Model column first
|
| 524 |
df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
|
| 525 |
|
|
|
|
| 588 |
# If no complete entries exist, show all entries by default
|
| 589 |
has_complete_entries = len(df_display_complete) > 0
|
| 590 |
|
| 591 |
+
# Determine primary score/cost/runtime columns for scatter plot
|
| 592 |
if category_name == "Overall":
|
| 593 |
primary_score_col = "Average Score"
|
| 594 |
primary_cost_col = "Average Cost"
|
| 595 |
+
primary_runtime_col = "Average Runtime"
|
| 596 |
else:
|
| 597 |
primary_score_col = f"{category_name} Score"
|
| 598 |
primary_cost_col = f"{category_name} Cost"
|
| 599 |
+
primary_runtime_col = f"{category_name} Runtime"
|
| 600 |
|
| 601 |
+
# Function to create cost/performance scatter plot from data
|
| 602 |
+
def create_cost_scatter_plot(df_data):
|
| 603 |
return _plot_scatter_plotly(
|
| 604 |
data=df_data,
|
| 605 |
x=primary_cost_col if primary_cost_col in df_data.columns else None,
|
| 606 |
y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
|
| 607 |
agent_col="SDK Version",
|
| 608 |
+
name=category_name,
|
| 609 |
+
plot_type='cost'
|
| 610 |
+
)
|
| 611 |
+
|
| 612 |
+
# Function to create runtime/performance scatter plot from data
|
| 613 |
+
def create_runtime_scatter_plot(df_data):
|
| 614 |
+
return _plot_scatter_plotly(
|
| 615 |
+
data=df_data,
|
| 616 |
+
x=primary_runtime_col if primary_runtime_col in df_data.columns else None,
|
| 617 |
+
y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
|
| 618 |
+
agent_col="SDK Version",
|
| 619 |
+
name=category_name,
|
| 620 |
+
plot_type='runtime'
|
| 621 |
)
|
| 622 |
|
| 623 |
+
# Create initial cost scatter plots for both complete and all data
|
| 624 |
+
cost_scatter_complete = create_cost_scatter_plot(df_view_complete) if has_complete_entries else go.Figure()
|
| 625 |
+
cost_scatter_all = create_cost_scatter_plot(df_view_full)
|
| 626 |
+
|
| 627 |
+
# Create initial runtime scatter plots for both complete and all data
|
| 628 |
+
runtime_scatter_complete = create_runtime_scatter_plot(df_view_complete) if has_complete_entries else go.Figure()
|
| 629 |
+
runtime_scatter_all = create_runtime_scatter_plot(df_view_full)
|
| 630 |
|
| 631 |
# Now get headers from the renamed dataframe (use all entries to ensure headers are present)
|
| 632 |
df_headers = df_display_all.columns.tolist()
|
| 633 |
df_datatypes = []
|
| 634 |
for col in df_headers:
|
| 635 |
+
if col == "Logs" or "Cost" in col or "Score" in col or "Runtime" in col:
|
| 636 |
df_datatypes.append("markdown")
|
| 637 |
elif col in ["SDK Version", "Language Model"]:
|
| 638 |
df_datatypes.append("html")
|
| 639 |
else:
|
| 640 |
df_datatypes.append("str")
|
| 641 |
# Dynamically set widths for the DataFrame columns
|
| 642 |
+
# Order: Language Model, SDK Version, Average Score, Average Cost, Average Runtime, ...
|
| 643 |
+
fixed_start_widths = [280, 100, 100, 90] # Language Model (with icons), SDK Version, Average Score, Average Runtime
|
| 644 |
+
num_score_cost_runtime_cols = 0
|
| 645 |
remaining_headers = df_headers[len(fixed_start_widths):]
|
| 646 |
for col in remaining_headers:
|
| 647 |
+
if "Score" in col or "Cost" in col or "Runtime" in col:
|
| 648 |
+
num_score_cost_runtime_cols += 1
|
| 649 |
+
dynamic_widths = [90] * num_score_cost_runtime_cols
|
| 650 |
fixed_end_widths = [90, 100, 50] # Categories Attempted, Date, Logs
|
| 651 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 652 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
|
|
|
| 667 |
show_incomplete_checkbox = None
|
| 668 |
gr.Markdown(f"*No entries with all 5 categories completed yet. Showing all {num_total} entries.*")
|
| 669 |
|
| 670 |
+
# Plot components - show complete entries by default if available
|
| 671 |
+
# Cost/Performance plot
|
| 672 |
+
initial_cost_plot = cost_scatter_complete if has_complete_entries else cost_scatter_all
|
| 673 |
+
cost_plot_component = gr.Plot(
|
| 674 |
+
value=initial_cost_plot,
|
| 675 |
+
show_label=False,
|
| 676 |
+
)
|
| 677 |
+
|
| 678 |
+
# Runtime/Performance plot
|
| 679 |
+
initial_runtime_plot = runtime_scatter_complete if has_complete_entries else runtime_scatter_all
|
| 680 |
+
runtime_plot_component = gr.Plot(
|
| 681 |
+
value=initial_runtime_plot,
|
| 682 |
show_label=False,
|
| 683 |
)
|
| 684 |
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
|
|
|
| 700 |
elem_id="main-leaderboard"
|
| 701 |
)
|
| 702 |
|
| 703 |
+
# Update function for the toggle - updates both table and plots
|
| 704 |
def update_display(show_incomplete):
|
| 705 |
if show_incomplete:
|
| 706 |
+
return df_display_all, cost_scatter_all, runtime_scatter_all
|
| 707 |
else:
|
| 708 |
+
return df_display_complete, cost_scatter_complete, runtime_scatter_complete
|
| 709 |
|
| 710 |
show_incomplete_checkbox.change(
|
| 711 |
fn=update_display,
|
| 712 |
inputs=[show_incomplete_checkbox],
|
| 713 |
+
outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
|
| 714 |
)
|
| 715 |
else:
|
| 716 |
dataframe_component = gr.DataFrame(
|
|
|
|
| 750 |
new_df_display_complete = prepare_df_for_display(new_df_view_complete)
|
| 751 |
new_df_display_all = prepare_df_for_display(new_df_view_full)
|
| 752 |
|
| 753 |
+
# Create new scatter plots (both cost and runtime)
|
| 754 |
+
new_cost_scatter_complete = create_cost_scatter_plot(new_df_view_complete) if len(new_df_display_complete) > 0 else go.Figure()
|
| 755 |
+
new_cost_scatter_all = create_cost_scatter_plot(new_df_view_full)
|
| 756 |
+
new_runtime_scatter_complete = create_runtime_scatter_plot(new_df_view_complete) if len(new_df_display_complete) > 0 else go.Figure()
|
| 757 |
+
new_runtime_scatter_all = create_runtime_scatter_plot(new_df_view_full)
|
| 758 |
|
| 759 |
# Return the appropriate data based on checkbox state
|
| 760 |
if current_checkbox_state:
|
| 761 |
+
return new_df_display_all, new_cost_scatter_all, new_runtime_scatter_all
|
| 762 |
else:
|
| 763 |
+
return new_df_display_complete, new_cost_scatter_complete, new_runtime_scatter_complete
|
| 764 |
|
| 765 |
# No change, return current values
|
| 766 |
if current_checkbox_state:
|
| 767 |
+
return df_display_all, cost_scatter_all, runtime_scatter_all
|
| 768 |
else:
|
| 769 |
+
return df_display_complete, cost_scatter_complete, runtime_scatter_complete
|
| 770 |
|
| 771 |
# Create a timer that checks for updates every 60 seconds
|
| 772 |
refresh_timer = gr.Timer(value=60)
|
|
|
|
| 776 |
refresh_timer.tick(
|
| 777 |
fn=check_and_refresh_data,
|
| 778 |
inputs=[show_incomplete_checkbox],
|
| 779 |
+
outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
|
| 780 |
)
|
| 781 |
else:
|
| 782 |
# If no checkbox, always show all data
|
|
|
|
| 789 |
new_transformer = DataTransformer(new_df, new_tag_map)
|
| 790 |
new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True)
|
| 791 |
new_df_display_all = prepare_df_for_display(new_df_view_full)
|
| 792 |
+
new_cost_scatter_all = create_cost_scatter_plot(new_df_view_full)
|
| 793 |
+
new_runtime_scatter_all = create_runtime_scatter_plot(new_df_view_full)
|
| 794 |
+
return new_df_display_all, new_cost_scatter_all, new_runtime_scatter_all
|
| 795 |
+
return df_display_all, cost_scatter_all, runtime_scatter_all
|
| 796 |
|
| 797 |
refresh_timer.tick(
|
| 798 |
fn=check_and_refresh_all,
|
| 799 |
inputs=[],
|
| 800 |
+
outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
|
| 801 |
)
|
| 802 |
|
| 803 |
# Return the components so they can be referenced elsewhere.
|
| 804 |
+
return cost_plot_component, runtime_plot_component, dataframe_component
|
| 805 |
|
| 806 |
# # --- Detailed Benchmark Display ---
|
| 807 |
def create_benchmark_details_display(
|
|
|
|
| 841 |
# 3. Prepare the data for this specific benchmark's table and plot
|
| 842 |
benchmark_score_col = f"{benchmark_name} Score"
|
| 843 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 844 |
+
benchmark_runtime_col = f"{benchmark_name} Runtime"
|
| 845 |
benchmark_download_col = f"{benchmark_name} Download"
|
| 846 |
|
| 847 |
# Define the columns needed for the detailed table
|
| 848 |
+
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_download_col, 'id', 'Language Model']
|
| 849 |
|
| 850 |
# Filter to only columns that actually exist in the full dataframe
|
| 851 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
|
|
| 949 |
'Attempted Benchmark',
|
| 950 |
benchmark_score_col,
|
| 951 |
benchmark_cost_col,
|
| 952 |
+
benchmark_runtime_col,
|
| 953 |
'Date',
|
| 954 |
'Logs',
|
| 955 |
benchmark_download_col
|
|
|
|
| 958 |
if col not in benchmark_table_df.columns:
|
| 959 |
benchmark_table_df[col] = pd.NA # Add as an empty column
|
| 960 |
benchmark_table_df = benchmark_table_df[desired_cols_in_order]
|
| 961 |
+
|
| 962 |
+
# Format the runtime column before renaming
|
| 963 |
+
if benchmark_runtime_col in benchmark_table_df.columns:
|
| 964 |
+
benchmark_table_df = format_runtime_column(benchmark_table_df, benchmark_runtime_col)
|
| 965 |
+
|
| 966 |
# Rename columns for a cleaner table display, as requested
|
| 967 |
benchmark_table_df.rename(columns={
|
| 968 |
benchmark_score_col: 'Score',
|
| 969 |
benchmark_cost_col: 'Cost',
|
| 970 |
+
benchmark_runtime_col: 'Runtime',
|
| 971 |
benchmark_download_col: '⬇️', # Empty-ish header with icon hint
|
| 972 |
}, inplace=True)
|
| 973 |
|
|
|
|
| 975 |
df_headers = benchmark_table_df.columns.tolist()
|
| 976 |
df_datatypes = []
|
| 977 |
for col in df_headers:
|
| 978 |
+
if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
|
| 979 |
df_datatypes.append("markdown")
|
| 980 |
elif col in ["SDK Version", "Language Model"]:
|
| 981 |
df_datatypes.append("html")
|
| 982 |
else:
|
| 983 |
df_datatypes.append("str")
|
| 984 |
+
|
| 985 |
+
# Cost/Performance plot
|
| 986 |
+
cost_benchmark_plot = _plot_scatter_plotly(
|
| 987 |
data=full_df,
|
| 988 |
x=benchmark_cost_col,
|
| 989 |
y=benchmark_score_col,
|
| 990 |
agent_col="SDK Version",
|
| 991 |
+
name=benchmark_name,
|
| 992 |
+
plot_type='cost'
|
| 993 |
+
)
|
| 994 |
+
gr.Plot(value=cost_benchmark_plot, show_label=False)
|
| 995 |
+
|
| 996 |
+
# Runtime/Performance plot
|
| 997 |
+
runtime_benchmark_plot = _plot_scatter_plotly(
|
| 998 |
+
data=full_df,
|
| 999 |
+
x=benchmark_runtime_col,
|
| 1000 |
+
y=benchmark_score_col,
|
| 1001 |
+
agent_col="SDK Version",
|
| 1002 |
+
name=benchmark_name,
|
| 1003 |
+
plot_type='runtime'
|
| 1004 |
)
|
| 1005 |
+
gr.Plot(value=runtime_benchmark_plot, show_label=False)
|
| 1006 |
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
| 1007 |
|
| 1008 |
# Put table and key into an accordion
|
|
|
|
| 1013 |
datatype=df_datatypes,
|
| 1014 |
interactive=False,
|
| 1015 |
wrap=True,
|
| 1016 |
+
column_widths=[200, 80, 40, 80, 80, 80, 150, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Download
|
| 1017 |
show_search="search",
|
| 1018 |
elem_classes=["wrap-header-df"]
|
| 1019 |
)
|