Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
5998027
1
Parent(s):
55da48c
feat: Update leaderboard calculations and add incomplete entries toggle
Browse files
Changes:
1. Rename 'OpenHands Version' to 'SDK Version'
2. Rename 'Overall Score' to 'Average Score' - now divides by 5 regardless
of categories completed (missing categories count as 0)
3. Rename 'Overall Cost' to 'Total Cost' - now sums all category costs
4. Add 'Show incomplete entries' toggle (default: hidden) to filter
entries that don't have all 5 categories submitted
5. Fix agent grouping to use version+model combination instead of just
version (fixes issue where different models with same SDK version
were incorrectly merged)
6. Fix model_dump() to use mode='json' for proper enum serialization
7. Track 'categories_completed' count per agent
Co-authored-by: openhands <openhands@all-hands.dev>
- leaderboard_transformer.py +16 -12
- simple_data_loader.py +42 -27
- ui_components.py +106 -67
leaderboard_transformer.py
CHANGED
|
@@ -96,19 +96,23 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 96 |
"""
|
| 97 |
Takes a raw column name from the DataFrame and returns a "pretty" version.
|
| 98 |
Handles three cases:
|
| 99 |
-
1. Fixed names (e.g., '
|
| 100 |
2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
|
| 101 |
3. Fallback for any other names.
|
| 102 |
"""
|
| 103 |
# Case 1: Handle fixed, special-case mappings first.
|
| 104 |
fixed_mappings = {
|
| 105 |
'id': 'id',
|
| 106 |
-
'
|
|
|
|
| 107 |
'Language model': 'Language Model',
|
| 108 |
'Agent description': 'Agent Description',
|
| 109 |
'Submission date': 'Date',
|
| 110 |
-
'
|
| 111 |
-
'Overall
|
|
|
|
|
|
|
|
|
|
| 112 |
'Logs': 'Logs',
|
| 113 |
'Openness': 'Openness',
|
| 114 |
'LLM base': 'Model',
|
|
@@ -256,7 +260,7 @@ class DataTransformer:
|
|
| 256 |
df_view = df_sorted.copy()
|
| 257 |
|
| 258 |
# --- 3. Add Columns for Agent Openness ---
|
| 259 |
-
base_cols = ["id","Language Model","
|
| 260 |
new_cols = ["Openness"]
|
| 261 |
ending_cols = ["Date", "Logs"]
|
| 262 |
|
|
@@ -310,7 +314,7 @@ class DataTransformer:
|
|
| 310 |
data=df_view,
|
| 311 |
x=primary_cost_col,
|
| 312 |
y=primary_score_col,
|
| 313 |
-
agent_col="
|
| 314 |
name=primary_metric
|
| 315 |
) if use_plotly else go.Figure()
|
| 316 |
# Use a consistent key for easy retrieval later
|
|
@@ -324,7 +328,7 @@ class DataTransformer:
|
|
| 324 |
plots['scatter_plot'] = go.Figure()
|
| 325 |
return df_view, plots
|
| 326 |
|
| 327 |
-
DEFAULT_Y_COLUMN = "
|
| 328 |
DUMMY_X_VALUE_FOR_MISSING_COSTS = 0
|
| 329 |
|
| 330 |
def _plot_scatter_plotly(
|
|
@@ -551,7 +555,7 @@ def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
|
|
| 551 |
- If both cost and score are null, it becomes "Not Attempted".
|
| 552 |
Args:
|
| 553 |
df: The DataFrame to modify.
|
| 554 |
-
cost_col_name: The name of the cost column to format (e.g., "
|
| 555 |
Returns:
|
| 556 |
The DataFrame with the formatted cost column.
|
| 557 |
"""
|
|
@@ -584,10 +588,10 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
|
|
| 584 |
Applies custom formatting to a score column for display.
|
| 585 |
- If a score is 0 or NaN, it's displayed as a colored "0".
|
| 586 |
- Other scores are formatted to two decimal places.
|
| 587 |
-
-
|
| 588 |
"""
|
| 589 |
status_color = "#ec4899" # The same color as your other status text
|
| 590 |
-
|
| 591 |
|
| 592 |
def apply_formatting(score_value):
|
| 593 |
# Explicitly handle missing values without turning them into zeros
|
|
@@ -601,8 +605,8 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
|
|
| 601 |
else:
|
| 602 |
formatted = str(score_value)
|
| 603 |
|
| 604 |
-
# Make
|
| 605 |
-
if
|
| 606 |
return f"<strong>{formatted}</strong>"
|
| 607 |
return formatted
|
| 608 |
|
|
|
|
| 96 |
"""
|
| 97 |
Takes a raw column name from the DataFrame and returns a "pretty" version.
|
| 98 |
Handles three cases:
|
| 99 |
+
1. Fixed names (e.g., 'SDK version' -> 'SDK Version', 'Language model' -> 'Language Model').
|
| 100 |
2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
|
| 101 |
3. Fallback for any other names.
|
| 102 |
"""
|
| 103 |
# Case 1: Handle fixed, special-case mappings first.
|
| 104 |
fixed_mappings = {
|
| 105 |
'id': 'id',
|
| 106 |
+
'SDK version': 'SDK Version',
|
| 107 |
+
'Openhands version': 'SDK Version', # Legacy support
|
| 108 |
'Language model': 'Language Model',
|
| 109 |
'Agent description': 'Agent Description',
|
| 110 |
'Submission date': 'Date',
|
| 111 |
+
'average score': 'Average Score',
|
| 112 |
+
'Overall': 'Average Score', # Legacy support
|
| 113 |
+
'total cost': 'Total Cost',
|
| 114 |
+
'Overall cost': 'Total Cost', # Legacy support
|
| 115 |
+
'categories_completed': 'Categories Completed',
|
| 116 |
'Logs': 'Logs',
|
| 117 |
'Openness': 'Openness',
|
| 118 |
'LLM base': 'Model',
|
|
|
|
| 260 |
df_view = df_sorted.copy()
|
| 261 |
|
| 262 |
# --- 3. Add Columns for Agent Openness ---
|
| 263 |
+
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 264 |
new_cols = ["Openness"]
|
| 265 |
ending_cols = ["Date", "Logs"]
|
| 266 |
|
|
|
|
| 314 |
data=df_view,
|
| 315 |
x=primary_cost_col,
|
| 316 |
y=primary_score_col,
|
| 317 |
+
agent_col="SDK Version",
|
| 318 |
name=primary_metric
|
| 319 |
) if use_plotly else go.Figure()
|
| 320 |
# Use a consistent key for easy retrieval later
|
|
|
|
| 328 |
plots['scatter_plot'] = go.Figure()
|
| 329 |
return df_view, plots
|
| 330 |
|
| 331 |
+
DEFAULT_Y_COLUMN = "Average Score"
|
| 332 |
DUMMY_X_VALUE_FOR_MISSING_COSTS = 0
|
| 333 |
|
| 334 |
def _plot_scatter_plotly(
|
|
|
|
| 555 |
- If both cost and score are null, it becomes "Not Attempted".
|
| 556 |
Args:
|
| 557 |
df: The DataFrame to modify.
|
| 558 |
+
cost_col_name: The name of the cost column to format (e.g., "Total Cost").
|
| 559 |
Returns:
|
| 560 |
The DataFrame with the formatted cost column.
|
| 561 |
"""
|
|
|
|
| 588 |
Applies custom formatting to a score column for display.
|
| 589 |
- If a score is 0 or NaN, it's displayed as a colored "0".
|
| 590 |
- Other scores are formatted to two decimal places.
|
| 591 |
+
- Average Score values are displayed in bold.
|
| 592 |
"""
|
| 593 |
status_color = "#ec4899" # The same color as your other status text
|
| 594 |
+
is_average_score = (score_col_name == "Average Score")
|
| 595 |
|
| 596 |
def apply_formatting(score_value):
|
| 597 |
# Explicitly handle missing values without turning them into zeros
|
|
|
|
| 605 |
else:
|
| 606 |
formatted = str(score_value)
|
| 607 |
|
| 608 |
+
# Make Average Score bold
|
| 609 |
+
if is_average_score and score_value != 0:
|
| 610 |
return f"<strong>{formatted}</strong>"
|
| 611 |
return formatted
|
| 612 |
|
simple_data_loader.py
CHANGED
|
@@ -65,7 +65,8 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
|
|
| 65 |
if _ensure_schema_models() and Metadata and ScoreEntry:
|
| 66 |
try:
|
| 67 |
validated_metadata = Metadata(**metadata_raw)
|
| 68 |
-
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
errors.append(f"Metadata validation error in {agent_dir.name}: {e}")
|
| 71 |
metadata_dict = metadata_raw # Fall back to raw data
|
|
@@ -74,7 +75,8 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
|
|
| 74 |
for i, score in enumerate(scores_raw):
|
| 75 |
try:
|
| 76 |
validated_score = ScoreEntry(**score)
|
| 77 |
-
|
|
|
|
| 78 |
except Exception as e:
|
| 79 |
errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
|
| 80 |
validated_scores.append(score) # Fall back to raw data
|
|
@@ -223,23 +225,30 @@ class SimpleLeaderboardViewer:
|
|
| 223 |
try:
|
| 224 |
|
| 225 |
# Transform to expected format for leaderboard
|
| 226 |
-
# Group by agent to aggregate results across datasets
|
| 227 |
transformed_records = []
|
| 228 |
|
| 229 |
-
for
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
# Build a single record for this agent
|
| 233 |
first_record = agent_records.iloc[0]
|
|
|
|
| 234 |
|
| 235 |
# Normalize openness to "open" or "closed"
|
| 236 |
from aliases import OPENNESS_MAPPING
|
| 237 |
raw_openness = first_record['openness']
|
| 238 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 239 |
|
|
|
|
|
|
|
|
|
|
| 240 |
record = {
|
| 241 |
# Core agent info - use final display names
|
| 242 |
-
'
|
| 243 |
'Language model': first_record['llm_base'], # Will become "Language Model"
|
| 244 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
| 245 |
'date': first_record['submission_time'], # Will become "Date"
|
|
@@ -273,30 +282,36 @@ class SimpleLeaderboardViewer:
|
|
| 273 |
category_data[category]['scores'].append(row['score'])
|
| 274 |
category_data[category]['costs'].append(row['total_cost'])
|
| 275 |
|
| 276 |
-
# Calculate category-level aggregates
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
for category
|
| 280 |
-
if
|
|
|
|
| 281 |
avg_score = sum(data['scores']) / len(data['scores'])
|
| 282 |
record[f'{category} score'] = avg_score
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
-
#
|
| 290 |
-
|
| 291 |
-
if category_avg_scores:
|
| 292 |
-
record['overall score'] = sum(category_avg_scores) / len(category_avg_scores)
|
| 293 |
-
else:
|
| 294 |
-
record['overall score'] = None
|
| 295 |
-
|
| 296 |
-
if category_avg_costs:
|
| 297 |
-
record['overall cost'] = sum(category_avg_costs) / len(category_avg_costs)
|
| 298 |
-
else:
|
| 299 |
-
record['overall cost'] = None
|
| 300 |
|
| 301 |
transformed_records.append(record)
|
| 302 |
|
|
|
|
| 65 |
if _ensure_schema_models() and Metadata and ScoreEntry:
|
| 66 |
try:
|
| 67 |
validated_metadata = Metadata(**metadata_raw)
|
| 68 |
+
# Use mode='json' to serialize enums as strings
|
| 69 |
+
metadata_dict = validated_metadata.model_dump(mode='json')
|
| 70 |
except Exception as e:
|
| 71 |
errors.append(f"Metadata validation error in {agent_dir.name}: {e}")
|
| 72 |
metadata_dict = metadata_raw # Fall back to raw data
|
|
|
|
| 75 |
for i, score in enumerate(scores_raw):
|
| 76 |
try:
|
| 77 |
validated_score = ScoreEntry(**score)
|
| 78 |
+
# Use mode='json' to serialize enums as strings
|
| 79 |
+
validated_scores.append(validated_score.model_dump(mode='json'))
|
| 80 |
except Exception as e:
|
| 81 |
errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
|
| 82 |
validated_scores.append(score) # Fall back to raw data
|
|
|
|
| 225 |
try:
|
| 226 |
|
| 227 |
# Transform to expected format for leaderboard
|
| 228 |
+
# Group by agent (version + model combination) to aggregate results across datasets
|
| 229 |
transformed_records = []
|
| 230 |
|
| 231 |
+
# Create a unique identifier for each agent (version + model)
|
| 232 |
+
df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
|
| 233 |
+
|
| 234 |
+
for agent_id in df['agent_id'].unique():
|
| 235 |
+
agent_records = df[df['agent_id'] == agent_id]
|
| 236 |
|
| 237 |
# Build a single record for this agent
|
| 238 |
first_record = agent_records.iloc[0]
|
| 239 |
+
agent_version = first_record['agent_version']
|
| 240 |
|
| 241 |
# Normalize openness to "open" or "closed"
|
| 242 |
from aliases import OPENNESS_MAPPING
|
| 243 |
raw_openness = first_record['openness']
|
| 244 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 245 |
|
| 246 |
+
# All 5 categories for the leaderboard
|
| 247 |
+
ALL_CATEGORIES = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
|
| 248 |
+
|
| 249 |
record = {
|
| 250 |
# Core agent info - use final display names
|
| 251 |
+
'SDK version': agent_version, # Will become "SDK Version"
|
| 252 |
'Language model': first_record['llm_base'], # Will become "Language Model"
|
| 253 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
| 254 |
'date': first_record['submission_time'], # Will become "Date"
|
|
|
|
| 282 |
category_data[category]['scores'].append(row['score'])
|
| 283 |
category_data[category]['costs'].append(row['total_cost'])
|
| 284 |
|
| 285 |
+
# Calculate category-level aggregates and track total cost
|
| 286 |
+
total_cost = 0.0
|
| 287 |
+
categories_with_scores = 0
|
| 288 |
+
for category in ALL_CATEGORIES:
|
| 289 |
+
if category in category_data and category_data[category]['scores']:
|
| 290 |
+
data = category_data[category]
|
| 291 |
avg_score = sum(data['scores']) / len(data['scores'])
|
| 292 |
record[f'{category} score'] = avg_score
|
| 293 |
+
categories_with_scores += 1
|
| 294 |
+
if data['costs']:
|
| 295 |
+
cat_cost = sum(data['costs'])
|
| 296 |
+
record[f'{category} cost'] = cat_cost
|
| 297 |
+
total_cost += cat_cost
|
| 298 |
+
else:
|
| 299 |
+
# Category not submitted - will show as NA
|
| 300 |
+
pass
|
| 301 |
+
|
| 302 |
+
# Calculate average score: always divide by 5 (treating missing categories as 0)
|
| 303 |
+
# This penalizes incomplete submissions
|
| 304 |
+
score_sum = sum(
|
| 305 |
+
record.get(f'{cat} score', 0) or 0
|
| 306 |
+
for cat in ALL_CATEGORIES
|
| 307 |
+
)
|
| 308 |
+
record['average score'] = score_sum / 5
|
| 309 |
+
|
| 310 |
+
# Total cost is the sum of all category costs
|
| 311 |
+
record['total cost'] = total_cost if total_cost > 0 else None
|
| 312 |
|
| 313 |
+
# Track how many categories were completed
|
| 314 |
+
record['categories_completed'] = categories_with_scores
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
transformed_records.append(record)
|
| 317 |
|
ui_components.py
CHANGED
|
@@ -147,10 +147,10 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 147 |
"""Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
|
| 148 |
if table == "Overall":
|
| 149 |
return """
|
| 150 |
-
<div class="tooltip-description-item"><b>
|
| 151 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 152 |
-
<div class="tooltip-description-item"><b>
|
| 153 |
-
<div class="tooltip-description-item"><b>
|
| 154 |
<div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
|
| 155 |
<div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
|
| 156 |
<div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
|
|
@@ -166,7 +166,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 166 |
"""
|
| 167 |
elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
|
| 168 |
return f"""
|
| 169 |
-
<div class="tooltip-description-item"><b>
|
| 170 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 171 |
<div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
|
| 172 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
|
|
@@ -178,7 +178,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 178 |
else:
|
| 179 |
# Fallback for any other table type, e.g., individual benchmarks
|
| 180 |
return f"""
|
| 181 |
-
<div class="tooltip-description-item"><b>
|
| 182 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 183 |
<div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
|
| 184 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
|
@@ -360,70 +360,83 @@ def create_leaderboard_display(
|
|
| 360 |
# 1. Instantiate the transformer and get the specific view for this category.
|
| 361 |
# The function no longer loads data itself; it filters the data it receives.
|
| 362 |
transformer = DataTransformer(full_df, tag_map)
|
| 363 |
-
|
| 364 |
-
pareto_df = get_pareto_df(df_view)
|
| 365 |
-
# Get the list of agents on the frontier. We'll use this list later.
|
| 366 |
-
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
|
| 367 |
-
trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
|
| 368 |
-
if not pareto_df.empty and 'id' in pareto_df.columns:
|
| 369 |
-
pareto_agent_names = pareto_df['id'].tolist()
|
| 370 |
-
else:
|
| 371 |
-
pareto_agent_names = []
|
| 372 |
-
df_view['Pareto'] = df_view.apply(
|
| 373 |
-
lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
|
| 374 |
-
axis=1
|
| 375 |
-
)
|
| 376 |
-
# Generate openness icons for each row
|
| 377 |
-
def get_openness_icon_html(row):
|
| 378 |
-
openness_val = row.get('Openness', '')
|
| 379 |
-
uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
|
| 380 |
-
return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
|
| 381 |
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
df_view['Language Model'] = df_view['Language Model'].apply(clean_llm_base_list)
|
| 396 |
-
df_view['Language Model'] = df_view['Language Model'].apply(format_llm_base_with_html)
|
| 397 |
-
# append the repro url to the end of the OpenHands Version
|
| 398 |
-
if 'Source' in df_view.columns:
|
| 399 |
-
df_view['OpenHands Version'] = df_view.apply(
|
| 400 |
-
lambda row: f"{row['OpenHands Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['OpenHands Version'],
|
| 401 |
axis=1
|
| 402 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
# Now get headers from the renamed dataframe
|
| 421 |
-
df_headers =
|
| 422 |
df_datatypes = []
|
| 423 |
for col in df_headers:
|
| 424 |
if col == "Logs" or "Cost" in col or "Score" in col:
|
| 425 |
df_datatypes.append("markdown")
|
| 426 |
-
elif col in ["
|
| 427 |
df_datatypes.append("html")
|
| 428 |
else:
|
| 429 |
df_datatypes.append("str")
|
|
@@ -451,9 +464,21 @@ def create_leaderboard_display(
|
|
| 451 |
|
| 452 |
# Put table and key into an accordion
|
| 453 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
dataframe_component = gr.DataFrame(
|
| 455 |
headers=df_headers,
|
| 456 |
-
value=
|
| 457 |
datatype=df_datatypes,
|
| 458 |
interactive=False,
|
| 459 |
wrap=True,
|
|
@@ -462,6 +487,20 @@ def create_leaderboard_display(
|
|
| 462 |
show_search="search",
|
| 463 |
elem_id="main-leaderboard"
|
| 464 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
legend_markdown = create_legend_markdown(category_name)
|
| 466 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
| 467 |
|
|
@@ -508,7 +547,7 @@ def create_benchmark_details_display(
|
|
| 508 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 509 |
|
| 510 |
# Define the columns needed for the detailed table
|
| 511 |
-
table_cols = ['
|
| 512 |
|
| 513 |
# Filter to only columns that actually exist in the full dataframe
|
| 514 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
@@ -543,10 +582,10 @@ def create_benchmark_details_display(
|
|
| 543 |
#Make pretty and format the Language Model column
|
| 544 |
benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
|
| 545 |
benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
|
| 546 |
-
# append the repro url to the end of the
|
| 547 |
if 'Source' in benchmark_table_df.columns:
|
| 548 |
-
benchmark_table_df['
|
| 549 |
-
lambda row: f"{row['
|
| 550 |
axis=1
|
| 551 |
)
|
| 552 |
|
|
@@ -574,7 +613,7 @@ def create_benchmark_details_display(
|
|
| 574 |
'Pareto',
|
| 575 |
'Icon',
|
| 576 |
'Language Model',
|
| 577 |
-
'
|
| 578 |
'Attempted Benchmark',
|
| 579 |
benchmark_score_col,
|
| 580 |
benchmark_cost_col,
|
|
@@ -603,7 +642,7 @@ def create_benchmark_details_display(
|
|
| 603 |
for col in df_headers:
|
| 604 |
if "Logs" in col or "Cost" in col or "Score" in col:
|
| 605 |
df_datatypes.append("markdown")
|
| 606 |
-
elif col in ["
|
| 607 |
df_datatypes.append("html")
|
| 608 |
else:
|
| 609 |
df_datatypes.append("str")
|
|
@@ -611,7 +650,7 @@ def create_benchmark_details_display(
|
|
| 611 |
data=full_df,
|
| 612 |
x=benchmark_cost_col,
|
| 613 |
y=benchmark_score_col,
|
| 614 |
-
agent_col="
|
| 615 |
name=benchmark_name
|
| 616 |
)
|
| 617 |
with gr.Row():
|
|
|
|
| 147 |
"""Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
|
| 148 |
if table == "Overall":
|
| 149 |
return """
|
| 150 |
+
<div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands SDK evaluated.</div>
|
| 151 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 152 |
+
<div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
|
| 153 |
+
<div class="tooltip-description-item"><b>Total Cost:</b> Sum of costs across all submitted categories, in USD.</div>
|
| 154 |
<div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
|
| 155 |
<div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
|
| 156 |
<div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
|
|
|
|
| 166 |
"""
|
| 167 |
elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
|
| 168 |
return f"""
|
| 169 |
+
<div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
|
| 170 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 171 |
<div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
|
| 172 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
|
|
|
|
| 178 |
else:
|
| 179 |
# Fallback for any other table type, e.g., individual benchmarks
|
| 180 |
return f"""
|
| 181 |
+
<div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
|
| 182 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 183 |
<div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
|
| 184 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
|
|
|
| 360 |
# 1. Instantiate the transformer and get the specific view for this category.
|
| 361 |
# The function no longer loads data itself; it filters the data it receives.
|
| 362 |
transformer = DataTransformer(full_df, tag_map)
|
| 363 |
+
df_view_full, plots_dict = transformer.view(tag=category_name, use_plotly=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
+
def prepare_df_for_display(df_view):
|
| 366 |
+
"""Prepare a DataFrame for display with all formatting applied."""
|
| 367 |
+
df_display = df_view.copy()
|
| 368 |
+
|
| 369 |
+
pareto_df = get_pareto_df(df_display)
|
| 370 |
+
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
|
| 371 |
+
trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
|
| 372 |
+
if not pareto_df.empty and 'id' in pareto_df.columns:
|
| 373 |
+
pareto_agent_names = pareto_df['id'].tolist()
|
| 374 |
+
else:
|
| 375 |
+
pareto_agent_names = []
|
| 376 |
+
df_display['Pareto'] = df_display.apply(
|
| 377 |
+
lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
axis=1
|
| 379 |
)
|
| 380 |
+
|
| 381 |
+
def get_openness_icon_html(row):
|
| 382 |
+
openness_val = row.get('Openness', '')
|
| 383 |
+
uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
|
| 384 |
+
return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
|
| 385 |
+
|
| 386 |
+
df_display['Icon'] = df_display.apply(get_openness_icon_html, axis=1)
|
| 387 |
+
|
| 388 |
+
for col in df_display.columns:
|
| 389 |
+
if "Cost" in col:
|
| 390 |
+
df_display = format_cost_column(df_display, col)
|
| 391 |
+
|
| 392 |
+
for col in df_display.columns:
|
| 393 |
+
if "Score" in col:
|
| 394 |
+
df_display = format_score_column(df_display, col)
|
| 395 |
+
|
| 396 |
+
df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
|
| 397 |
+
df_display['Language Model'] = df_display['Language Model'].apply(format_llm_base_with_html)
|
| 398 |
+
|
| 399 |
+
if 'Source' in df_display.columns:
|
| 400 |
+
df_display['SDK Version'] = df_display.apply(
|
| 401 |
+
lambda row: f"{row['SDK Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['SDK Version'],
|
| 402 |
+
axis=1
|
| 403 |
+
)
|
| 404 |
|
| 405 |
+
all_cols = df_display.columns.tolist()
|
| 406 |
+
all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
|
| 407 |
+
all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
|
| 408 |
+
df_display = df_display[all_cols]
|
| 409 |
+
|
| 410 |
+
columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source', 'Categories Completed']
|
| 411 |
+
df_display = df_display.drop(columns=columns_to_drop, errors='ignore')
|
| 412 |
+
|
| 413 |
+
header_rename_map = {
|
| 414 |
+
"Pareto": "",
|
| 415 |
+
"Icon": "",
|
| 416 |
+
}
|
| 417 |
+
df_display = df_display.rename(columns=header_rename_map)
|
| 418 |
+
|
| 419 |
+
return df_display
|
| 420 |
+
|
| 421 |
+
# Prepare both complete and all entries versions
|
| 422 |
+
# Complete entries have all 5 categories submitted
|
| 423 |
+
if 'Categories Completed' in df_view_full.columns:
|
| 424 |
+
df_view_complete = df_view_full[df_view_full['Categories Completed'] == 5].copy()
|
| 425 |
+
else:
|
| 426 |
+
df_view_complete = df_view_full.copy()
|
| 427 |
+
|
| 428 |
+
df_display_complete = prepare_df_for_display(df_view_complete)
|
| 429 |
+
df_display_all = prepare_df_for_display(df_view_full)
|
| 430 |
+
|
| 431 |
+
scatter_plot = plots_dict.get('scatter_plot', go.Figure())
|
| 432 |
|
| 433 |
# Now get headers from the renamed dataframe
|
| 434 |
+
df_headers = df_display_complete.columns.tolist()
|
| 435 |
df_datatypes = []
|
| 436 |
for col in df_headers:
|
| 437 |
if col == "Logs" or "Cost" in col or "Score" in col:
|
| 438 |
df_datatypes.append("markdown")
|
| 439 |
+
elif col in ["SDK Version","Language Model", ""]: # "" for renamed Pareto/Icon columns
|
| 440 |
df_datatypes.append("html")
|
| 441 |
else:
|
| 442 |
df_datatypes.append("str")
|
|
|
|
| 464 |
|
| 465 |
# Put table and key into an accordion
|
| 466 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
| 467 |
+
# Add toggle for showing incomplete entries
|
| 468 |
+
num_complete = len(df_display_complete)
|
| 469 |
+
num_total = len(df_display_all)
|
| 470 |
+
num_incomplete = num_total - num_complete
|
| 471 |
+
|
| 472 |
+
show_incomplete_checkbox = gr.Checkbox(
|
| 473 |
+
label=f"Show incomplete entries ({num_incomplete} entries with fewer than 5 categories)",
|
| 474 |
+
value=False,
|
| 475 |
+
elem_id="show-incomplete-toggle"
|
| 476 |
+
)
|
| 477 |
+
|
| 478 |
+
# Start with complete entries only (default)
|
| 479 |
dataframe_component = gr.DataFrame(
|
| 480 |
headers=df_headers,
|
| 481 |
+
value=df_display_complete,
|
| 482 |
datatype=df_datatypes,
|
| 483 |
interactive=False,
|
| 484 |
wrap=True,
|
|
|
|
| 487 |
show_search="search",
|
| 488 |
elem_id="main-leaderboard"
|
| 489 |
)
|
| 490 |
+
|
| 491 |
+
# Update function for the toggle
|
| 492 |
+
def update_table(show_incomplete):
|
| 493 |
+
if show_incomplete:
|
| 494 |
+
return df_display_all
|
| 495 |
+
else:
|
| 496 |
+
return df_display_complete
|
| 497 |
+
|
| 498 |
+
show_incomplete_checkbox.change(
|
| 499 |
+
fn=update_table,
|
| 500 |
+
inputs=[show_incomplete_checkbox],
|
| 501 |
+
outputs=[dataframe_component]
|
| 502 |
+
)
|
| 503 |
+
|
| 504 |
legend_markdown = create_legend_markdown(category_name)
|
| 505 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
| 506 |
|
|
|
|
| 547 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 548 |
|
| 549 |
# Define the columns needed for the detailed table
|
| 550 |
+
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
|
| 551 |
|
| 552 |
# Filter to only columns that actually exist in the full dataframe
|
| 553 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
|
|
| 582 |
#Make pretty and format the Language Model column
|
| 583 |
benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
|
| 584 |
benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
|
| 585 |
+
# append the repro url to the end of the SDK Version
|
| 586 |
if 'Source' in benchmark_table_df.columns:
|
| 587 |
+
benchmark_table_df['SDK Version'] = benchmark_table_df.apply(
|
| 588 |
+
lambda row: f"{row['SDK Version']} {row['Source']}" if row['Source'] else row['SDK Version'],
|
| 589 |
axis=1
|
| 590 |
)
|
| 591 |
|
|
|
|
| 613 |
'Pareto',
|
| 614 |
'Icon',
|
| 615 |
'Language Model',
|
| 616 |
+
'SDK Version',
|
| 617 |
'Attempted Benchmark',
|
| 618 |
benchmark_score_col,
|
| 619 |
benchmark_cost_col,
|
|
|
|
| 642 |
for col in df_headers:
|
| 643 |
if "Logs" in col or "Cost" in col or "Score" in col:
|
| 644 |
df_datatypes.append("markdown")
|
| 645 |
+
elif col in ["SDK Version", "Language Model", ""]: # "" for renamed Pareto/Icon columns
|
| 646 |
df_datatypes.append("html")
|
| 647 |
else:
|
| 648 |
df_datatypes.append("str")
|
|
|
|
| 650 |
data=full_df,
|
| 651 |
x=benchmark_cost_col,
|
| 652 |
y=benchmark_score_col,
|
| 653 |
+
agent_col="SDK Version",
|
| 654 |
name=benchmark_name
|
| 655 |
)
|
| 656 |
with gr.Row():
|