Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
6737ff3
1
Parent(s):
af81bcf
UI cleanup and About page updates
Browse files
1. Removed legend box from graph (OpenHands/Pareto/Company Logos)
2. Combined Pareto, lock, and company logo columns into Language Model column
3. Removed dataset structure SVG image from main page
4. Updated Acknowledgements section - shortened AstaBench credit, added links to component benchmarks
5. Updated Citation to be for OpenHands Index with authors: Juan Michelini, Simon Rosenberg, Xingyao Wang, Graham Neubig
Co-authored-by: openhands <openhands@all-hands.dev>
- about.py +15 -17
- main_page.py +0 -11
- ui_components.py +89 -84
about.py
CHANGED
|
@@ -155,23 +155,21 @@ def build_page():
|
|
| 155 |
"""
|
| 156 |
<h2>Acknowledgements</h2>
|
| 157 |
<p>
|
| 158 |
-
The OpenHands Index leaderboard interface
|
| 159 |
<a href="https://huggingface.co/spaces/allenai/asta-bench-leaderboard" target="_blank" class="primary-link-button">AstaBench Leaderboard</a>
|
| 160 |
-
developed by the Allen Institute for AI.
|
| 161 |
-
a clear and effective leaderboard design that we have customized for the software engineering domain.
|
| 162 |
</p>
|
| 163 |
<p>
|
| 164 |
-
|
| 165 |
</p>
|
| 166 |
<ul class="info-list">
|
| 167 |
-
<li>
|
| 168 |
-
<li
|
| 169 |
-
<li>
|
|
|
|
|
|
|
|
|
|
| 170 |
</ul>
|
| 171 |
-
<p>
|
| 172 |
-
We have extended and modified this foundation to support software engineering benchmarks and the
|
| 173 |
-
specific requirements of evaluating AI coding agents.
|
| 174 |
-
</p>
|
| 175 |
"""
|
| 176 |
)
|
| 177 |
gr.Markdown("---", elem_classes="divider-line")
|
|
@@ -181,14 +179,14 @@ def build_page():
|
|
| 181 |
"""
|
| 182 |
<h2>Citation</h2>
|
| 183 |
<p>
|
| 184 |
-
If you
|
| 185 |
</p>
|
| 186 |
<pre class="citation-block">
|
| 187 |
-
@misc{
|
| 188 |
-
title={OpenHands:
|
| 189 |
-
author={
|
| 190 |
-
year={
|
| 191 |
-
howpublished={https://
|
| 192 |
}</pre>
|
| 193 |
"""
|
| 194 |
)
|
|
|
|
| 155 |
"""
|
| 156 |
<h2>Acknowledgements</h2>
|
| 157 |
<p>
|
| 158 |
+
The OpenHands Index leaderboard interface is adapted from the
|
| 159 |
<a href="https://huggingface.co/spaces/allenai/asta-bench-leaderboard" target="_blank" class="primary-link-button">AstaBench Leaderboard</a>
|
| 160 |
+
developed by the Allen Institute for AI.
|
|
|
|
| 161 |
</p>
|
| 162 |
<p>
|
| 163 |
+
We thank the teams behind the component benchmarks used in OpenHands Index:
|
| 164 |
</p>
|
| 165 |
<ul class="info-list">
|
| 166 |
+
<li><a href="https://www.swebench.com/" target="_blank">SWE-bench</a> - Princeton NLP Group</li>
|
| 167 |
+
<li><a href="https://github.com/multi-swe-bench/multi-swe-bench" target="_blank">Multi-SWE-bench</a> - Multi-SWE-bench Team</li>
|
| 168 |
+
<li><a href="https://github.com/OpenHands/SWE-bench-multimodal" target="_blank">SWE-bench Multimodal</a> - OpenHands Team</li>
|
| 169 |
+
<li><a href="https://github.com/logic-star-ai/swt-bench" target="_blank">SWT-bench</a> - Logic Star AI</li>
|
| 170 |
+
<li><a href="https://github.com/commit-0/commit0" target="_blank">Commit0</a> - Commit0 Team</li>
|
| 171 |
+
<li><a href="https://huggingface.co/gaia-benchmark" target="_blank">GAIA</a> - Hugging Face & Meta AI</li>
|
| 172 |
</ul>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
"""
|
| 174 |
)
|
| 175 |
gr.Markdown("---", elem_classes="divider-line")
|
|
|
|
| 179 |
"""
|
| 180 |
<h2>Citation</h2>
|
| 181 |
<p>
|
| 182 |
+
If you reference the OpenHands Index in your work, please cite:
|
| 183 |
</p>
|
| 184 |
<pre class="citation-block">
|
| 185 |
+
@misc{openhandsindex2025,
|
| 186 |
+
title={OpenHands Index: A Comprehensive Leaderboard for AI Coding Agents},
|
| 187 |
+
author={Juan Michelini and Simon Rosenberg and Xingyao Wang and Graham Neubig},
|
| 188 |
+
year={2025},
|
| 189 |
+
howpublished={https://huggingface.co/spaces/OpenHands/openhands-index}
|
| 190 |
}</pre>
|
| 191 |
"""
|
| 192 |
)
|
main_page.py
CHANGED
|
@@ -20,17 +20,6 @@ def build_page():
|
|
| 20 |
with gr.Column(scale=1):
|
| 21 |
gr.HTML(INTRO_PARAGRAPH, elem_id="intro-paragraph")
|
| 22 |
|
| 23 |
-
with gr.Column(scale=1):
|
| 24 |
-
gr.Image(
|
| 25 |
-
value="assets/overall.svg",
|
| 26 |
-
show_label=False,
|
| 27 |
-
interactive=False,
|
| 28 |
-
show_download_button=False,
|
| 29 |
-
show_fullscreen_button=False,
|
| 30 |
-
show_share_button=False,
|
| 31 |
-
elem_id="diagram-image"
|
| 32 |
-
)
|
| 33 |
-
|
| 34 |
# --- Leaderboard Display Section ---
|
| 35 |
gr.Markdown("---")
|
| 36 |
CATEGORY_NAME = "Overall"
|
|
|
|
| 20 |
with gr.Column(scale=1):
|
| 21 |
gr.HTML(INTRO_PARAGRAPH, elem_id="intro-paragraph")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# --- Leaderboard Display Section ---
|
| 24 |
gr.Markdown("---")
|
| 25 |
CATEGORY_NAME = "Overall"
|
ui_components.py
CHANGED
|
@@ -442,32 +442,13 @@ def create_leaderboard_display(
|
|
| 442 |
"""Prepare a DataFrame for display with all formatting applied."""
|
| 443 |
df_display = df_view.copy()
|
| 444 |
|
|
|
|
| 445 |
pareto_df = get_pareto_df(df_display)
|
| 446 |
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
|
| 447 |
-
trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
|
| 448 |
if not pareto_df.empty and 'id' in pareto_df.columns:
|
| 449 |
pareto_agent_names = pareto_df['id'].tolist()
|
| 450 |
else:
|
| 451 |
pareto_agent_names = []
|
| 452 |
-
df_display['Pareto'] = df_display.apply(
|
| 453 |
-
lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
|
| 454 |
-
axis=1
|
| 455 |
-
)
|
| 456 |
-
|
| 457 |
-
def get_openness_icon_html(row):
|
| 458 |
-
openness_val = row.get('Openness', '')
|
| 459 |
-
# Use custom lock SVG icons: blue open lock, red closed lock
|
| 460 |
-
if openness_val in [aliases.CANONICAL_OPENNESS_OPEN, 'Open', 'Open Source', 'Open Source + Open Weights']:
|
| 461 |
-
uri = get_svg_as_data_uri("assets/lock-open.svg")
|
| 462 |
-
return f'<img src="{uri}" alt="Open" title="Open source model" style="width:20px; height:20px;">'
|
| 463 |
-
else:
|
| 464 |
-
uri = get_svg_as_data_uri("assets/lock-closed.svg")
|
| 465 |
-
return f'<img src="{uri}" alt="Closed" title="Closed source model" style="width:20px; height:20px;">'
|
| 466 |
-
|
| 467 |
-
df_display['Icon'] = df_display.apply(get_openness_icon_html, axis=1)
|
| 468 |
-
|
| 469 |
-
# Add company logo column based on the Language Model
|
| 470 |
-
df_display['Company'] = df_display['Language Model'].apply(get_company_logo_html)
|
| 471 |
|
| 472 |
for col in df_display.columns:
|
| 473 |
if "Cost" in col:
|
|
@@ -477,32 +458,56 @@ def create_leaderboard_display(
|
|
| 477 |
if "Score" in col:
|
| 478 |
df_display = format_score_column(df_display, col)
|
| 479 |
|
|
|
|
| 480 |
df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
if 'Source' in df_display.columns:
|
| 484 |
df_display['SDK Version'] = df_display.apply(
|
| 485 |
lambda row: f"{row['SDK Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['SDK Version'],
|
| 486 |
axis=1
|
| 487 |
)
|
| 488 |
-
|
| 489 |
-
all_cols = df_display.columns.tolist()
|
| 490 |
-
# Move Company logo column after Icon
|
| 491 |
-
if 'Company' in all_cols:
|
| 492 |
-
all_cols.insert(0, all_cols.pop(all_cols.index('Company')))
|
| 493 |
-
all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
|
| 494 |
-
all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
|
| 495 |
-
df_display = df_display[all_cols]
|
| 496 |
|
| 497 |
columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
|
| 498 |
df_display = df_display.drop(columns=columns_to_drop, errors='ignore')
|
| 499 |
-
|
| 500 |
-
header_rename_map = {
|
| 501 |
-
"Pareto": "",
|
| 502 |
-
"Icon": "",
|
| 503 |
-
"Company": "",
|
| 504 |
-
}
|
| 505 |
-
df_display = df_display.rename(columns=header_rename_map)
|
| 506 |
|
| 507 |
return df_display
|
| 508 |
|
|
@@ -528,13 +533,13 @@ def create_leaderboard_display(
|
|
| 528 |
for col in df_headers:
|
| 529 |
if col == "Logs" or "Cost" in col or "Score" in col:
|
| 530 |
df_datatypes.append("markdown")
|
| 531 |
-
elif col in ["SDK Version","Language Model"
|
| 532 |
df_datatypes.append("html")
|
| 533 |
else:
|
| 534 |
df_datatypes.append("str")
|
| 535 |
# Dynamically set widths for the DataFrame columns
|
| 536 |
-
# Order:
|
| 537 |
-
fixed_start_widths = [
|
| 538 |
num_score_cost_cols = 0
|
| 539 |
remaining_headers = df_headers[len(fixed_start_widths):]
|
| 540 |
for col in remaining_headers:
|
|
@@ -545,15 +550,11 @@ def create_leaderboard_display(
|
|
| 545 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 546 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 547 |
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
)
|
| 554 |
-
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
| 555 |
-
with gr.Column(scale=1):
|
| 556 |
-
gr.HTML(value=plot_legend_html)
|
| 557 |
|
| 558 |
# Put table and key into an accordion
|
| 559 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
|
@@ -671,35 +672,53 @@ def create_benchmark_details_display(
|
|
| 671 |
pareto_df = get_pareto_df(benchmark_table_df)
|
| 672 |
# Get the list of agents on the frontier. We'll use this list later.
|
| 673 |
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
|
| 674 |
-
trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
|
| 675 |
if not pareto_df.empty and 'id' in pareto_df.columns:
|
| 676 |
pareto_agent_names = pareto_df['id'].tolist()
|
| 677 |
else:
|
| 678 |
pareto_agent_names = []
|
| 679 |
-
benchmark_table_df['Pareto'] = benchmark_table_df.apply(
|
| 680 |
-
lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
|
| 681 |
-
axis=1
|
| 682 |
-
)
|
| 683 |
|
| 684 |
-
#
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
openness_val = row.get('Openness', '')
|
| 687 |
-
# Use custom lock SVG icons: blue open lock, red closed lock
|
| 688 |
if openness_val in [aliases.CANONICAL_OPENNESS_OPEN, 'Open', 'Open Source', 'Open Source + Open Weights']:
|
| 689 |
-
|
| 690 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
else:
|
| 692 |
-
|
| 693 |
-
|
|
|
|
| 694 |
|
| 695 |
-
benchmark_table_df['
|
| 696 |
|
| 697 |
-
# Add company logo column based on the Language Model
|
| 698 |
-
benchmark_table_df['Company'] = benchmark_table_df['Language Model'].apply(get_company_logo_html)
|
| 699 |
-
|
| 700 |
-
#Make pretty and format the Language Model column
|
| 701 |
-
benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
|
| 702 |
-
benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
|
| 703 |
# append the repro url to the end of the SDK Version
|
| 704 |
if 'Source' in benchmark_table_df.columns:
|
| 705 |
benchmark_table_df['SDK Version'] = benchmark_table_df.apply(
|
|
@@ -728,9 +747,6 @@ def create_benchmark_details_display(
|
|
| 728 |
benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
|
| 729 |
benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
|
| 730 |
desired_cols_in_order = [
|
| 731 |
-
'Pareto',
|
| 732 |
-
'Icon',
|
| 733 |
-
'Company',
|
| 734 |
'Language Model',
|
| 735 |
'SDK Version',
|
| 736 |
'Attempted Benchmark',
|
|
@@ -748,13 +764,6 @@ def create_benchmark_details_display(
|
|
| 748 |
benchmark_score_col: 'Score',
|
| 749 |
benchmark_cost_col: 'Cost',
|
| 750 |
}, inplace=True)
|
| 751 |
-
# Remove Pareto, Icon, and Company column headers (rename to empty string)
|
| 752 |
-
header_rename_map = {
|
| 753 |
-
"Pareto": "",
|
| 754 |
-
"Icon": "",
|
| 755 |
-
"Company": "",
|
| 756 |
-
}
|
| 757 |
-
benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
|
| 758 |
|
| 759 |
# Now get headers from the renamed dataframe
|
| 760 |
df_headers = benchmark_table_df.columns.tolist()
|
|
@@ -762,7 +771,7 @@ def create_benchmark_details_display(
|
|
| 762 |
for col in df_headers:
|
| 763 |
if "Logs" in col or "Cost" in col or "Score" in col:
|
| 764 |
df_datatypes.append("markdown")
|
| 765 |
-
elif col in ["SDK Version", "Language Model"
|
| 766 |
df_datatypes.append("html")
|
| 767 |
else:
|
| 768 |
df_datatypes.append("str")
|
|
@@ -773,12 +782,8 @@ def create_benchmark_details_display(
|
|
| 773 |
agent_col="SDK Version",
|
| 774 |
name=benchmark_name
|
| 775 |
)
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
gr.Plot(value=benchmark_plot, show_label=False)
|
| 779 |
-
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
| 780 |
-
with gr.Column(scale=1):
|
| 781 |
-
gr.HTML(value=plot_legend_html)
|
| 782 |
|
| 783 |
# Put table and key into an accordion
|
| 784 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
|
|
|
| 442 |
"""Prepare a DataFrame for display with all formatting applied."""
|
| 443 |
df_display = df_view.copy()
|
| 444 |
|
| 445 |
+
# Get Pareto frontier info
|
| 446 |
pareto_df = get_pareto_df(df_display)
|
| 447 |
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
|
|
|
|
| 448 |
if not pareto_df.empty and 'id' in pareto_df.columns:
|
| 449 |
pareto_agent_names = pareto_df['id'].tolist()
|
| 450 |
else:
|
| 451 |
pareto_agent_names = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
| 453 |
for col in df_display.columns:
|
| 454 |
if "Cost" in col:
|
|
|
|
| 458 |
if "Score" in col:
|
| 459 |
df_display = format_score_column(df_display, col)
|
| 460 |
|
| 461 |
+
# Clean the Language Model column first
|
| 462 |
df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
|
| 463 |
+
|
| 464 |
+
# Now combine icons with Language Model column
|
| 465 |
+
def format_language_model_with_icons(row):
|
| 466 |
+
icons_html = ''
|
| 467 |
+
|
| 468 |
+
# Add Pareto trophy if on frontier
|
| 469 |
+
if row['id'] in pareto_agent_names:
|
| 470 |
+
icons_html += f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:18px; height:18px; vertical-align:middle; margin-right:4px;">'
|
| 471 |
+
|
| 472 |
+
# Add openness lock icon
|
| 473 |
+
openness_val = row.get('Openness', '')
|
| 474 |
+
if openness_val in [aliases.CANONICAL_OPENNESS_OPEN, 'Open', 'Open Source', 'Open Source + Open Weights']:
|
| 475 |
+
lock_uri = get_svg_as_data_uri("assets/lock-open.svg")
|
| 476 |
+
icons_html += f'<img src="{lock_uri}" alt="Open" title="Open source model" style="width:16px; height:16px; vertical-align:middle; margin-right:4px;">'
|
| 477 |
+
else:
|
| 478 |
+
lock_uri = get_svg_as_data_uri("assets/lock-closed.svg")
|
| 479 |
+
icons_html += f'<img src="{lock_uri}" alt="Closed" title="Closed source model" style="width:16px; height:16px; vertical-align:middle; margin-right:4px;">'
|
| 480 |
+
|
| 481 |
+
# Add company logo
|
| 482 |
+
company_html = get_company_logo_html(row['Language Model'])
|
| 483 |
+
if company_html:
|
| 484 |
+
icons_html += company_html.replace('style="', 'style="vertical-align:middle; margin-right:6px; ')
|
| 485 |
+
|
| 486 |
+
# Format the model name
|
| 487 |
+
model_name = row['Language Model']
|
| 488 |
+
if isinstance(model_name, list):
|
| 489 |
+
if len(model_name) > 1:
|
| 490 |
+
tooltip_text = "\\n".join(map(str, model_name))
|
| 491 |
+
model_text = f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{model_name[0]} (+ {len(model_name) - 1}) ⓘ</span>'
|
| 492 |
+
elif len(model_name) == 1:
|
| 493 |
+
model_text = model_name[0]
|
| 494 |
+
else:
|
| 495 |
+
model_text = str(model_name)
|
| 496 |
+
else:
|
| 497 |
+
model_text = str(model_name)
|
| 498 |
+
|
| 499 |
+
return f'{icons_html}{model_text}'
|
| 500 |
+
|
| 501 |
+
df_display['Language Model'] = df_display.apply(format_language_model_with_icons, axis=1)
|
| 502 |
|
| 503 |
if 'Source' in df_display.columns:
|
| 504 |
df_display['SDK Version'] = df_display.apply(
|
| 505 |
lambda row: f"{row['SDK Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['SDK Version'],
|
| 506 |
axis=1
|
| 507 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
|
| 510 |
df_display = df_display.drop(columns=columns_to_drop, errors='ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
return df_display
|
| 513 |
|
|
|
|
| 533 |
for col in df_headers:
|
| 534 |
if col == "Logs" or "Cost" in col or "Score" in col:
|
| 535 |
df_datatypes.append("markdown")
|
| 536 |
+
elif col in ["SDK Version", "Language Model"]:
|
| 537 |
df_datatypes.append("html")
|
| 538 |
else:
|
| 539 |
df_datatypes.append("str")
|
| 540 |
# Dynamically set widths for the DataFrame columns
|
| 541 |
+
# Order: Language Model, SDK Version, Average Score, Average Cost, ...
|
| 542 |
+
fixed_start_widths = [280, 100, 100] # Language Model (with icons), SDK Version, Average Score
|
| 543 |
num_score_cost_cols = 0
|
| 544 |
remaining_headers = df_headers[len(fixed_start_widths):]
|
| 545 |
for col in remaining_headers:
|
|
|
|
| 550 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 551 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 552 |
|
| 553 |
+
plot_component = gr.Plot(
|
| 554 |
+
value=scatter_plot,
|
| 555 |
+
show_label=False,
|
| 556 |
+
)
|
| 557 |
+
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
|
| 559 |
# Put table and key into an accordion
|
| 560 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
|
|
|
| 672 |
pareto_df = get_pareto_df(benchmark_table_df)
|
| 673 |
# Get the list of agents on the frontier. We'll use this list later.
|
| 674 |
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
|
|
|
|
| 675 |
if not pareto_df.empty and 'id' in pareto_df.columns:
|
| 676 |
pareto_agent_names = pareto_df['id'].tolist()
|
| 677 |
else:
|
| 678 |
pareto_agent_names = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 679 |
|
| 680 |
+
# Clean the Language Model column first
|
| 681 |
+
benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
|
| 682 |
+
|
| 683 |
+
# Combine icons with Language Model column
|
| 684 |
+
def format_language_model_with_icons(row):
|
| 685 |
+
icons_html = ''
|
| 686 |
+
|
| 687 |
+
# Add Pareto trophy if on frontier
|
| 688 |
+
if row['id'] in pareto_agent_names:
|
| 689 |
+
icons_html += f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:18px; height:18px; vertical-align:middle; margin-right:4px;">'
|
| 690 |
+
|
| 691 |
+
# Add openness lock icon
|
| 692 |
openness_val = row.get('Openness', '')
|
|
|
|
| 693 |
if openness_val in [aliases.CANONICAL_OPENNESS_OPEN, 'Open', 'Open Source', 'Open Source + Open Weights']:
|
| 694 |
+
lock_uri = get_svg_as_data_uri("assets/lock-open.svg")
|
| 695 |
+
icons_html += f'<img src="{lock_uri}" alt="Open" title="Open source model" style="width:16px; height:16px; vertical-align:middle; margin-right:4px;">'
|
| 696 |
+
else:
|
| 697 |
+
lock_uri = get_svg_as_data_uri("assets/lock-closed.svg")
|
| 698 |
+
icons_html += f'<img src="{lock_uri}" alt="Closed" title="Closed source model" style="width:16px; height:16px; vertical-align:middle; margin-right:4px;">'
|
| 699 |
+
|
| 700 |
+
# Add company logo
|
| 701 |
+
company_html = get_company_logo_html(row['Language Model'])
|
| 702 |
+
if company_html:
|
| 703 |
+
icons_html += company_html.replace('style="', 'style="vertical-align:middle; margin-right:6px; ')
|
| 704 |
+
|
| 705 |
+
# Format the model name
|
| 706 |
+
model_name = row['Language Model']
|
| 707 |
+
if isinstance(model_name, list):
|
| 708 |
+
if len(model_name) > 1:
|
| 709 |
+
tooltip_text = "\\n".join(map(str, model_name))
|
| 710 |
+
model_text = f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{model_name[0]} (+ {len(model_name) - 1}) ⓘ</span>'
|
| 711 |
+
elif len(model_name) == 1:
|
| 712 |
+
model_text = model_name[0]
|
| 713 |
+
else:
|
| 714 |
+
model_text = str(model_name)
|
| 715 |
else:
|
| 716 |
+
model_text = str(model_name)
|
| 717 |
+
|
| 718 |
+
return f'{icons_html}{model_text}'
|
| 719 |
|
| 720 |
+
benchmark_table_df['Language Model'] = benchmark_table_df.apply(format_language_model_with_icons, axis=1)
|
| 721 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
# append the repro url to the end of the SDK Version
|
| 723 |
if 'Source' in benchmark_table_df.columns:
|
| 724 |
benchmark_table_df['SDK Version'] = benchmark_table_df.apply(
|
|
|
|
| 747 |
benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
|
| 748 |
benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
|
| 749 |
desired_cols_in_order = [
|
|
|
|
|
|
|
|
|
|
| 750 |
'Language Model',
|
| 751 |
'SDK Version',
|
| 752 |
'Attempted Benchmark',
|
|
|
|
| 764 |
benchmark_score_col: 'Score',
|
| 765 |
benchmark_cost_col: 'Cost',
|
| 766 |
}, inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 767 |
|
| 768 |
# Now get headers from the renamed dataframe
|
| 769 |
df_headers = benchmark_table_df.columns.tolist()
|
|
|
|
| 771 |
for col in df_headers:
|
| 772 |
if "Logs" in col or "Cost" in col or "Score" in col:
|
| 773 |
df_datatypes.append("markdown")
|
| 774 |
+
elif col in ["SDK Version", "Language Model"]:
|
| 775 |
df_datatypes.append("html")
|
| 776 |
else:
|
| 777 |
df_datatypes.append("str")
|
|
|
|
| 782 |
agent_col="SDK Version",
|
| 783 |
name=benchmark_name
|
| 784 |
)
|
| 785 |
+
gr.Plot(value=benchmark_plot, show_label=False)
|
| 786 |
+
gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 787 |
|
| 788 |
# Put table and key into an accordion
|
| 789 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|