Spaces:
Running
Running
openhands openhands committed on
Commit ·
5f628a6
1
Parent(s): dfa8bfc
fix: Add Visualization column to main table (not just benchmark tables)
Browse files
The previous commit only added visualization links to individual benchmark
detail tables. This commit adds the Visualization column to the main front
page table as well.
Changes:
- Add 'visualization' column to record in simple_data_loader.py
- Add 'Visualization' to ending_cols in leaderboard_transformer.py
- Add make_visualization_icons() to format the column in ui_components.py
- Update column widths and datatypes to include Visualization
- Add tooltip description for Visualization column
Co-authored-by: openhands <openhands@all-hands.dev>
- leaderboard_transformer.py +1 -1
- simple_data_loader.py +1 -0
- ui_components.py +17 -2
leaderboard_transformer.py
CHANGED
|
@@ -814,7 +814,7 @@ class DataTransformer:
|
|
| 814 |
# --- 3. Add Columns for Agent Openness ---
|
| 815 |
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 816 |
new_cols = ["Openness"]
|
| 817 |
-
ending_cols = ["Date", "Logs"]
|
| 818 |
|
| 819 |
# For Overall view, use "Average Cost" and "Average Runtime" (per instance across all benchmarks)
|
| 820 |
if tag is None or tag == "Overall":
|
|
|
|
| 814 |
# --- 3. Add Columns for Agent Openness ---
|
| 815 |
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 816 |
new_cols = ["Openness"]
|
| 817 |
+
ending_cols = ["Date", "Logs", "Visualization"]
|
| 818 |
|
| 819 |
# For Overall view, use "Average Cost" and "Average Runtime" (per instance across all benchmarks)
|
| 820 |
if tag is None or tag == "Overall":
|
simple_data_loader.py
CHANGED
|
@@ -239,6 +239,7 @@ class SimpleLeaderboardViewer:
|
|
| 239 |
'id': agent_id,
|
| 240 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 241 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
|
|
|
| 242 |
}
|
| 243 |
|
| 244 |
# Add per-dataset scores and costs
|
|
|
|
| 239 |
'id': agent_id,
|
| 240 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 241 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
| 242 |
+
'visualization': '', # Will become "Visualization" - populated below
|
| 243 |
}
|
| 244 |
|
| 245 |
# Add per-dataset scores and costs
|
ui_components.py
CHANGED
|
@@ -170,6 +170,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 170 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 171 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 172 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 173 |
"""
|
| 174 |
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 175 |
return f"""
|
|
@@ -735,7 +736,7 @@ def create_leaderboard_display(
|
|
| 735 |
df_headers = df_display_all.columns.tolist()
|
| 736 |
df_datatypes = []
|
| 737 |
for col in df_headers:
|
| 738 |
-
if col == "Logs" or "Cost" in col or "Score" in col or "Runtime" in col:
|
| 739 |
df_datatypes.append("markdown")
|
| 740 |
elif col in ["SDK Version", "Language Model"]:
|
| 741 |
df_datatypes.append("html")
|
|
@@ -750,7 +751,7 @@ def create_leaderboard_display(
|
|
| 750 |
if "Score" in col or "Cost" in col or "Runtime" in col:
|
| 751 |
num_score_cost_runtime_cols += 1
|
| 752 |
dynamic_widths = [90] * num_score_cost_runtime_cols
|
| 753 |
-
fixed_end_widths = [90, 100, 120] # Categories Attempted, Date, Logs
|
| 754 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 755 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 756 |
|
|
@@ -1294,6 +1295,20 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
|
| 1294 |
parts.append(f'<a href="{url}" title="Download {benchmark} results" target="_blank">⬇️</a>')
|
| 1295 |
return " ".join(parts) if parts else ""
|
| 1296 |
pretty_df["Logs"] = pretty_df.index.map(make_download_icons)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1297 |
|
| 1298 |
if "Source" in pretty_df.columns:
|
| 1299 |
def format_source_url_to_html(raw_url):
|
|
|
|
| 170 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 171 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 172 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 173 |
+
<div class="tooltip-description-item"><b>Visualization:</b> View interactive evaluation visualization (when available).</div>
|
| 174 |
"""
|
| 175 |
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 176 |
return f"""
|
|
|
|
| 736 |
df_headers = df_display_all.columns.tolist()
|
| 737 |
df_datatypes = []
|
| 738 |
for col in df_headers:
|
| 739 |
+
if col in ["Logs", "Visualization"] or "Cost" in col or "Score" in col or "Runtime" in col:
|
| 740 |
df_datatypes.append("markdown")
|
| 741 |
elif col in ["SDK Version", "Language Model"]:
|
| 742 |
df_datatypes.append("html")
|
|
|
|
| 751 |
if "Score" in col or "Cost" in col or "Runtime" in col:
|
| 752 |
num_score_cost_runtime_cols += 1
|
| 753 |
dynamic_widths = [90] * num_score_cost_runtime_cols
|
| 754 |
+
fixed_end_widths = [90, 100, 120, 120] # Categories Attempted, Date, Logs, Visualization
|
| 755 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 756 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 757 |
|
|
|
|
| 1295 |
parts.append(f'<a href="{url}" title="Download {benchmark} results" target="_blank">⬇️</a>')
|
| 1296 |
return " ".join(parts) if parts else ""
|
| 1297 |
pretty_df["Logs"] = pretty_df.index.map(make_download_icons)
|
| 1298 |
+
|
| 1299 |
+
if "Visualization" in pretty_df.columns:
|
| 1300 |
+
# Format Visualization column with bar chart icons using raw_df data
|
| 1301 |
+
benchmarks = ['swe-bench', 'swe-bench-multimodal', 'commit0', 'swt-bench', 'gaia']
|
| 1302 |
+
def make_visualization_icons(idx):
|
| 1303 |
+
parts = []
|
| 1304 |
+
for benchmark in benchmarks:
|
| 1305 |
+
viz_col = f"{benchmark} visualization"
|
| 1306 |
+
if viz_col in raw_df.columns:
|
| 1307 |
+
url = raw_df.loc[idx, viz_col] if idx in raw_df.index else None
|
| 1308 |
+
if pd.notna(url) and url != "":
|
| 1309 |
+
parts.append(f'<a href="{url}" title="Visualize {benchmark} results" target="_blank">📊</a>')
|
| 1310 |
+
return " ".join(parts) if parts else ""
|
| 1311 |
+
pretty_df["Visualization"] = pretty_df.index.map(make_visualization_icons)
|
| 1312 |
|
| 1313 |
if "Source" in pretty_df.columns:
|
| 1314 |
def format_source_url_to_html(raw_url):
|