Spaces:
Running
Running
openhands openhands committed on
Commit Β·
dfa8bfc
1
Parent(s): 0f8031a
feat: Add Visualization column for Laminar eval links
Browse files
Display eval_visualization_page URLs as clickable 📊 icons in a new
Visualization column next to Logs.
Changes:
- Load eval_visualization_page from score entries in simple_data_loader.py
- Store as '{benchmark} visualization' column similar to download URLs
- Add 📊 column to benchmark detail tables with clickable links
- Only show icon if visualization URL exists, otherwise blank
- Add tooltip documentation for the new column
Co-authored-by: openhands <openhands@all-hands.dev>
- simple_data_loader.py +6 -0
- ui_components.py +17 -4
simple_data_loader.py
CHANGED
|
@@ -172,6 +172,7 @@ class SimpleLeaderboardViewer:
|
|
| 172 |
'average_runtime': score_entry.get('average_runtime'),
|
| 173 |
'tags': [score_entry.get('benchmark')],
|
| 174 |
'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
|
|
|
|
| 175 |
}
|
| 176 |
all_records.append(record)
|
| 177 |
|
|
@@ -262,6 +263,11 @@ class SimpleLeaderboardViewer:
|
|
| 262 |
if full_archive_url:
|
| 263 |
record[f'{tag} download'] = full_archive_url
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
# Track category-level data for aggregation
|
| 266 |
if tag in self.benchmark_to_categories:
|
| 267 |
for category in self.benchmark_to_categories[tag]:
|
|
|
|
| 172 |
'average_runtime': score_entry.get('average_runtime'),
|
| 173 |
'tags': [score_entry.get('benchmark')],
|
| 174 |
'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
|
| 175 |
+
'eval_visualization_page': score_entry.get('eval_visualization_page', ''), # Laminar visualization URL
|
| 176 |
}
|
| 177 |
all_records.append(record)
|
| 178 |
|
|
|
|
| 263 |
if full_archive_url:
|
| 264 |
record[f'{tag} download'] = full_archive_url
|
| 265 |
|
| 266 |
+
# Store the eval_visualization_page URL for this benchmark (for Laminar visualization)
|
| 267 |
+
viz_url = row.get('eval_visualization_page', '') if hasattr(row, 'get') else row['eval_visualization_page'] if 'eval_visualization_page' in row.index else ''
|
| 268 |
+
if viz_url:
|
| 269 |
+
record[f'{tag} visualization'] = viz_url
|
| 270 |
+
|
| 271 |
# Track category-level data for aggregation
|
| 272 |
if tag in self.benchmark_to_categories:
|
| 273 |
for category in self.benchmark_to_categories[tag]:
|
ui_components.py
CHANGED
|
@@ -181,6 +181,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 181 |
<div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
|
| 182 |
<div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
|
| 183 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 184 |
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 185 |
"""
|
| 186 |
else:
|
|
@@ -192,6 +193,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 192 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
| 193 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
|
| 194 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 195 |
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 196 |
"""
|
| 197 |
|
|
@@ -1073,9 +1075,10 @@ def create_benchmark_details_display(
|
|
| 1073 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 1074 |
benchmark_runtime_col = f"{benchmark_name} Runtime"
|
| 1075 |
benchmark_download_col = f"{benchmark_name} Download"
|
|
|
|
| 1076 |
|
| 1077 |
# Define the columns needed for the detailed table
|
| 1078 |
-
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_download_col, 'id', 'Language Model']
|
| 1079 |
|
| 1080 |
# Filter to only columns that actually exist in the full dataframe
|
| 1081 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
@@ -1173,6 +1176,14 @@ def create_benchmark_details_display(
|
|
| 1173 |
return f"[⬇️]({url})"
|
| 1174 |
benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
|
| 1175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1176 |
desired_cols_in_order = [
|
| 1177 |
'Language Model',
|
| 1178 |
'SDK Version',
|
|
@@ -1182,6 +1193,7 @@ def create_benchmark_details_display(
|
|
| 1182 |
benchmark_runtime_col,
|
| 1183 |
'Date',
|
| 1184 |
'Logs',
|
|
|
|
| 1185 |
benchmark_download_col
|
| 1186 |
]
|
| 1187 |
for col in desired_cols_in_order:
|
|
@@ -1202,14 +1214,15 @@ def create_benchmark_details_display(
|
|
| 1202 |
benchmark_score_col: 'Score',
|
| 1203 |
benchmark_cost_col: 'Cost',
|
| 1204 |
benchmark_runtime_col: 'Runtime',
|
| 1205 |
-
|
|
|
|
| 1206 |
}, inplace=True)
|
| 1207 |
|
| 1208 |
# Now get headers from the renamed dataframe
|
| 1209 |
df_headers = benchmark_table_df.columns.tolist()
|
| 1210 |
df_datatypes = []
|
| 1211 |
for col in df_headers:
|
| 1212 |
-
if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
|
| 1213 |
df_datatypes.append("markdown")
|
| 1214 |
elif col in ["SDK Version", "Language Model"]:
|
| 1215 |
df_datatypes.append("html")
|
|
@@ -1247,7 +1260,7 @@ def create_benchmark_details_display(
|
|
| 1247 |
datatype=df_datatypes,
|
| 1248 |
interactive=False,
|
| 1249 |
wrap=True,
|
| 1250 |
-
column_widths=[200, 80, 40, 80, 80, 80, 100, 120, 40], # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Download
|
| 1251 |
show_search="search",
|
| 1252 |
elem_classes=["wrap-header-df"]
|
| 1253 |
)
|
|
|
|
| 181 |
<div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
|
| 182 |
<div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
|
| 183 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 184 |
+
<div class="tooltip-description-item"><b>📊 Visualization:</b> View interactive evaluation visualization (when available).</div>
|
| 185 |
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 186 |
"""
|
| 187 |
else:
|
|
|
|
| 193 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
| 194 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
|
| 195 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 196 |
+
<div class="tooltip-description-item"><b>📊 Visualization:</b> View interactive evaluation visualization (when available).</div>
|
| 197 |
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 198 |
"""
|
| 199 |
|
|
|
|
| 1075 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 1076 |
benchmark_runtime_col = f"{benchmark_name} Runtime"
|
| 1077 |
benchmark_download_col = f"{benchmark_name} Download"
|
| 1078 |
+
benchmark_visualization_col = f"{benchmark_name} Visualization"
|
| 1079 |
|
| 1080 |
# Define the columns needed for the detailed table
|
| 1081 |
+
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_visualization_col, benchmark_download_col, 'id', 'Language Model']
|
| 1082 |
|
| 1083 |
# Filter to only columns that actually exist in the full dataframe
|
| 1084 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
|
|
| 1176 |
return f"[⬇️]({url})"
|
| 1177 |
benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
|
| 1178 |
|
| 1179 |
+
# Format visualization column as clickable icon (bar chart emoji)
|
| 1180 |
+
if benchmark_visualization_col in benchmark_table_df.columns:
|
| 1181 |
+
def format_visualization_link(url):
|
| 1182 |
+
if pd.isna(url) or url == "":
|
| 1183 |
+
return ""
|
| 1184 |
+
return f"[📊]({url})"
|
| 1185 |
+
benchmark_table_df[benchmark_visualization_col] = benchmark_table_df[benchmark_visualization_col].apply(format_visualization_link)
|
| 1186 |
+
|
| 1187 |
desired_cols_in_order = [
|
| 1188 |
'Language Model',
|
| 1189 |
'SDK Version',
|
|
|
|
| 1193 |
benchmark_runtime_col,
|
| 1194 |
'Date',
|
| 1195 |
'Logs',
|
| 1196 |
+
benchmark_visualization_col,
|
| 1197 |
benchmark_download_col
|
| 1198 |
]
|
| 1199 |
for col in desired_cols_in_order:
|
|
|
|
| 1214 |
benchmark_score_col: 'Score',
|
| 1215 |
benchmark_cost_col: 'Cost',
|
| 1216 |
benchmark_runtime_col: 'Runtime',
|
| 1217 |
+
benchmark_visualization_col: '📊', # Visualization column header
|
| 1218 |
+
benchmark_download_col: '⬇️', # Download column header
|
| 1219 |
}, inplace=True)
|
| 1220 |
|
| 1221 |
# Now get headers from the renamed dataframe
|
| 1222 |
df_headers = benchmark_table_df.columns.tolist()
|
| 1223 |
df_datatypes = []
|
| 1224 |
for col in df_headers:
|
| 1225 |
+
if col in ["Logs", "📊", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
|
| 1226 |
df_datatypes.append("markdown")
|
| 1227 |
elif col in ["SDK Version", "Language Model"]:
|
| 1228 |
df_datatypes.append("html")
|
|
|
|
| 1260 |
datatype=df_datatypes,
|
| 1261 |
interactive=False,
|
| 1262 |
wrap=True,
|
| 1263 |
+
column_widths=[200, 80, 40, 80, 80, 80, 100, 120, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Visualization, Download
|
| 1264 |
show_search="search",
|
| 1265 |
elem_classes=["wrap-header-df"]
|
| 1266 |
)
|