openhands openhands committed on
Commit
dfa8bfc
·
1 Parent(s): 0f8031a

feat: Add Visualization column for Laminar eval links

Browse files

Display eval_visualization_page URLs as clickable 📊 icons in a new
Visualization column next to Logs.

Changes:
- Load eval_visualization_page from score entries in simple_data_loader.py
- Store as '{benchmark} visualization' column similar to download URLs
- Add 📊 column to benchmark detail tables with clickable links
- Only show icon if visualization URL exists, otherwise blank
- Add tooltip documentation for the new column

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (2) hide show
  1. simple_data_loader.py +6 -0
  2. ui_components.py +17 -4
simple_data_loader.py CHANGED
@@ -172,6 +172,7 @@ class SimpleLeaderboardViewer:
172
  'average_runtime': score_entry.get('average_runtime'),
173
  'tags': [score_entry.get('benchmark')],
174
  'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
 
175
  }
176
  all_records.append(record)
177
 
@@ -262,6 +263,11 @@ class SimpleLeaderboardViewer:
262
  if full_archive_url:
263
  record[f'{tag} download'] = full_archive_url
264
 
 
 
 
 
 
265
  # Track category-level data for aggregation
266
  if tag in self.benchmark_to_categories:
267
  for category in self.benchmark_to_categories[tag]:
 
172
  'average_runtime': score_entry.get('average_runtime'),
173
  'tags': [score_entry.get('benchmark')],
174
  'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
175
+ 'eval_visualization_page': score_entry.get('eval_visualization_page', ''), # Laminar visualization URL
176
  }
177
  all_records.append(record)
178
 
 
263
  if full_archive_url:
264
  record[f'{tag} download'] = full_archive_url
265
 
266
+ # Store the eval_visualization_page URL for this benchmark (for Laminar visualization)
267
+ viz_url = row.get('eval_visualization_page', '') if hasattr(row, 'get') else row['eval_visualization_page'] if 'eval_visualization_page' in row.index else ''
268
+ if viz_url:
269
+ record[f'{tag} visualization'] = viz_url
270
+
271
  # Track category-level data for aggregation
272
  if tag in self.benchmark_to_categories:
273
  for category in self.benchmark_to_categories[tag]:
ui_components.py CHANGED
@@ -181,6 +181,7 @@ def build_descriptions_tooltip_content(table) -> str:
181
  <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
182
  <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
183
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
 
184
  <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
185
  """
186
  else:
@@ -192,6 +193,7 @@ def build_descriptions_tooltip_content(table) -> str:
192
  <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
193
  <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
194
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
 
195
  <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
196
  """
197
 
@@ -1073,9 +1075,10 @@ def create_benchmark_details_display(
1073
  benchmark_cost_col = f"{benchmark_name} Cost"
1074
  benchmark_runtime_col = f"{benchmark_name} Runtime"
1075
  benchmark_download_col = f"{benchmark_name} Download"
 
1076
 
1077
  # Define the columns needed for the detailed table
1078
- table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_download_col, 'id', 'Language Model']
1079
 
1080
  # Filter to only columns that actually exist in the full dataframe
1081
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -1173,6 +1176,14 @@ def create_benchmark_details_display(
1173
  return f"[⬇️]({url})"
1174
  benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
1175
 
 
 
 
 
 
 
 
 
1176
  desired_cols_in_order = [
1177
  'Language Model',
1178
  'SDK Version',
@@ -1182,6 +1193,7 @@ def create_benchmark_details_display(
1182
  benchmark_runtime_col,
1183
  'Date',
1184
  'Logs',
 
1185
  benchmark_download_col
1186
  ]
1187
  for col in desired_cols_in_order:
@@ -1202,14 +1214,15 @@ def create_benchmark_details_display(
1202
  benchmark_score_col: 'Score',
1203
  benchmark_cost_col: 'Cost',
1204
  benchmark_runtime_col: 'Runtime',
1205
- benchmark_download_col: '⬇️', # Empty-ish header with icon hint
 
1206
  }, inplace=True)
1207
 
1208
  # Now get headers from the renamed dataframe
1209
  df_headers = benchmark_table_df.columns.tolist()
1210
  df_datatypes = []
1211
  for col in df_headers:
1212
- if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
1213
  df_datatypes.append("markdown")
1214
  elif col in ["SDK Version", "Language Model"]:
1215
  df_datatypes.append("html")
@@ -1247,7 +1260,7 @@ def create_benchmark_details_display(
1247
  datatype=df_datatypes,
1248
  interactive=False,
1249
  wrap=True,
1250
- column_widths=[200, 80, 40, 80, 80, 80, 100, 120, 40], # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Download
1251
  show_search="search",
1252
  elem_classes=["wrap-header-df"]
1253
  )
 
181
  <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
182
  <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
183
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
184
+ <div class="tooltip-description-item"><b>📊 Visualization:</b> View interactive evaluation visualization (when available).</div>
185
  <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
186
  """
187
  else:
 
193
  <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
194
  <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
195
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
196
+ <div class="tooltip-description-item"><b>📊 Visualization:</b> View interactive evaluation visualization (when available).</div>
197
  <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
198
  """
199
 
 
1075
  benchmark_cost_col = f"{benchmark_name} Cost"
1076
  benchmark_runtime_col = f"{benchmark_name} Runtime"
1077
  benchmark_download_col = f"{benchmark_name} Download"
1078
+ benchmark_visualization_col = f"{benchmark_name} Visualization"
1079
 
1080
  # Define the columns needed for the detailed table
1081
+ table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_visualization_col, benchmark_download_col, 'id', 'Language Model']
1082
 
1083
  # Filter to only columns that actually exist in the full dataframe
1084
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
 
1176
  return f"[⬇️]({url})"
1177
  benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
1178
 
1179
+ # Format visualization column as clickable icon (bar chart emoji)
1180
+ if benchmark_visualization_col in benchmark_table_df.columns:
1181
+ def format_visualization_link(url):
1182
+ if pd.isna(url) or url == "":
1183
+ return ""
1184
+ return f"[📊]({url})"
1185
+ benchmark_table_df[benchmark_visualization_col] = benchmark_table_df[benchmark_visualization_col].apply(format_visualization_link)
1186
+
1187
  desired_cols_in_order = [
1188
  'Language Model',
1189
  'SDK Version',
 
1193
  benchmark_runtime_col,
1194
  'Date',
1195
  'Logs',
1196
+ benchmark_visualization_col,
1197
  benchmark_download_col
1198
  ]
1199
  for col in desired_cols_in_order:
 
1214
  benchmark_score_col: 'Score',
1215
  benchmark_cost_col: 'Cost',
1216
  benchmark_runtime_col: 'Runtime',
1217
+ benchmark_visualization_col: '📊', # Visualization column header
1218
+ benchmark_download_col: '⬇️', # Download column header
1219
  }, inplace=True)
1220
 
1221
  # Now get headers from the renamed dataframe
1222
  df_headers = benchmark_table_df.columns.tolist()
1223
  df_datatypes = []
1224
  for col in df_headers:
1225
+ if col in ["Logs", "📊", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
1226
  df_datatypes.append("markdown")
1227
  elif col in ["SDK Version", "Language Model"]:
1228
  df_datatypes.append("html")
 
1260
  datatype=df_datatypes,
1261
  interactive=False,
1262
  wrap=True,
1263
+ column_widths=[200, 80, 40, 80, 80, 80, 100, 120, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Visualization, Download
1264
  show_search="search",
1265
  elem_classes=["wrap-header-df"]
1266
  )