Anas Awadalla
committed on
Commit
Β·
a860139
1
Parent(s):
98d976c
multi select models
Browse files- src/streamlit_app.py +133 -0
src/streamlit_app.py
CHANGED
|
@@ -384,6 +384,89 @@ def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
|
|
| 384 |
|
| 385 |
return chart + text
|
| 386 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
def main():
|
| 388 |
st.title("π― Grounding Benchmark Leaderboard")
|
| 389 |
st.markdown("Visualization of model performance on grounding benchmarks")
|
|
@@ -508,6 +591,56 @@ def main():
|
|
| 508 |
st.altair_chart(chart, use_container_width=True)
|
| 509 |
else:
|
| 510 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
if __name__ == "__main__":
|
| 513 |
main()
|
|
|
|
| 384 |
|
| 385 |
return chart + text
|
| 386 |
|
| 387 |
+
def create_results_table(data: pd.DataFrame, dataset: str):
|
| 388 |
+
"""Create a formatted results table with best scores highlighted."""
|
| 389 |
+
if data.empty:
|
| 390 |
+
return None
|
| 391 |
+
|
| 392 |
+
# Copy data to avoid modifying original
|
| 393 |
+
table_data = data.copy()
|
| 394 |
+
|
| 395 |
+
# Remove columns we don't want to display
|
| 396 |
+
columns_to_drop = ['is_best_not_last', 'all_checkpoints']
|
| 397 |
+
table_data = table_data.drop(columns=[col for col in columns_to_drop if col in table_data.columns])
|
| 398 |
+
|
| 399 |
+
# Sort by overall score in descending order
|
| 400 |
+
if 'overall' in table_data.columns:
|
| 401 |
+
table_data = table_data.sort_values('overall', ascending=False)
|
| 402 |
+
|
| 403 |
+
# Determine which columns to show based on dataset
|
| 404 |
+
if dataset == 'screenspot-v2':
|
| 405 |
+
# Show all breakdown columns
|
| 406 |
+
column_order = ['model', 'desktop_text', 'desktop_icon', 'web_text', 'web_icon',
|
| 407 |
+
'desktop_avg', 'web_avg', 'text_avg', 'icon_avg', 'overall']
|
| 408 |
+
column_names = {
|
| 409 |
+
'model': 'Model',
|
| 410 |
+
'desktop_text': 'Desktop Text',
|
| 411 |
+
'desktop_icon': 'Desktop Icon',
|
| 412 |
+
'web_text': 'Web Text',
|
| 413 |
+
'web_icon': 'Web Icon',
|
| 414 |
+
'desktop_avg': 'Desktop Avg',
|
| 415 |
+
'web_avg': 'Web Avg',
|
| 416 |
+
'text_avg': 'Text Avg',
|
| 417 |
+
'icon_avg': 'Icon Avg',
|
| 418 |
+
'overall': 'Overall'
|
| 419 |
+
}
|
| 420 |
+
elif 'text' in table_data.columns and 'icon' in table_data.columns:
|
| 421 |
+
# Show text/icon breakdown
|
| 422 |
+
column_order = ['model', 'text', 'icon', 'overall']
|
| 423 |
+
column_names = {
|
| 424 |
+
'model': 'Model',
|
| 425 |
+
'text': 'Text',
|
| 426 |
+
'icon': 'Icon',
|
| 427 |
+
'overall': 'Overall'
|
| 428 |
+
}
|
| 429 |
+
else:
|
| 430 |
+
# Show only overall
|
| 431 |
+
column_order = ['model', 'overall']
|
| 432 |
+
column_names = {
|
| 433 |
+
'model': 'Model',
|
| 434 |
+
'overall': 'Overall'
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
# Filter and reorder columns
|
| 438 |
+
available_columns = [col for col in column_order if col in table_data.columns]
|
| 439 |
+
table_data = table_data[available_columns]
|
| 440 |
+
|
| 441 |
+
# Rename columns for display
|
| 442 |
+
table_data = table_data.rename(columns=column_names)
|
| 443 |
+
|
| 444 |
+
# Round numeric columns to 1 decimal place
|
| 445 |
+
numeric_columns = [col for col in table_data.columns if col != 'Model']
|
| 446 |
+
for col in numeric_columns:
|
| 447 |
+
if col in table_data.columns:
|
| 448 |
+
table_data[col] = table_data[col].round(1)
|
| 449 |
+
|
| 450 |
+
# Apply styling to highlight best scores
|
| 451 |
+
def highlight_best(s):
|
| 452 |
+
"""Highlight the best score in each column."""
|
| 453 |
+
if s.name == 'Model':
|
| 454 |
+
return [''] * len(s)
|
| 455 |
+
|
| 456 |
+
# Find the maximum value
|
| 457 |
+
max_val = s.max()
|
| 458 |
+
# Return style for each cell
|
| 459 |
+
return ['font-weight: bold; color: #2E7D32' if v == max_val else '' for v in s]
|
| 460 |
+
|
| 461 |
+
# Style the dataframe
|
| 462 |
+
styled_table = table_data.style.apply(highlight_best)
|
| 463 |
+
|
| 464 |
+
# Format numbers to show 1 decimal place
|
| 465 |
+
format_dict = {col: '{:.1f}' for col in numeric_columns if col in table_data.columns}
|
| 466 |
+
styled_table = styled_table.format(format_dict)
|
| 467 |
+
|
| 468 |
+
return styled_table
|
| 469 |
+
|
| 470 |
def main():
|
| 471 |
st.title("π― Grounding Benchmark Leaderboard")
|
| 472 |
st.markdown("Visualization of model performance on grounding benchmarks")
|
|
|
|
| 591 |
st.altair_chart(chart, use_container_width=True)
|
| 592 |
else:
|
| 593 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
| 594 |
+
|
| 595 |
+
# Display results table
|
| 596 |
+
st.subheader("π Results Table")
|
| 597 |
+
|
| 598 |
+
# Filter ui_metrics_df to only include selected models
|
| 599 |
+
if not ui_metrics_df.empty:
|
| 600 |
+
table_df = ui_metrics_df[ui_metrics_df['model'].isin(selected_models)].copy()
|
| 601 |
+
|
| 602 |
+
# Add baselines to the table if available
|
| 603 |
+
if selected_dataset in BASELINES:
|
| 604 |
+
baseline_rows = []
|
| 605 |
+
for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
|
| 606 |
+
baseline_row = {'model': f"{baseline_name} (baseline)"}
|
| 607 |
+
|
| 608 |
+
# Map baseline metrics to table columns
|
| 609 |
+
if selected_dataset == 'screenspot-v2':
|
| 610 |
+
baseline_row.update({
|
| 611 |
+
'desktop_text': baseline_metrics.get('desktop_text', 0),
|
| 612 |
+
'desktop_icon': baseline_metrics.get('desktop_icon', 0),
|
| 613 |
+
'web_text': baseline_metrics.get('web_text', 0),
|
| 614 |
+
'web_icon': baseline_metrics.get('web_icon', 0),
|
| 615 |
+
'overall': baseline_metrics.get('overall', 0)
|
| 616 |
+
})
|
| 617 |
+
# Calculate averages if not provided
|
| 618 |
+
if 'desktop_text' in baseline_metrics and 'desktop_icon' in baseline_metrics:
|
| 619 |
+
baseline_row['desktop_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['desktop_icon']) / 2
|
| 620 |
+
if 'web_text' in baseline_metrics and 'web_icon' in baseline_metrics:
|
| 621 |
+
baseline_row['web_avg'] = (baseline_metrics['web_text'] + baseline_metrics['web_icon']) / 2
|
| 622 |
+
if 'desktop_text' in baseline_metrics and 'web_text' in baseline_metrics:
|
| 623 |
+
baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
|
| 624 |
+
if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
|
| 625 |
+
baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
|
| 626 |
+
else:
|
| 627 |
+
baseline_row['overall'] = baseline_metrics.get('overall', 0)
|
| 628 |
+
|
| 629 |
+
baseline_rows.append(baseline_row)
|
| 630 |
+
|
| 631 |
+
# Append baselines to table
|
| 632 |
+
if baseline_rows:
|
| 633 |
+
baseline_df = pd.DataFrame(baseline_rows)
|
| 634 |
+
table_df = pd.concat([table_df, baseline_df], ignore_index=True)
|
| 635 |
+
|
| 636 |
+
# Create and display the styled table
|
| 637 |
+
styled_table = create_results_table(table_df, selected_dataset)
|
| 638 |
+
if styled_table is not None:
|
| 639 |
+
st.dataframe(styled_table, use_container_width=True, hide_index=True)
|
| 640 |
+
else:
|
| 641 |
+
st.info("No data available for the selected models.")
|
| 642 |
+
else:
|
| 643 |
+
st.info("No detailed metrics available for this dataset.")
|
| 644 |
|
| 645 |
if __name__ == "__main__":
|
| 646 |
main()
|