Anas Awadalla
committed on
Commit
·
fc25316
1
Parent(s):
a860139
add subset avg for pro baselines
Browse files
- src/streamlit_app.py +54 -8
src/streamlit_app.py
CHANGED
|
@@ -41,12 +41,34 @@ BASELINES = {
|
|
| 41 |
"Qwen2.5-VL-7B-Instruct": {"desktop_text": 87.6, "desktop_icon": 65.7, "web_text": 90.2, "web_icon": 79.8, "overall": 81.9},
|
| 42 |
},
|
| 43 |
"screenspot-pro": {
|
| 44 |
-
"Qwen2.5-VL-3B-Instruct": {
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
},
|
| 51 |
"showdown-clicks": {
|
| 52 |
"UI-TARS-2B": {"overall": 59.8},
|
|
@@ -491,7 +513,18 @@ def main():
|
|
| 491 |
|
| 492 |
# Dataset filter
|
| 493 |
datasets = sorted(df['dataset'].unique())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
selected_dataset = st.sidebar.selectbox("Select Dataset", datasets)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
st.session_state['selected_dataset'] = selected_dataset
|
| 496 |
|
| 497 |
# Filter data
|
|
@@ -511,7 +544,7 @@ def main():
|
|
| 511 |
st.session_state['selected_models'] = []
|
| 512 |
|
| 513 |
# Initialize selected models if not in session state
|
| 514 |
-
if 'selected_models' not in st.session_state:
|
| 515 |
st.session_state['selected_models'] = all_models
|
| 516 |
|
| 517 |
# Multi-select widget for models
|
|
@@ -568,8 +601,14 @@ def main():
|
|
| 568 |
'web_text': 'Web (Text)',
|
| 569 |
'web_icon': 'Web (Icon)',
|
| 570 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
else:
|
| 572 |
-
# For
|
| 573 |
metric_options = {
|
| 574 |
'overall': 'Overall Average'
|
| 575 |
}
|
|
@@ -623,7 +662,14 @@ def main():
|
|
| 623 |
baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
|
| 624 |
if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
|
| 625 |
baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 626 |
else:
|
|
|
|
| 627 |
baseline_row['overall'] = baseline_metrics.get('overall', 0)
|
| 628 |
|
| 629 |
baseline_rows.append(baseline_row)
|
|
|
|
| 41 |
"Qwen2.5-VL-7B-Instruct": {"desktop_text": 87.6, "desktop_icon": 65.7, "web_text": 90.2, "web_icon": 79.8, "overall": 81.9},
|
| 42 |
},
|
| 43 |
"screenspot-pro": {
|
| 44 |
+
"Qwen2.5-VL-3B-Instruct": {
|
| 45 |
+
"overall": 16.1,
|
| 46 |
+
"text": 23.6,
|
| 47 |
+
"icon": 3.8
|
| 48 |
+
},
|
| 49 |
+
"Qwen2.5-VL-7B-Instruct": {
|
| 50 |
+
"overall": 26.8,
|
| 51 |
+
"text": 38.9,
|
| 52 |
+
"icon": 7.1
|
| 53 |
+
},
|
| 54 |
+
"Qwen2.5-VL-72B-Instruct": {
|
| 55 |
+
"overall": 53.3,
|
| 56 |
+
},
|
| 57 |
+
"UI-TARS-2B": {
|
| 58 |
+
"overall": 27.7,
|
| 59 |
+
"text": 39.6,
|
| 60 |
+
"icon": 8.4
|
| 61 |
+
},
|
| 62 |
+
"UI-TARS-7B": {
|
| 63 |
+
"overall": 35.7,
|
| 64 |
+
"text": 47.8,
|
| 65 |
+
"icon": 16.2
|
| 66 |
+
},
|
| 67 |
+
"UI-TARS-72B": {
|
| 68 |
+
"overall": 38.1,
|
| 69 |
+
"text": 50.9,
|
| 70 |
+
"icon": 17.6
|
| 71 |
+
}
|
| 72 |
},
|
| 73 |
"showdown-clicks": {
|
| 74 |
"UI-TARS-2B": {"overall": 59.8},
|
|
|
|
| 513 |
|
| 514 |
# Dataset filter
|
| 515 |
datasets = sorted(df['dataset'].unique())
|
| 516 |
+
|
| 517 |
+
# Check if dataset has changed
|
| 518 |
+
if 'previous_dataset' not in st.session_state:
|
| 519 |
+
st.session_state['previous_dataset'] = None
|
| 520 |
+
|
| 521 |
selected_dataset = st.sidebar.selectbox("Select Dataset", datasets)
|
| 522 |
+
|
| 523 |
+
# Reset selected models if dataset changed
|
| 524 |
+
if selected_dataset != st.session_state.get('previous_dataset'):
|
| 525 |
+
st.session_state['selected_models'] = None # This will trigger default selection
|
| 526 |
+
st.session_state['previous_dataset'] = selected_dataset
|
| 527 |
+
|
| 528 |
st.session_state['selected_dataset'] = selected_dataset
|
| 529 |
|
| 530 |
# Filter data
|
|
|
|
| 544 |
st.session_state['selected_models'] = []
|
| 545 |
|
| 546 |
# Initialize selected models if not in session state
|
| 547 |
+
if 'selected_models' not in st.session_state or st.session_state['selected_models'] is None:
|
| 548 |
st.session_state['selected_models'] = all_models
|
| 549 |
|
| 550 |
# Multi-select widget for models
|
|
|
|
| 601 |
'web_text': 'Web (Text)',
|
| 602 |
'web_icon': 'Web (Icon)',
|
| 603 |
}
|
| 604 |
+
elif selected_dataset == 'screenspot-pro':
|
| 605 |
+
metric_options = {
|
| 606 |
+
'overall': 'Overall Average',
|
| 607 |
+
'text': 'Text',
|
| 608 |
+
'icon': 'Icon'
|
| 609 |
+
}
|
| 610 |
else:
|
| 611 |
+
# For showdown-clicks, only show overall average
|
| 612 |
metric_options = {
|
| 613 |
'overall': 'Overall Average'
|
| 614 |
}
|
|
|
|
| 662 |
baseline_row['text_avg'] = (baseline_metrics['desktop_text'] + baseline_metrics['web_text']) / 2
|
| 663 |
if 'desktop_icon' in baseline_metrics and 'web_icon' in baseline_metrics:
|
| 664 |
baseline_row['icon_avg'] = (baseline_metrics['desktop_icon'] + baseline_metrics['web_icon']) / 2
|
| 665 |
+
elif selected_dataset == 'screenspot-pro':
|
| 666 |
+
baseline_row.update({
|
| 667 |
+
'overall': baseline_metrics.get('overall', 0),
|
| 668 |
+
'text': baseline_metrics.get('text', 0),
|
| 669 |
+
'icon': baseline_metrics.get('icon', 0)
|
| 670 |
+
})
|
| 671 |
else:
|
| 672 |
+
# For other datasets (showdown-clicks, etc.)
|
| 673 |
baseline_row['overall'] = baseline_metrics.get('overall', 0)
|
| 674 |
|
| 675 |
baseline_rows.append(baseline_row)
|