Anas Awadalla
committed on
Commit
·
d37faa6
1
Parent(s):
c94fd08
some fixes
Browse files
- README.md +21 -14
- src/streamlit_app.py +29 -79
README.md
CHANGED
|
@@ -56,7 +56,7 @@ The app will open in your browser at `http://localhost:8501`
|
|
| 56 |
- Individual UI type metrics: Desktop (Text), Desktop (Icon), Web (Text), Web (Icon)
|
| 57 |
- Text and Icon averages across environments
|
| 58 |
- Baseline model comparisons shown in orange
|
| 59 |
-
- Models marked with * indicate the best checkpoint is not the
|
| 60 |
|
| 61 |
4. **Explore Details**:
|
| 62 |
- Expand "Model Details" to see training metadata
|
|
@@ -89,19 +89,26 @@ To minimize local storage requirements, the app:
|
|
| 89 |
|
| 90 |
## Baseline Models
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
-
|
| 96 |
-
- UI-TARS-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
- Qwen2.5-VL-
|
| 102 |
-
-
|
| 103 |
-
-
|
| 104 |
-
- UI-TARS-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
## Checkpoint Handling
|
| 107 |
|
|
|
|
| 56 |
- Individual UI type metrics: Desktop (Text), Desktop (Icon), Web (Text), Web (Icon)
|
| 57 |
- Text and Icon averages across environments
|
| 58 |
- Baseline model comparisons shown in orange
|
| 59 |
+
- Models marked with * indicate the best checkpoint is not the last one
|
| 60 |
|
| 61 |
4. **Explore Details**:
|
| 62 |
- Expand "Model Details" to see training metadata
|
|
|
|
| 89 |
|
| 90 |
## Baseline Models
|
| 91 |
|
| 92 |
+
The dashboard includes baseline performance from established models:
|
| 93 |
+
|
| 94 |
+
### ScreenSpot-v2 Baselines
|
| 95 |
+
- **Qwen2-VL-7B**: 38.0% overall
|
| 96 |
+
- **UI-TARS-2B**: 82.8% overall
|
| 97 |
+
- **UI-TARS-7B**: 92.2% overall
|
| 98 |
+
- **UI-TARS-72B**: 88.3% overall
|
| 99 |
+
|
| 100 |
+
### ScreenSpot-Pro Baselines
|
| 101 |
+
- **Qwen2.5-VL-3B-Instruct**: 16.1% overall
|
| 102 |
+
- **Qwen2.5-VL-7B-Instruct**: 26.8% overall
|
| 103 |
+
- **Qwen2.5-VL-72B-Instruct**: 53.3% overall
|
| 104 |
+
- **UI-TARS-2B**: 27.7% overall
|
| 105 |
+
- **UI-TARS-7B**: 35.7% overall
|
| 106 |
+
- **UI-TARS-72B**: 38.1% overall
|
| 107 |
+
|
| 108 |
+
### ShowDown-Clicks Baselines
|
| 109 |
+
- **Qwen2.5-VL-72B-Instruct**: 24.8% overall
|
| 110 |
+
- **UI-TARS-72B-SFT**: 54.4% overall
|
| 111 |
+
- **Molmo-72B-0924**: 54.8% overall
|
| 112 |
|
| 113 |
## Checkpoint Handling
|
| 114 |
|
src/streamlit_app.py
CHANGED
|
@@ -26,53 +26,30 @@ GROUNDING_PATH = "grounding"
|
|
| 26 |
BASELINES = {
|
| 27 |
"screenspot-v2": {
|
| 28 |
"Qwen2-VL-7B": {
|
| 29 |
-
"desktop_text": 52.01,
|
| 30 |
-
"desktop_icon": 44.98,
|
| 31 |
-
"web_text": 33.04,
|
| 32 |
-
"web_icon": 21.84,
|
| 33 |
-
"overall": 37.96
|
| 34 |
},
|
| 35 |
"UI-TARS-2B": {
|
| 36 |
-
"desktop_text": 90.7,
|
| 37 |
-
"desktop_icon": 68.6,
|
| 38 |
-
"web_text": 87.2,
|
| 39 |
-
"web_icon": 84.7,
|
| 40 |
-
"overall": 82.8
|
| 41 |
},
|
| 42 |
"UI-TARS-7B": {
|
| 43 |
-
"desktop_text": 95.4,
|
| 44 |
-
"desktop_icon": 87.8,
|
| 45 |
-
"web_text": 93.8,
|
| 46 |
-
"web_icon": 91.6,
|
| 47 |
-
"overall": 92.2
|
| 48 |
},
|
| 49 |
"UI-TARS-72B": {
|
| 50 |
-
"desktop_text": 91.2,
|
| 51 |
-
"desktop_icon": 87.8,
|
| 52 |
-
"web_text": 87.7,
|
| 53 |
-
"web_icon": 86.3,
|
| 54 |
-
"overall": 88.3
|
| 55 |
}
|
| 56 |
},
|
| 57 |
"screenspot-pro": {
|
| 58 |
-
"Qwen2.5-VL-3B-Instruct": {
|
| 59 |
-
|
| 60 |
-
},
|
| 61 |
-
"
|
| 62 |
-
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
},
|
| 67 |
-
"UI-TARS-
|
| 68 |
-
|
| 69 |
-
},
|
| 70 |
-
"UI-TARS-7B": {
|
| 71 |
-
"overall": 35.7
|
| 72 |
-
},
|
| 73 |
-
"UI-TARS-72B": {
|
| 74 |
-
"overall": 38.1
|
| 75 |
-
}
|
| 76 |
}
|
| 77 |
}
|
| 78 |
|
|
@@ -472,6 +449,11 @@ def main():
|
|
| 472 |
'text_avg': 'Text Average',
|
| 473 |
'icon_avg': 'Icon Average'
|
| 474 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
else:
|
| 476 |
metric_options = {
|
| 477 |
'overall': 'Overall Average',
|
|
@@ -499,7 +481,7 @@ def main():
|
|
| 499 |
else:
|
| 500 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
| 501 |
|
| 502 |
-
# Show all metrics in an expandable section
|
| 503 |
with st.expander("View All Metrics"):
|
| 504 |
if selected_dataset == 'screenspot-v2':
|
| 505 |
# First row: Overall, Desktop, Web averages
|
|
@@ -556,35 +538,8 @@ def main():
|
|
| 556 |
if chart:
|
| 557 |
st.altair_chart(chart, use_container_width=True)
|
| 558 |
else:
|
| 559 |
-
# For
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
with col1:
|
| 563 |
-
# Overall Average
|
| 564 |
-
chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average')
|
| 565 |
-
if chart:
|
| 566 |
-
st.altair_chart(chart, use_container_width=True)
|
| 567 |
-
|
| 568 |
-
# Desktop Average
|
| 569 |
-
chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
|
| 570 |
-
if chart:
|
| 571 |
-
st.altair_chart(chart, use_container_width=True)
|
| 572 |
-
|
| 573 |
-
# Text Average
|
| 574 |
-
chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (UI-Type)')
|
| 575 |
-
if chart:
|
| 576 |
-
st.altair_chart(chart, use_container_width=True)
|
| 577 |
-
|
| 578 |
-
with col2:
|
| 579 |
-
# Web Average
|
| 580 |
-
chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
|
| 581 |
-
if chart:
|
| 582 |
-
st.altair_chart(chart, use_container_width=True)
|
| 583 |
-
|
| 584 |
-
# Icon Average
|
| 585 |
-
chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (UI-Type)')
|
| 586 |
-
if chart:
|
| 587 |
-
st.altair_chart(chart, use_container_width=True)
|
| 588 |
|
| 589 |
# Checkpoint progression visualization
|
| 590 |
with st.expander("Checkpoint Progression Analysis"):
|
|
@@ -798,9 +753,9 @@ def main():
|
|
| 798 |
else:
|
| 799 |
st.info("No models with multiple checkpoints available for progression analysis")
|
| 800 |
|
| 801 |
-
# Detailed breakdown
|
| 802 |
-
|
| 803 |
-
|
| 804 |
# Create a heatmap-style table
|
| 805 |
detailed_metrics = []
|
| 806 |
for _, row in ui_metrics_df.iterrows():
|
|
@@ -815,6 +770,9 @@ def main():
|
|
| 815 |
|
| 816 |
if detailed_metrics:
|
| 817 |
st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
|
|
|
|
|
|
|
|
|
|
| 818 |
|
| 819 |
else:
|
| 820 |
# For non-ScreenSpot datasets, show a simple bar chart
|
|
@@ -833,14 +791,6 @@ def main():
|
|
| 833 |
)
|
| 834 |
|
| 835 |
st.altair_chart(chart, use_container_width=True)
|
| 836 |
-
|
| 837 |
-
# Model details table
|
| 838 |
-
with st.expander("Model Details"):
|
| 839 |
-
display_df = filtered_df[['model', 'overall_accuracy', 'total_samples', 'checkpoint_steps', 'training_loss', 'timestamp']].copy()
|
| 840 |
-
display_df.columns = ['Model', 'Accuracy (%)', 'Samples', 'Checkpoint Steps', 'Training Loss', 'Timestamp']
|
| 841 |
-
display_df['Accuracy (%)'] = display_df['Accuracy (%)'].apply(lambda x: f"{x:.2f}")
|
| 842 |
-
display_df['Training Loss'] = display_df['Training Loss'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
|
| 843 |
-
st.dataframe(display_df, use_container_width=True)
|
| 844 |
|
| 845 |
if __name__ == "__main__":
|
| 846 |
main()
|
|
|
|
| 26 |
BASELINES = {
|
| 27 |
"screenspot-v2": {
|
| 28 |
"Qwen2-VL-7B": {
|
| 29 |
+
"desktop_text": 52.01, "desktop_icon": 44.98, "web_text": 33.04, "web_icon": 21.84, "overall": 37.96
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
},
|
| 31 |
"UI-TARS-2B": {
|
| 32 |
+
"desktop_text": 90.7, "desktop_icon": 68.6, "web_text": 87.2, "web_icon": 84.7, "overall": 82.8
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
"UI-TARS-7B": {
|
| 35 |
+
"desktop_text": 95.4, "desktop_icon": 87.8, "web_text": 93.8, "web_icon": 91.6, "overall": 92.2
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
},
|
| 37 |
"UI-TARS-72B": {
|
| 38 |
+
"desktop_text": 91.2, "desktop_icon": 87.8, "web_text": 87.7, "web_icon": 86.3, "overall": 88.3
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
},
|
| 41 |
"screenspot-pro": {
|
| 42 |
+
"Qwen2.5-VL-3B-Instruct": {"overall": 16.1},
|
| 43 |
+
"Qwen2.5-VL-7B-Instruct": {"overall": 26.8},
|
| 44 |
+
"Qwen2.5-VL-72B-Instruct": {"overall": 53.3},
|
| 45 |
+
"UI-TARS-2B": {"overall": 27.7},
|
| 46 |
+
"UI-TARS-7B": {"overall": 35.7},
|
| 47 |
+
"UI-TARS-72B": {"overall": 38.1}
|
| 48 |
+
},
|
| 49 |
+
"showdown-clicks": {
|
| 50 |
+
"Qwen2.5-VL-72B-Instruct": {"overall": 24.78},
|
| 51 |
+
"UI-TARS-72B-SFT": {"overall": 54.4},
|
| 52 |
+
"Molmo-72B-0924": {"overall": 54.76}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
}
|
| 54 |
}
|
| 55 |
|
|
|
|
| 449 |
'text_avg': 'Text Average',
|
| 450 |
'icon_avg': 'Icon Average'
|
| 451 |
}
|
| 452 |
+
elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
|
| 453 |
+
# For screenspot-pro and showdown-clicks, only show overall average
|
| 454 |
+
metric_options = {
|
| 455 |
+
'overall': 'Overall Average'
|
| 456 |
+
}
|
| 457 |
else:
|
| 458 |
metric_options = {
|
| 459 |
'overall': 'Overall Average',
|
|
|
|
| 481 |
else:
|
| 482 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
| 483 |
|
| 484 |
+
# Show all metrics in an expandable section - available for all datasets
|
| 485 |
with st.expander("View All Metrics"):
|
| 486 |
if selected_dataset == 'screenspot-v2':
|
| 487 |
# First row: Overall, Desktop, Web averages
|
|
|
|
| 538 |
if chart:
|
| 539 |
st.altair_chart(chart, use_container_width=True)
|
| 540 |
else:
|
| 541 |
+
# For screenspot-pro and showdown-clicks
|
| 542 |
+
st.info("No additional UI type metrics available for this dataset. Only overall accuracy is reported.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
# Checkpoint progression visualization
|
| 545 |
with st.expander("Checkpoint Progression Analysis"):
|
|
|
|
| 753 |
else:
|
| 754 |
st.info("No models with multiple checkpoints available for progression analysis")
|
| 755 |
|
| 756 |
+
# Detailed breakdown - show for all datasets
|
| 757 |
+
with st.expander("Detailed UI Type Breakdown"):
|
| 758 |
+
if selected_dataset == 'screenspot-v2':
|
| 759 |
# Create a heatmap-style table
|
| 760 |
detailed_metrics = []
|
| 761 |
for _, row in ui_metrics_df.iterrows():
|
|
|
|
| 770 |
|
| 771 |
if detailed_metrics:
|
| 772 |
st.dataframe(pd.DataFrame(detailed_metrics), use_container_width=True)
|
| 773 |
+
else:
|
| 774 |
+
# For screenspot-pro and showdown-clicks
|
| 775 |
+
st.info("Detailed UI type breakdown is only available for ScreenSpot-v2 dataset.")
|
| 776 |
|
| 777 |
else:
|
| 778 |
# For non-ScreenSpot datasets, show a simple bar chart
|
|
|
|
| 791 |
)
|
| 792 |
|
| 793 |
st.altair_chart(chart, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 794 |
|
| 795 |
if __name__ == "__main__":
|
| 796 |
main()
|