Anas Awadalla
committed on
Commit
·
402e797
1
Parent(s):
1ddd951
fix baselines for showdown-clicks
Browse files - src/streamlit_app.py +3 -69
src/streamlit_app.py
CHANGED
|
@@ -53,7 +53,7 @@ BASELINES = {
|
|
| 53 |
}
|
| 54 |
}
|
| 55 |
|
| 56 |
-
@st.cache_data(ttl=
|
| 57 |
def fetch_leaderboard_data():
|
| 58 |
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
| 59 |
api = HfApi()
|
|
@@ -433,33 +433,21 @@ def main():
|
|
| 433 |
|
| 434 |
# Add metric selector for screenspot datasets
|
| 435 |
selected_metric = 'overall' # Default metric
|
| 436 |
-
if not ui_metrics_df.empty
|
| 437 |
# Metric selector dropdown
|
| 438 |
if selected_dataset == 'screenspot-v2':
|
| 439 |
metric_options = {
|
| 440 |
'overall': 'Overall Average (Desktop + Web) / 2',
|
| 441 |
-
'desktop_avg': 'Desktop Average',
|
| 442 |
-
'web_avg': 'Web Average',
|
| 443 |
'desktop_text': 'Desktop (Text)',
|
| 444 |
'desktop_icon': 'Desktop (Icon)',
|
| 445 |
'web_text': 'Web (Text)',
|
| 446 |
'web_icon': 'Web (Icon)',
|
| 447 |
-
'text_avg': 'Text Average',
|
| 448 |
-
'icon_avg': 'Icon Average'
|
| 449 |
}
|
| 450 |
-
|
| 451 |
# For screenspot-pro and showdown-clicks, only show overall average
|
| 452 |
metric_options = {
|
| 453 |
'overall': 'Overall Average'
|
| 454 |
}
|
| 455 |
-
else:
|
| 456 |
-
metric_options = {
|
| 457 |
-
'overall': 'Overall Average',
|
| 458 |
-
'desktop_avg': 'Desktop Average',
|
| 459 |
-
'web_avg': 'Web Average',
|
| 460 |
-
'text_avg': 'Text Average',
|
| 461 |
-
'icon_avg': 'Icon Average'
|
| 462 |
-
}
|
| 463 |
|
| 464 |
selected_metric = st.selectbox(
|
| 465 |
"Select metric to visualize:",
|
|
@@ -478,60 +466,6 @@ def main():
|
|
| 478 |
st.altair_chart(chart, use_container_width=True)
|
| 479 |
else:
|
| 480 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
| 481 |
-
|
| 482 |
-
else:
|
| 483 |
-
# For non-ScreenSpot datasets, show a simple bar chart
|
| 484 |
-
# Prepare data list for chart with evaluated models and baselines (if any)
|
| 485 |
-
chart_rows = []
|
| 486 |
-
|
| 487 |
-
# Add evaluated models
|
| 488 |
-
for _, row in filtered_df.iterrows():
|
| 489 |
-
chart_rows.append({
|
| 490 |
-
'Model': row['model'],
|
| 491 |
-
'Score': row['overall_accuracy'],
|
| 492 |
-
'Type': 'Evaluated'
|
| 493 |
-
})
|
| 494 |
-
|
| 495 |
-
# Add baselines if defined for this dataset
|
| 496 |
-
if selected_dataset in BASELINES:
|
| 497 |
-
for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
|
| 498 |
-
if 'overall' in baseline_metrics:
|
| 499 |
-
chart_rows.append({
|
| 500 |
-
'Model': baseline_name,
|
| 501 |
-
'Score': baseline_metrics['overall'],
|
| 502 |
-
'Type': 'Baseline'
|
| 503 |
-
})
|
| 504 |
-
|
| 505 |
-
if chart_rows:
|
| 506 |
-
chart_df = pd.DataFrame(chart_rows)
|
| 507 |
-
|
| 508 |
-
# Create the bar chart similar to create_bar_chart
|
| 509 |
-
chart = alt.Chart(chart_df).mark_bar().encode(
|
| 510 |
-
x=alt.X('Model:N', sort=alt.EncodingSortField(field='Score', order='descending'),
|
| 511 |
-
axis=alt.Axis(labelAngle=-45)),
|
| 512 |
-
y=alt.Y('Score:Q', scale=alt.Scale(domain=[0, 100]),
|
| 513 |
-
axis=alt.Axis(title='Score (%)')),
|
| 514 |
-
color=alt.Color('Type:N',
|
| 515 |
-
scale=alt.Scale(domain=['Evaluated', 'Baseline'],
|
| 516 |
-
range=['#4ECDC4', '#FFA726'])),
|
| 517 |
-
tooltip=['Model', 'Score', 'Type']
|
| 518 |
-
).properties(
|
| 519 |
-
width=800,
|
| 520 |
-
height=400
|
| 521 |
-
)
|
| 522 |
-
|
| 523 |
-
# Add value labels
|
| 524 |
-
text = chart.mark_text(
|
| 525 |
-
align='center',
|
| 526 |
-
baseline='bottom',
|
| 527 |
-
dy=-5
|
| 528 |
-
).encode(
|
| 529 |
-
text=alt.Text('Score:Q', format='.1f')
|
| 530 |
-
)
|
| 531 |
-
|
| 532 |
-
st.altair_chart(chart + text, use_container_width=True)
|
| 533 |
-
else:
|
| 534 |
-
st.warning("No data available for the selected dataset.")
|
| 535 |
|
| 536 |
if __name__ == "__main__":
|
| 537 |
main()
|
|
|
|
| 53 |
}
|
| 54 |
}
|
| 55 |
|
| 56 |
+
@st.cache_data(ttl=300) # Cache for 5 minutes
|
| 57 |
def fetch_leaderboard_data():
|
| 58 |
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
| 59 |
api = HfApi()
|
|
|
|
| 433 |
|
| 434 |
# Add metric selector for screenspot datasets
|
| 435 |
selected_metric = 'overall' # Default metric
|
| 436 |
+
if not ui_metrics_df.empty:
|
| 437 |
# Metric selector dropdown
|
| 438 |
if selected_dataset == 'screenspot-v2':
|
| 439 |
metric_options = {
|
| 440 |
'overall': 'Overall Average (Desktop + Web) / 2',
|
|
|
|
|
|
|
| 441 |
'desktop_text': 'Desktop (Text)',
|
| 442 |
'desktop_icon': 'Desktop (Icon)',
|
| 443 |
'web_text': 'Web (Text)',
|
| 444 |
'web_icon': 'Web (Icon)',
|
|
|
|
|
|
|
| 445 |
}
|
| 446 |
+
else:
|
| 447 |
# For screenspot-pro and showdown-clicks, only show overall average
|
| 448 |
metric_options = {
|
| 449 |
'overall': 'Overall Average'
|
| 450 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
selected_metric = st.selectbox(
|
| 453 |
"Select metric to visualize:",
|
|
|
|
| 466 |
st.altair_chart(chart, use_container_width=True)
|
| 467 |
else:
|
| 468 |
st.warning(f"No data available for {metric_options[selected_metric]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
|
| 470 |
if __name__ == "__main__":
|
| 471 |
main()
|