Anas Awadalla
committed on
Commit
·
1ddd951
1
Parent(s):
6ebe143
fix baselines for showdown-clicks
Browse files - src/streamlit_app.py +54 -15
src/streamlit_app.py
CHANGED
|
@@ -53,7 +53,7 @@ BASELINES = {
|
|
| 53 |
}
|
| 54 |
}
|
| 55 |
|
| 56 |
-
@st.cache_data(ttl=
|
| 57 |
def fetch_leaderboard_data():
|
| 58 |
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
| 59 |
api = HfApi()
|
|
@@ -366,7 +366,8 @@ def create_bar_chart(data: pd.DataFrame, metric: str, title: str):
|
|
| 366 |
range=['#4ECDC4', '#FFA726'])),
|
| 367 |
tooltip=['Model', 'Score', 'Type']
|
| 368 |
).properties(
|
| 369 |
-
|
|
|
|
| 370 |
height=400
|
| 371 |
)
|
| 372 |
|
|
@@ -480,19 +481,57 @@ def main():
|
|
| 480 |
|
| 481 |
else:
|
| 482 |
# For non-ScreenSpot datasets, show a simple bar chart
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
|
| 497 |
if __name__ == "__main__":
|
| 498 |
main()
|
|
|
|
| 53 |
}
|
| 54 |
}
|
| 55 |
|
| 56 |
+
@st.cache_data(ttl=1200) # Cache for 20 minutes
|
| 57 |
def fetch_leaderboard_data():
|
| 58 |
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
| 59 |
api = HfApi()
|
|
|
|
| 366 |
range=['#4ECDC4', '#FFA726'])),
|
| 367 |
tooltip=['Model', 'Score', 'Type']
|
| 368 |
).properties(
|
| 369 |
+
title=title,
|
| 370 |
+
width=500,
|
| 371 |
height=400
|
| 372 |
)
|
| 373 |
|
|
|
|
| 481 |
|
| 482 |
else:
|
| 483 |
# For non-ScreenSpot datasets, show a simple bar chart
|
| 484 |
+
# Prepare data list for chart with evaluated models and baselines (if any)
|
| 485 |
+
chart_rows = []
|
| 486 |
+
|
| 487 |
+
# Add evaluated models
|
| 488 |
+
for _, row in filtered_df.iterrows():
|
| 489 |
+
chart_rows.append({
|
| 490 |
+
'Model': row['model'],
|
| 491 |
+
'Score': row['overall_accuracy'],
|
| 492 |
+
'Type': 'Evaluated'
|
| 493 |
+
})
|
| 494 |
+
|
| 495 |
+
# Add baselines if defined for this dataset
|
| 496 |
+
if selected_dataset in BASELINES:
|
| 497 |
+
for baseline_name, baseline_metrics in BASELINES[selected_dataset].items():
|
| 498 |
+
if 'overall' in baseline_metrics:
|
| 499 |
+
chart_rows.append({
|
| 500 |
+
'Model': baseline_name,
|
| 501 |
+
'Score': baseline_metrics['overall'],
|
| 502 |
+
'Type': 'Baseline'
|
| 503 |
+
})
|
| 504 |
+
|
| 505 |
+
if chart_rows:
|
| 506 |
+
chart_df = pd.DataFrame(chart_rows)
|
| 507 |
+
|
| 508 |
+
# Create the bar chart similar to create_bar_chart
|
| 509 |
+
chart = alt.Chart(chart_df).mark_bar().encode(
|
| 510 |
+
x=alt.X('Model:N', sort=alt.EncodingSortField(field='Score', order='descending'),
|
| 511 |
+
axis=alt.Axis(labelAngle=-45)),
|
| 512 |
+
y=alt.Y('Score:Q', scale=alt.Scale(domain=[0, 100]),
|
| 513 |
+
axis=alt.Axis(title='Score (%)')),
|
| 514 |
+
color=alt.Color('Type:N',
|
| 515 |
+
scale=alt.Scale(domain=['Evaluated', 'Baseline'],
|
| 516 |
+
range=['#4ECDC4', '#FFA726'])),
|
| 517 |
+
tooltip=['Model', 'Score', 'Type']
|
| 518 |
+
).properties(
|
| 519 |
+
width=800,
|
| 520 |
+
height=400
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
# Add value labels
|
| 524 |
+
text = chart.mark_text(
|
| 525 |
+
align='center',
|
| 526 |
+
baseline='bottom',
|
| 527 |
+
dy=-5
|
| 528 |
+
).encode(
|
| 529 |
+
text=alt.Text('Score:Q', format='.1f')
|
| 530 |
+
)
|
| 531 |
+
|
| 532 |
+
st.altair_chart(chart + text, use_container_width=True)
|
| 533 |
+
else:
|
| 534 |
+
st.warning("No data available for the selected dataset.")
|
| 535 |
|
| 536 |
if __name__ == "__main__":
|
| 537 |
main()
|