Spaces:

mlfoundations-cua-dev
/

leaderboard-viewer

Running

App Files Community

Anas Awadalla committed on Jul 24

Commit

c148460

1 Parent(s): 41dce85

fix caching of elements

Browse files

Files changed (1) hide show

src/streamlit_app.py +202 -214

src/streamlit_app.py CHANGED Viewed

@@ -54,7 +54,7 @@ BASELINES = {
 }
 @st.cache_data(ttl=300)  # Cache for 5 minutes
-def fetch_leaderboard_data_cached():
     """Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
     api = HfApi()
     fs = HfFileSystem()
@@ -66,8 +66,17 @@ def fetch_leaderboard_data_cached():
         results = []
         for idx, file_path in enumerate(grounding_files):
             try:
                 # Stream the JSON file content directly from HuggingFace
                 file_url = f"datasets/{REPO_ID}/{file_path}"
@@ -146,6 +155,10 @@ def fetch_leaderboard_data_cached():
                 st.warning(f"Error loading {file_path}: {str(e)}")
                 continue
         # Create DataFrame
         df = pd.DataFrame(results)
@@ -194,10 +207,6 @@ def fetch_leaderboard_data_cached():
         st.error(f"Error fetching leaderboard data: {str(e)}")
         return pd.DataFrame()
-def fetch_leaderboard_data():
-    """Wrapper function to fetch leaderboard data with progress indicators."""
-    return fetch_leaderboard_data_cached()
 def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame:
     """Parse UI type metrics from the results dataframe."""
     metrics_list = []
@@ -377,23 +386,9 @@ def main():
     st.title("🎯 Grounding Benchmark Leaderboard")
     st.markdown("Visualization of model performance on grounding benchmarks")
-    # Initialize placeholders for dynamic content
-    progress_placeholder = st.empty()
-    header_placeholder = st.empty()
-    metrics_placeholder = st.empty()
-    metric_selector_placeholder = st.empty()
-    info_placeholder = st.empty()
-    main_chart_placeholder = st.empty()
-    expandable_placeholder = st.empty()
-    checkpoint_placeholder = st.empty()
     # Fetch data
-    with progress_placeholder.container():
-        with st.spinner("Loading leaderboard data..."):
-            df = fetch_leaderboard_data()
-    # Clear progress placeholder after loading
-    progress_placeholder.empty()
     if df.empty:
         st.warning("No data available in the leaderboard.")
@@ -416,24 +411,29 @@ def main():
     if selected_model != 'All':
         filtered_df = filtered_df[filtered_df['model'] == selected_model]
     # Main content
-    with header_placeholder.container():
-        st.header(f"Results for {selected_dataset}")
     # Overall metrics
-    with metrics_placeholder.container():
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.metric("Models Evaluated", len(filtered_df))
-        with col2:
-            if not filtered_df.empty:
-                best_acc = filtered_df['overall_accuracy'].max()
-                best_model = filtered_df[filtered_df['overall_accuracy'] == best_acc]['model'].iloc[0]
-                st.metric("Best Overall Accuracy", f"{best_acc:.1f}%", help=f"Model: {best_model}")
-        with col3:
-            total_samples = filtered_df['total_samples'].sum()
-            st.metric("Total Samples Evaluated", f"{total_samples:,}")
     # Parse UI type metrics
     ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
@@ -442,177 +442,172 @@ def main():
     selected_metric = 'overall'  # Default metric
     if not ui_metrics_df.empty and 'screenspot' in selected_dataset.lower():
         # Metric selector dropdown
-        with metric_selector_placeholder.container():
-            if selected_dataset == 'screenspot-v2':
-                metric_options = {
-                    'overall': 'Overall Average (Desktop + Web) / 2',
-                    'desktop_avg': 'Desktop Average',
-                    'web_avg': 'Web Average',
-                    'desktop_text': 'Desktop (Text)',
-                    'desktop_icon': 'Desktop (Icon)',
-                    'web_text': 'Web (Text)',
-                    'web_icon': 'Web (Icon)',
-                    'text_avg': 'Text Average',
-                    'icon_avg': 'Icon Average'
-                }
-            elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
-                # For screenspot-pro and showdown-clicks, only show overall average
-                metric_options = {
-                    'overall': 'Overall Average'
-                }
-            else:
-                metric_options = {
-                    'overall': 'Overall Average',
-                    'desktop_avg': 'Desktop Average',
-                    'web_avg': 'Web Average',
-                    'text_avg': 'Text Average',
-                    'icon_avg': 'Icon Average'
-                }
-            selected_metric = st.selectbox(
-                "Select metric to visualize:",
-                options=list(metric_options.keys()),
-                format_func=lambda x: metric_options[x],
-                key="metric_selector"
-            )
         # Add note about asterisks
-        with info_placeholder.container():
-            if any(ui_metrics_df['is_best_not_last']):
-                st.info("* indicates the best checkpoint is not the last checkpoint")
         # Create single chart for selected metric
-        with main_chart_placeholder.container():
-            chart = create_bar_chart(ui_metrics_df, selected_metric, metric_options[selected_metric])
-            if chart:
-                st.altair_chart(chart, use_container_width=True)
-            else:
-                st.warning(f"No data available for {metric_options[selected_metric]}")
         # Show all metrics in an expandable section - available for all datasets
-        with expandable_placeholder.container():
-            with st.expander("View All Metrics"):
-                if selected_dataset == 'screenspot-v2':
-                    # First row: Overall, Desktop, Web averages
-                    col1, col2, col3 = st.columns(3)
-                    with col1:
-                        chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average (Desktop + Web) / 2')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                    with col2:
-                        chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                    with col3:
-                        chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                    # Second row: Individual UI type metrics
-                    col1, col2, col3, col4 = st.columns(4)
-                    with col1:
-                        chart = create_bar_chart(ui_metrics_df, 'desktop_text', 'Desktop (Text)')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                    with col2:
-                        chart = create_bar_chart(ui_metrics_df, 'desktop_icon', 'Desktop (Icon)')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                    with col3:
-                        chart = create_bar_chart(ui_metrics_df, 'web_text', 'Web (Text)')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                    with col4:
-                        chart = create_bar_chart(ui_metrics_df, 'web_icon', 'Web (Icon)')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                    # Third row: Text vs Icon averages
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (Desktop + Web)')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                    with col2:
-                        chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (Desktop + Web)')
-                        if chart:
-                            st.altair_chart(chart, use_container_width=True)
-                else:
-                    # For screenspot-pro and showdown-clicks
-                    st.info("No additional UI type metrics available for this dataset. Only overall accuracy is reported.")
         # Checkpoint progression visualization
-        with checkpoint_placeholder.container():
-            with st.expander("Checkpoint Progression Analysis"):
-                # Select a model with checkpoints
-                models_with_checkpoints = ui_metrics_df[ui_metrics_df['all_checkpoints'].apply(lambda x: len(x) > 1)]
-                if not models_with_checkpoints.empty:
-                    selected_checkpoint_model = st.selectbox(
-                        "Select a model to view checkpoint progression:",
-                        models_with_checkpoints['model'].str.replace('*', '').unique()
-                    )
-                    # Get checkpoint data for selected model
-                    model_row = models_with_checkpoints[models_with_checkpoints['model'].str.replace('*', '') == selected_checkpoint_model].iloc[0]
-                    checkpoint_data = model_row['all_checkpoints']
-                    # Create DataFrame from checkpoint data
-                    checkpoint_df = pd.DataFrame(checkpoint_data)
-                    # Prepare data for visualization
-                    checkpoint_metrics = []
-                    for _, cp in checkpoint_df.iterrows():
-                        ui_results = cp.get('ui_type_results', {})
-                        dataset_type_results = cp.get('dataset_type_results', {})
-                        results_by_file = cp.get('results_by_file', {})
-                        # Check if we have desktop/web breakdown in results_by_file
-                        desktop_file = None
-                        web_file = None
-                        for filename, file_results in results_by_file.items():
-                            if 'desktop' in filename.lower():
-                                desktop_file = file_results
-                            elif 'web' in filename.lower():
-                                web_file = file_results
-                        if desktop_file and web_file:
-                            # We have desktop/web breakdown
-                            desktop_text = desktop_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
-                            desktop_icon = desktop_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
-                            web_text = web_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
-                            web_icon = web_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
-                        else:
-                            # Fallback to simple UI type results
-                            desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
-                            desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
-                            web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
-                            web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
-                            # If still all zeros, try dataset_type_results
-                            if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
-                                for dataset_key in dataset_type_results:
-                                    if 'screenspot' in dataset_key.lower():
-                                        dataset_data = dataset_type_results[dataset_key]
-                                        if 'by_ui_type' in dataset_data:
-                                            ui_data = dataset_data['by_ui_type']
-                                            # For simple text/icon without desktop/web
-                                            text_val = ui_data.get('text', {}).get('correct', 0) / max(ui_data.get('text', {}).get('total', 1), 1) * 100
-                                            icon_val = ui_data.get('icon', {}).get('correct', 0) / max(ui_data.get('icon', {}).get('total', 1), 1) * 100
-                                            # Assign same values to desktop and web as we don't have the breakdown
-                                            desktop_text = web_text = text_val
-                                            desktop_icon = web_icon = icon_val
-                                            break
                     desktop_avg = (desktop_text + desktop_icon) / 2
                     web_avg = (web_text + web_icon) / 2
@@ -765,26 +760,19 @@ def main():
     else:
         # For non-ScreenSpot datasets, show a simple bar chart
-        # Clear unused placeholders
-        metric_selector_placeholder.empty()
-        info_placeholder.empty()
-        expandable_placeholder.empty()
-        checkpoint_placeholder.empty()
-        with main_chart_placeholder.container():
-            chart_data = filtered_df[['model', 'overall_accuracy']].copy()
-            chart_data.columns = ['Model', 'Accuracy']
-            chart = alt.Chart(chart_data).mark_bar().encode(
-                x=alt.X('Model:N', sort='-y', axis=alt.Axis(labelAngle=-45)),
-                y=alt.Y('Accuracy:Q', scale=alt.Scale(domain=[0, 100])),
-                tooltip=['Model', 'Accuracy']
-            ).properties(
-                width=800,
-                height=400
-            )
-            st.altair_chart(chart, use_container_width=True)
 if __name__ == "__main__":
     main()

 }
 @st.cache_data(ttl=300)  # Cache for 5 minutes
+def fetch_leaderboard_data():
     """Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
     api = HfApi()
     fs = HfFileSystem()
         results = []
+        # Create progress bar for loading
+        progress_bar = st.progress(0)
+        status_text = st.empty()
         for idx, file_path in enumerate(grounding_files):
             try:
+                # Update progress
+                progress = (idx + 1) / len(grounding_files)
+                progress_bar.progress(progress)
+                status_text.text(f"Loading {idx + 1}/{len(grounding_files)} files...")
                 # Stream the JSON file content directly from HuggingFace
                 file_url = f"datasets/{REPO_ID}/{file_path}"
                 st.warning(f"Error loading {file_path}: {str(e)}")
                 continue
+        # Clear progress indicators
+        progress_bar.empty()
+        status_text.empty()
         # Create DataFrame
         df = pd.DataFrame(results)
         st.error(f"Error fetching leaderboard data: {str(e)}")
         return pd.DataFrame()
 def parse_ui_type_metrics(df: pd.DataFrame, dataset_filter: str) -> pd.DataFrame:
     """Parse UI type metrics from the results dataframe."""
     metrics_list = []
     st.title("🎯 Grounding Benchmark Leaderboard")
     st.markdown("Visualization of model performance on grounding benchmarks")
     # Fetch data
+    with st.spinner("Loading leaderboard data..."):
+        df = fetch_leaderboard_data()
     if df.empty:
         st.warning("No data available in the leaderboard.")
     if selected_model != 'All':
         filtered_df = filtered_df[filtered_df['model'] == selected_model]
+    # Create placeholders for components that update when dataset or metric changes
+    header_placeholder = st.empty()
+    metrics_placeholder = st.empty()
+    chart_placeholder = st.empty()
+    view_metrics_expander_placeholder = st.empty()
+    progression_expander_placeholder = st.empty()
     # Main content
+    header_placeholder.header(f"Results for {selected_dataset}")
     # Overall metrics
+    col1, col2, col3 = metrics_placeholder.columns(3)
+    with col1:
+        st.metric("Models Evaluated", len(filtered_df))
+    with col2:
+        if not filtered_df.empty:
+            best_acc = filtered_df['overall_accuracy'].max()
+            best_model = filtered_df[filtered_df['overall_accuracy'] == best_acc]['model'].iloc[0]
+            st.metric("Best Overall Accuracy", f"{best_acc:.1f}%", help=f"Model: {best_model}")
+    with col3:
+        total_samples = filtered_df['total_samples'].sum()
+        st.metric("Total Samples Evaluated", f"{total_samples:,}")
     # Parse UI type metrics
     ui_metrics_df = parse_ui_type_metrics(filtered_df, selected_dataset)
     selected_metric = 'overall'  # Default metric
     if not ui_metrics_df.empty and 'screenspot' in selected_dataset.lower():
         # Metric selector dropdown
+        if selected_dataset == 'screenspot-v2':
+            metric_options = {
+                'overall': 'Overall Average (Desktop + Web) / 2',
+                'desktop_avg': 'Desktop Average',
+                'web_avg': 'Web Average',
+                'desktop_text': 'Desktop (Text)',
+                'desktop_icon': 'Desktop (Icon)',
+                'web_text': 'Web (Text)',
+                'web_icon': 'Web (Icon)',
+                'text_avg': 'Text Average',
+                'icon_avg': 'Icon Average'
+            }
+        elif selected_dataset in ['screenspot-pro', 'showdown-clicks']:
+            # For screenspot-pro and showdown-clicks, only show overall average
+            metric_options = {
+                'overall': 'Overall Average'
+            }
+        else:
+            metric_options = {
+                'overall': 'Overall Average',
+                'desktop_avg': 'Desktop Average',
+                'web_avg': 'Web Average',
+                'text_avg': 'Text Average',
+                'icon_avg': 'Icon Average'
+            }
+        selected_metric = st.selectbox(
+            "Select metric to visualize:",
+            options=list(metric_options.keys()),
+            format_func=lambda x: metric_options[x],
+            key="metric_selector"
+        )
         # Add note about asterisks
+        if any(ui_metrics_df['is_best_not_last']):
+            st.info("* indicates the best checkpoint is not the last checkpoint")
         # Create single chart for selected metric
+        chart = create_bar_chart(ui_metrics_df, selected_metric, metric_options[selected_metric])
+        if chart:
+            chart_placeholder.altair_chart(chart, use_container_width=True)
+        else:
+            st.warning(f"No data available for {metric_options[selected_metric]}")
         # Show all metrics in an expandable section - available for all datasets
+        with view_metrics_expander_placeholder.expander("View All Metrics"):
+            if selected_dataset == 'screenspot-v2':
+                # First row: Overall, Desktop, Web averages
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    chart = create_bar_chart(ui_metrics_df, 'overall', 'Overall Average (Desktop + Web) / 2')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col2:
+                    chart = create_bar_chart(ui_metrics_df, 'desktop_avg', 'Desktop Average')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col3:
+                    chart = create_bar_chart(ui_metrics_df, 'web_avg', 'Web Average')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                # Second row: Individual UI type metrics
+                col1, col2, col3, col4 = st.columns(4)
+                with col1:
+                    chart = create_bar_chart(ui_metrics_df, 'desktop_text', 'Desktop (Text)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col2:
+                    chart = create_bar_chart(ui_metrics_df, 'desktop_icon', 'Desktop (Icon)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col3:
+                    chart = create_bar_chart(ui_metrics_df, 'web_text', 'Web (Text)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col4:
+                    chart = create_bar_chart(ui_metrics_df, 'web_icon', 'Web (Icon)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                # Third row: Text vs Icon averages
+                col1, col2 = st.columns(2)
+                with col1:
+                    chart = create_bar_chart(ui_metrics_df, 'text_avg', 'Text Average (Desktop + Web)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+                with col2:
+                    chart = create_bar_chart(ui_metrics_df, 'icon_avg', 'Icon Average (Desktop + Web)')
+                    if chart:
+                        st.altair_chart(chart, use_container_width=True)
+            else:
+                # For screenspot-pro and showdown-clicks
+                st.info("No additional UI type metrics available for this dataset. Only overall accuracy is reported.")
         # Checkpoint progression visualization
+        with progression_expander_placeholder.expander("Checkpoint Progression Analysis"):
+            # Select a model with checkpoints
+            models_with_checkpoints = ui_metrics_df[ui_metrics_df['all_checkpoints'].apply(lambda x: len(x) > 1)]
+            if not models_with_checkpoints.empty:
+                selected_checkpoint_model = st.selectbox(
+                    "Select a model to view checkpoint progression:",
+                    models_with_checkpoints['model'].str.replace('*', '').unique()
+                )
+                # Get checkpoint data for selected model
+                model_row = models_with_checkpoints[models_with_checkpoints['model'].str.replace('*', '') == selected_checkpoint_model].iloc[0]
+                checkpoint_data = model_row['all_checkpoints']
+                # Create DataFrame from checkpoint data
+                checkpoint_df = pd.DataFrame(checkpoint_data)
+                # Prepare data for visualization
+                checkpoint_metrics = []
+                for _, cp in checkpoint_df.iterrows():
+                    ui_results = cp.get('ui_type_results', {})
+                    dataset_type_results = cp.get('dataset_type_results', {})
+                    results_by_file = cp.get('results_by_file', {})
+                    # Check if we have desktop/web breakdown in results_by_file
+                    desktop_file = None
+                    web_file = None
+                    for filename, file_results in results_by_file.items():
+                        if 'desktop' in filename.lower():
+                            desktop_file = file_results
+                        elif 'web' in filename.lower():
+                            web_file = file_results
+                    if desktop_file and web_file:
+                        # We have desktop/web breakdown
+                        desktop_text = desktop_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
+                        desktop_icon = desktop_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(desktop_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
+                        web_text = web_file.get('by_ui_type', {}).get('text', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('text', {}).get('total', 1), 1) * 100
+                        web_icon = web_file.get('by_ui_type', {}).get('icon', {}).get('correct', 0) / max(web_file.get('by_ui_type', {}).get('icon', {}).get('total', 1), 1) * 100
+                    else:
+                        # Fallback to simple UI type results
+                        desktop_text = ui_results.get('desktop_text', {}).get('correct', 0) / max(ui_results.get('desktop_text', {}).get('total', 1), 1) * 100
+                        desktop_icon = ui_results.get('desktop_icon', {}).get('correct', 0) / max(ui_results.get('desktop_icon', {}).get('total', 1), 1) * 100
+                        web_text = ui_results.get('web_text', {}).get('correct', 0) / max(ui_results.get('web_text', {}).get('total', 1), 1) * 100
+                        web_icon = ui_results.get('web_icon', {}).get('correct', 0) / max(ui_results.get('web_icon', {}).get('total', 1), 1) * 100
+                        # If still all zeros, try dataset_type_results
+                        if desktop_text == 0 and desktop_icon == 0 and web_text == 0 and web_icon == 0:
+                            for dataset_key in dataset_type_results:
+                                if 'screenspot' in dataset_key.lower():
+                                    dataset_data = dataset_type_results[dataset_key]
+                                    if 'by_ui_type' in dataset_data:
+                                        ui_data = dataset_data['by_ui_type']
+                                        # For simple text/icon without desktop/web
+                                        text_val = ui_data.get('text', {}).get('correct', 0) / max(ui_data.get('text', {}).get('total', 1), 1) * 100
+                                        icon_val = ui_data.get('icon', {}).get('correct', 0) / max(ui_data.get('icon', {}).get('total', 1), 1) * 100
+                                        # Assign same values to desktop and web as we don't have the breakdown
+                                        desktop_text = web_text = text_val
+                                        desktop_icon = web_icon = icon_val
+                                        break
                     desktop_avg = (desktop_text + desktop_icon) / 2
                     web_avg = (web_text + web_icon) / 2
     else:
         # For non-ScreenSpot datasets, show a simple bar chart
+        chart_data = filtered_df[['model', 'overall_accuracy']].copy()
+        chart_data.columns = ['Model', 'Accuracy']
+        chart = alt.Chart(chart_data).mark_bar().encode(
+            x=alt.X('Model:N', sort='-y', axis=alt.Axis(labelAngle=-45)),
+            y=alt.Y('Accuracy:Q', scale=alt.Scale(domain=[0, 100])),
+            tooltip=['Model', 'Accuracy']
+        ).properties(
+            width=800,
+            height=400
+        )
+        chart_placeholder.altair_chart(chart, use_container_width=True)
 if __name__ == "__main__":
     main()