Improve the performance radar chart; fix missing benchmark data in the online deployment
Browse files
app.py
CHANGED
|
@@ -702,47 +702,175 @@ def show_model_performance(df):
|
|
| 702 |
# Model comparison
|
| 703 |
st.subheader("Model Comparison")
|
| 704 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
selected_models = st.multiselect(
|
| 706 |
-
"Select models to compare",
|
| 707 |
-
|
| 708 |
-
default=
|
|
|
|
| 709 |
)
|
| 710 |
|
| 711 |
if selected_models:
|
| 712 |
comparison_data = df_display.loc[selected_models].T
|
| 713 |
comparison_data.index = [clean_benchmark_name(idx) for idx in comparison_data.index]
|
| 714 |
|
| 715 |
-
# Radar
|
| 716 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
fig = go.Figure()
|
| 718 |
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
|
| 724 |
-
# Close the radar chart
|
| 725 |
-
|
| 726 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
|
| 728 |
fig.add_trace(go.Scatterpolar(
|
| 729 |
-
r=
|
| 730 |
-
theta=
|
| 731 |
fill='toself',
|
| 732 |
-
name=
|
|
|
|
|
|
|
| 733 |
))
|
| 734 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 735 |
fig.update_layout(
|
| 736 |
polar=dict(
|
| 737 |
radialaxis=dict(
|
| 738 |
visible=True,
|
| 739 |
-
range=[
|
|
|
|
| 740 |
)),
|
| 741 |
showlegend=True,
|
| 742 |
-
title="Model Performance Radar Chart"
|
|
|
|
|
|
|
| 743 |
)
|
| 744 |
|
| 745 |
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 746 |
|
| 747 |
# Detailed comparison table
|
| 748 |
st.subheader("Detailed Comparison")
|
|
|
|
# --- Model Comparison section (body of show_model_performance(df)) ---
# Reconstructed from a diff render. Relies on names defined elsewhere in app.py:
#   df_display           - DataFrame: rows = model names, cols = benchmark names,
#                          float scores in [0, 1], NaN where a model was not run
#                          (assumed from the radar range clamping below — TODO confirm)
#   clean_benchmark_name - str -> str, prettifies a raw benchmark column name
#   st / pd / go         - streamlit, pandas, plotly.graph_objects
# NOTE(review): several literals below contain mojibake (e.g. "π", "π‘", "β’",
# "βΉοΈ", "β") where the original file presumably had emoji; reproduced as rendered —
# confirm against the actual repository file.

# Model comparison
st.subheader("Model Comparison")

# Benchmark selection for radar chart (always visible)
st.subheader("π Benchmark & Model Selection")

col1, col2 = st.columns([2, 1])

with col1:
    available_benchmarks = list(df_display.columns)
    # Default to the first 8 benchmarks, or all of them if there are fewer.
    default_benchmarks = available_benchmarks[:min(8, len(available_benchmarks))]

    selected_benchmarks_for_radar = st.multiselect(
        "Select benchmarks for radar chart",
        available_benchmarks,
        default=default_benchmarks,
        format_func=clean_benchmark_name,
        help="Choose which benchmarks to display in the radar chart",
    )

with col2:
    complete_data_only = st.checkbox(
        "Complete data only",
        value=True,
        help="Show only models that have data for ALL selected benchmarks",
    )

# Filter available models based on benchmark selection and complete-data setting.
if complete_data_only and selected_benchmarks_for_radar:
    # Keep only models with a non-NaN score for every selected benchmark.
    # Vectorized notna().all(axis=1) replaces a per-cell Python loop — one pass
    # instead of O(models * benchmarks) scalar .loc lookups, same result.
    complete_mask = df_display[selected_benchmarks_for_radar].notna().all(axis=1)
    available_models_for_selection = df_display.index[complete_mask].tolist()
    models_info = f"({len(available_models_for_selection)} models with complete data)"
else:
    available_models_for_selection = df_display.index.tolist()
    models_info = f"({len(available_models_for_selection)} models total)"

# Model selection with the filtered list.
if available_models_for_selection:
    # Pre-select the top 3 performers (by mean score across all benchmarks,
    # ignoring NaNs) from the currently available models.
    available_model_avg_scores = (
        df_display.loc[available_models_for_selection]
        .mean(axis=1, skipna=True)
        .sort_values(ascending=False)
    )
    default_selection = available_model_avg_scores.head(3).index.tolist()
else:
    default_selection = []

selected_models = st.multiselect(
    f"Select models to compare {models_info}",
    available_models_for_selection,
    default=default_selection,
    help="Models are filtered based on benchmark selection and complete data setting above",
)

if selected_models:
    # Transposed view (benchmarks as rows) feeds the detailed table below.
    comparison_data = df_display.loc[selected_models].T
    comparison_data.index = [clean_benchmark_name(idx) for idx in comparison_data.index]

    # Performance Radar Chart
    st.subheader("π Performance Radar Chart")

    if not selected_benchmarks_for_radar:
        st.info("Please select at least one benchmark above for the radar chart.")
    elif len(selected_models) == 0:
        st.info("Please select models above to see the radar chart comparison.")
    elif len(selected_models) > 10:
        st.warning(f"Too many models selected ({len(selected_models)}). Please select 10 or fewer models for the radar chart.")
        st.info("π‘ **Tip**: Use the search box above to filter models, then select a smaller subset for comparison.")
    else:
        # Show radar chart for 1-10 models.
        fig = go.Figure()

        # Axis labels: only the selected benchmarks, prettified.
        clean_benchmark_names = [clean_benchmark_name(b) for b in selected_benchmarks_for_radar]

        # Fixed palette; cycled when more than 10 models are plotted.
        colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                       '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

        for i, model in enumerate(selected_models):
            # Scores for the selected benchmarks only; any remaining NaN is
            # plotted as 0.0 (the missing-data expander below explains this).
            model_scores = []
            for benchmark in selected_benchmarks_for_radar:
                score = df_display.loc[model, benchmark]
                model_scores.append(0.0 if pd.isna(score) else float(score))

            # Close the polygon by repeating the first point at the end.
            radar_values = model_scores + [model_scores[0]]
            radar_benchmarks = clean_benchmark_names + [clean_benchmark_names[0]]

            # Legend label: strip any "org/" path prefix from the model name.
            model_display_name = model.split('/')[-1] if '/' in model else model

            model_color = colors_list[i % len(colors_list)]

            fig.add_trace(go.Scatterpolar(
                r=radar_values,
                theta=radar_benchmarks,
                fill='toself',
                name=model_display_name,
                line_color=model_color,
                hovertemplate='<b>%{theta}</b><br>Score: %{r:.3f}<extra></extra>',
            ))

        # Dynamic radial range: fit the plotted values with 10% padding,
        # clamped to [0, 1]. Vectorized dropna replaces a per-cell scan.
        value_series = (
            df_display.loc[selected_models, selected_benchmarks_for_radar]
            .stack()
            .dropna()
        )
        if not value_series.empty:
            min_val = float(value_series.min())
            max_val = float(value_series.max())
            range_padding = (max_val - min_val) * 0.1
            radar_min = max(0, min_val - range_padding)
            radar_max = min(1, max_val + range_padding)
        else:
            radar_min, radar_max = 0, 1

        # Taller chart when the legend gets crowded (more than 3 models).
        chart_height = 600 if len(selected_models) <= 3 else 700

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[radar_min, radar_max],
                    tickformat='.2f'
                )),
            showlegend=True,
            title=f"Model Performance Radar Chart ({len(selected_benchmarks_for_radar)} benchmarks, {len(selected_models)} models)",
            width=700,
            height=chart_height
        )

        st.plotly_chart(fig, use_container_width=True)

        # Explain missing values (only relevant when incomplete data is allowed).
        if not complete_data_only:
            missing_info = []
            for model in selected_models:
                missing_benchmarks = []
                for benchmark in selected_benchmarks_for_radar:
                    if pd.isna(df_display.loc[model, benchmark]):
                        missing_benchmarks.append(clean_benchmark_name(benchmark))
                if missing_benchmarks:
                    missing_info.append(f"β’ {model.split('/')[-1]}: {', '.join(missing_benchmarks)}")

            if missing_info:
                with st.expander("βΉοΈ Missing Data Information"):
                    st.write("Missing values are shown as 0 in the radar chart:")
                    for info in missing_info:
                        st.write(info)
        else:
            # With "Complete data only" enabled, the model list was pre-filtered,
            # so every selected model has data for every chosen benchmark.
            st.info("β All selected models have complete data for the chosen benchmarks.")

        # Readability tip for large selections.
        if len(selected_models) > 5:
            st.info(f"π‘ **Viewing {len(selected_models)} models**: For better readability, consider selecting fewer models or use the detailed comparison table below.")

# Detailed comparison table
st.subheader("Detailed Comparison")
|