Spaces:

mlfoundations
/

OpenThoughts_data_explorer

Running

App Files Community

jmercat committed on Jun 2, 2025

Commit

5c0c5a9

1 Parent(s): 3a9cbd7

Fix bar plot rendering issues for cloud deployment - add data validation, type conversion, and robust error handling

Browse files

Files changed (2) hide show

app.py +101 -43
requirements.txt +6 -6

app.py CHANGED Viewed

@@ -60,7 +60,8 @@ st.markdown("""
 def load_comprehensive_data():
     """Load and clean the comprehensive benchmark data."""
     try:
-        df = pd.read_csv("comprehensive_benchmark_scores.csv", index_col=0)
         # Clean the data - handle list-like values stored as strings
         for col in df.columns:
@@ -69,28 +70,44 @@ def load_comprehensive_data():
                     return np.nan
                 if isinstance(x, str) and x.startswith('['):
                     try:
-                        return ast.literal_eval(x)[0]
-                    except:
                         return np.nan
-                return x
             df[col] = df[col].apply(extract_value)
             df[col] = pd.to_numeric(df[col], errors='coerce')
         # Filter to only models that have data for at least a few benchmarks
         min_benchmarks = 3
-        df = df.dropna(thresh=min_benchmarks, axis=0)
-        return df
     except FileNotFoundError:
         st.error("Could not find comprehensive_benchmark_scores.csv. Please ensure the data file exists.")
         return pd.DataFrame()
 @st.cache_data
 def load_stderr_data():
     """Load and clean standard error data."""
     try:
-        stderr_df = pd.read_csv("benchmark_standard_errors.csv", index_col=0)
         # Clean the data
         for col in stderr_df.columns:
@@ -99,17 +116,28 @@ def load_stderr_data():
                     return np.nan
                 if isinstance(x, str) and x.startswith('['):
                     try:
-                        return ast.literal_eval(x)[0]
-                    except:
                         return np.nan
-                return x
             stderr_df[col] = stderr_df[col].apply(extract_value)
             stderr_df[col] = pd.to_numeric(stderr_df[col], errors='coerce')
         return stderr_df
     except FileNotFoundError:
         return None
 def clean_benchmark_name(name):
     """Clean benchmark names for consistent display."""
@@ -302,22 +330,30 @@ def filter_target_benchmarks(df):
 def main():
     """Main application."""
-    # Header
-    st.markdown('<div class="main-header">🔬 OpenThoughts Evalchemy Benchmark Explorer</div>', unsafe_allow_html=True)
-    st.markdown("**Explore correlations and relationships between OpenThoughts model performance across different benchmarks**")
     # Load data
-    with st.spinner("Loading benchmark data..."):
-        df = load_comprehensive_data()
-        stderr_df = load_stderr_data()
     if df.empty:
-        st.error("No data available. Please check that the data files exist.")
         return
     # Filter to target benchmarks
-    df_filtered = filter_target_benchmarks(df)
     target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
     # Sidebar
@@ -345,13 +381,13 @@ def main():
     for category in selected_categories:
         for bench_name in benchmark_categories[category]:
             actual_name = target_benchmarks.get(bench_name)
-            if actual_name in df_filtered.columns:
                 filtered_benchmarks.append(actual_name)
     if filtered_benchmarks:
-        df_display = df_filtered[filtered_benchmarks].copy()
     else:
-        df_display = df_filtered.copy()
     # Zero filtering
     filter_zeros = st.sidebar.checkbox("Filter out zero/near-zero values", value=False)
@@ -430,29 +466,51 @@ def show_overview_dashboard(df, stderr_df):
     target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
     for col in df.columns:
-        coverage = df[col].notna().sum()
         category = col_to_category.get(col, 'Unknown')
         clean_name = clean_benchmark_name(col)
-        coverage_data.append({
-            'Benchmark': clean_name,
-            'Coverage': coverage,
-            'Percentage': coverage / len(df) * 100,
-            'Category': category
-        })
-    coverage_df = pd.DataFrame(coverage_data).sort_values('Coverage', ascending=True)
-    fig = px.bar(coverage_df,
-                 x='Coverage',
-                 y='Benchmark',
-                 color='Category',
-                 color_discrete_map=colors,
-                 title="Model Coverage by Benchmark",
-                 labels={'Coverage': 'Number of Models'},
-                 orientation='h')
-    fig.update_layout(height=400)
-    st.plotly_chart(fig, use_container_width=True)
     # Quick correlation insights
     st.subheader("Quick Correlation Insights")

 def load_comprehensive_data():
     """Load and clean the comprehensive benchmark data."""
     try:
+        # Use explicit encoding and error handling
+        df = pd.read_csv("comprehensive_benchmark_scores.csv", index_col=0, encoding='utf-8')
         # Clean the data - handle list-like values stored as strings
         for col in df.columns:
                     return np.nan
                 if isinstance(x, str) and x.startswith('['):
                     try:
+                        parsed = ast.literal_eval(x)
+                        if isinstance(parsed, list) and len(parsed) > 0:
+                            return float(parsed[0])  # Ensure float type
+                        else:
+                            return np.nan
+                    except (ValueError, SyntaxError):
                         return np.nan
+                try:
+                    return float(x)  # Ensure numeric values are float
+                except (ValueError, TypeError):
+                    return np.nan
             df[col] = df[col].apply(extract_value)
             df[col] = pd.to_numeric(df[col], errors='coerce')
         # Filter to only models that have data for at least a few benchmarks
         min_benchmarks = 3
+        df_filtered = df.dropna(thresh=min_benchmarks, axis=0)
+        # Ensure we have some data
+        if len(df_filtered) == 0:
+            st.error("No models found with sufficient benchmark data.")
+            return pd.DataFrame()
+        return df_filtered
     except FileNotFoundError:
         st.error("Could not find comprehensive_benchmark_scores.csv. Please ensure the data file exists.")
         return pd.DataFrame()
+    except Exception as e:
+        st.error(f"Error loading data: {str(e)}")
+        return pd.DataFrame()
 @st.cache_data
 def load_stderr_data():
     """Load and clean standard error data."""
     try:
+        stderr_df = pd.read_csv("benchmark_standard_errors.csv", index_col=0, encoding='utf-8')
         # Clean the data
         for col in stderr_df.columns:
                     return np.nan
                 if isinstance(x, str) and x.startswith('['):
                     try:
+                        parsed = ast.literal_eval(x)
+                        if isinstance(parsed, list) and len(parsed) > 0:
+                            return float(parsed[0])  # Ensure float type
+                        else:
+                            return np.nan
+                    except (ValueError, SyntaxError):
                         return np.nan
+                try:
+                    return float(x)  # Ensure numeric values are float
+                except (ValueError, TypeError):
+                    return np.nan
             stderr_df[col] = stderr_df[col].apply(extract_value)
             stderr_df[col] = pd.to_numeric(stderr_df[col], errors='coerce')
         return stderr_df
     except FileNotFoundError:
         return None
+    except Exception as e:
+        st.warning(f"Error loading standard error data: {str(e)}")
+        return None
 def clean_benchmark_name(name):
     """Clean benchmark names for consistent display."""
 def main():
     """Main application."""
+    st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
+                unsafe_allow_html=True)
     # Load data
+    df = load_comprehensive_data()
+    stderr_df = load_stderr_data()
+    # Debug information (hidden in an expander)
+    with st.expander("🔧 Debug Information", expanded=False):
+        st.write(f"**Data Shape:** {df.shape if not df.empty else 'No data'}")
+        st.write(f"**Columns:** {len(df.columns) if not df.empty else 0}")
+        st.write(f"**Models:** {len(df.index) if not df.empty else 0}")
+        if not df.empty:
+            st.write(f"**Sample columns:** {list(df.columns[:5])}")
+            st.write(f"**Data types:** {df.dtypes.value_counts().to_dict()}")
+            st.write(f"**Missing values per column:** {df.isnull().sum().sum()}")
+        st.write(f"**StdErr data available:** {'Yes' if stderr_df is not None else 'No'}")
     if df.empty:
+        st.error("No data available. Please check that the CSV files are properly uploaded and accessible.")
         return
     # Filter to target benchmarks
+    df = filter_target_benchmarks(df)
     target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
     # Sidebar
     for category in selected_categories:
         for bench_name in benchmark_categories[category]:
             actual_name = target_benchmarks.get(bench_name)
+            if actual_name in df.columns:
                 filtered_benchmarks.append(actual_name)
     if filtered_benchmarks:
+        df_display = df[filtered_benchmarks].copy()
     else:
+        df_display = df.copy()
     # Zero filtering
     filter_zeros = st.sidebar.checkbox("Filter out zero/near-zero values", value=False)
     target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
     for col in df.columns:
+        coverage = int(df[col].notna().sum())  # Ensure integer type
         category = col_to_category.get(col, 'Unknown')
         clean_name = clean_benchmark_name(col)
+        # Ensure we have valid data
+        if coverage >= 0:  # Only include valid coverage counts
+            coverage_data.append({
+                'Benchmark': str(clean_name),  # Ensure string type
+                'Coverage': coverage,
+                'Percentage': float(coverage / len(df) * 100),  # Ensure float type
+                'Category': str(category)  # Ensure string type
+            })
+    if coverage_data:  # Only create plot if we have data
+        coverage_df = pd.DataFrame(coverage_data).sort_values('Coverage', ascending=True)
+        # Ensure data types are correct
+        coverage_df['Coverage'] = coverage_df['Coverage'].astype(int)
+        coverage_df['Percentage'] = coverage_df['Percentage'].astype(float)
+        coverage_df['Benchmark'] = coverage_df['Benchmark'].astype(str)
+        coverage_df['Category'] = coverage_df['Category'].astype(str)
+        # Create bar plot with explicit parameters
+        fig = px.bar(coverage_df,
+                     x='Coverage',
+                     y='Benchmark',
+                     color='Category',
+                     color_discrete_map=colors,
+                     title="Model Coverage by Benchmark",
+                     labels={'Coverage': 'Number of Models'},
+                     orientation='h',
+                     text='Coverage')  # Add text labels to bars
+        # Update layout for better visibility
+        fig.update_traces(texttemplate='%{text}', textposition='outside')
+        fig.update_layout(
+            height=max(400, len(coverage_df) * 25),  # Dynamic height based on data
+            showlegend=True,
+            xaxis_title="Number of Models",
+            yaxis_title="Benchmark"
+        )
+        st.plotly_chart(fig, use_container_width=True)
+    else:
+        st.warning("No coverage data available to display.")
     # Quick correlation insights
     st.subheader("Quick Correlation Insights")

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
 streamlit>=1.28.0
-pandas>=2.0.0
-numpy>=1.24.0
-plotly>=5.15.0
-scipy>=1.10.0
-matplotlib>=3.7.0
-seaborn>=0.12.0

 streamlit>=1.28.0
+pandas>=2.0.0,<2.3.0
+numpy>=1.24.0,<2.0.0
+plotly>=5.15.0,<6.0.0
+scipy>=1.10.0,<2.0.0
+matplotlib>=3.7.0,<4.0.0
+seaborn>=0.12.0,<1.0.0