jmercat committed on
Commit
5c0c5a9
·
1 Parent(s): 3a9cbd7

Fix bar plot rendering issues for cloud deployment - add data validation, type conversion, and robust error handling

Browse files
Files changed (2) hide show
  1. app.py +101 -43
  2. requirements.txt +6 -6
app.py CHANGED
@@ -60,7 +60,8 @@ st.markdown("""
60
  def load_comprehensive_data():
61
  """Load and clean the comprehensive benchmark data."""
62
  try:
63
- df = pd.read_csv("comprehensive_benchmark_scores.csv", index_col=0)
 
64
 
65
  # Clean the data - handle list-like values stored as strings
66
  for col in df.columns:
@@ -69,28 +70,44 @@ def load_comprehensive_data():
69
  return np.nan
70
  if isinstance(x, str) and x.startswith('['):
71
  try:
72
- return ast.literal_eval(x)[0]
73
- except:
 
 
 
 
74
  return np.nan
75
- return x
 
 
 
76
 
77
  df[col] = df[col].apply(extract_value)
78
  df[col] = pd.to_numeric(df[col], errors='coerce')
79
 
80
  # Filter to only models that have data for at least a few benchmarks
81
  min_benchmarks = 3
82
- df = df.dropna(thresh=min_benchmarks, axis=0)
 
 
 
 
 
 
 
83
 
84
- return df
85
  except FileNotFoundError:
86
  st.error("Could not find comprehensive_benchmark_scores.csv. Please ensure the data file exists.")
87
  return pd.DataFrame()
 
 
 
88
 
89
  @st.cache_data
90
  def load_stderr_data():
91
  """Load and clean standard error data."""
92
  try:
93
- stderr_df = pd.read_csv("benchmark_standard_errors.csv", index_col=0)
94
 
95
  # Clean the data
96
  for col in stderr_df.columns:
@@ -99,17 +116,28 @@ def load_stderr_data():
99
  return np.nan
100
  if isinstance(x, str) and x.startswith('['):
101
  try:
102
- return ast.literal_eval(x)[0]
103
- except:
 
 
 
 
104
  return np.nan
105
- return x
 
 
 
106
 
107
  stderr_df[col] = stderr_df[col].apply(extract_value)
108
  stderr_df[col] = pd.to_numeric(stderr_df[col], errors='coerce')
109
 
110
  return stderr_df
 
111
  except FileNotFoundError:
112
  return None
 
 
 
113
 
114
  def clean_benchmark_name(name):
115
  """Clean benchmark names for consistent display."""
@@ -302,22 +330,30 @@ def filter_target_benchmarks(df):
302
 
303
  def main():
304
  """Main application."""
305
-
306
- # Header
307
- st.markdown('<div class="main-header">🔬 OpenThoughts Evalchemy Benchmark Explorer</div>', unsafe_allow_html=True)
308
- st.markdown("**Explore correlations and relationships between OpenThoughts model performance across different benchmarks**")
309
 
310
  # Load data
311
- with st.spinner("Loading benchmark data..."):
312
- df = load_comprehensive_data()
313
- stderr_df = load_stderr_data()
 
 
 
 
 
 
 
 
 
 
314
 
315
  if df.empty:
316
- st.error("No data available. Please check that the data files exist.")
317
  return
318
 
319
  # Filter to target benchmarks
320
- df_filtered = filter_target_benchmarks(df)
321
  target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
322
 
323
  # Sidebar
@@ -345,13 +381,13 @@ def main():
345
  for category in selected_categories:
346
  for bench_name in benchmark_categories[category]:
347
  actual_name = target_benchmarks.get(bench_name)
348
- if actual_name in df_filtered.columns:
349
  filtered_benchmarks.append(actual_name)
350
 
351
  if filtered_benchmarks:
352
- df_display = df_filtered[filtered_benchmarks].copy()
353
  else:
354
- df_display = df_filtered.copy()
355
 
356
  # Zero filtering
357
  filter_zeros = st.sidebar.checkbox("Filter out zero/near-zero values", value=False)
@@ -430,29 +466,51 @@ def show_overview_dashboard(df, stderr_df):
430
  target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
431
 
432
  for col in df.columns:
433
- coverage = df[col].notna().sum()
434
  category = col_to_category.get(col, 'Unknown')
435
  clean_name = clean_benchmark_name(col)
436
- coverage_data.append({
437
- 'Benchmark': clean_name,
438
- 'Coverage': coverage,
439
- 'Percentage': coverage / len(df) * 100,
440
- 'Category': category
441
- })
442
-
443
- coverage_df = pd.DataFrame(coverage_data).sort_values('Coverage', ascending=True)
444
-
445
- fig = px.bar(coverage_df,
446
- x='Coverage',
447
- y='Benchmark',
448
- color='Category',
449
- color_discrete_map=colors,
450
- title="Model Coverage by Benchmark",
451
- labels={'Coverage': 'Number of Models'},
452
- orientation='h')
453
-
454
- fig.update_layout(height=400)
455
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
 
457
  # Quick correlation insights
458
  st.subheader("Quick Correlation Insights")
 
60
  def load_comprehensive_data():
61
  """Load and clean the comprehensive benchmark data."""
62
  try:
63
+ # Use explicit encoding and error handling
64
+ df = pd.read_csv("comprehensive_benchmark_scores.csv", index_col=0, encoding='utf-8')
65
 
66
  # Clean the data - handle list-like values stored as strings
67
  for col in df.columns:
 
70
  return np.nan
71
  if isinstance(x, str) and x.startswith('['):
72
  try:
73
+ parsed = ast.literal_eval(x)
74
+ if isinstance(parsed, list) and len(parsed) > 0:
75
+ return float(parsed[0]) # Ensure float type
76
+ else:
77
+ return np.nan
78
+ except (ValueError, SyntaxError):
79
  return np.nan
80
+ try:
81
+ return float(x) # Ensure numeric values are float
82
+ except (ValueError, TypeError):
83
+ return np.nan
84
 
85
  df[col] = df[col].apply(extract_value)
86
  df[col] = pd.to_numeric(df[col], errors='coerce')
87
 
88
  # Filter to only models that have data for at least a few benchmarks
89
  min_benchmarks = 3
90
+ df_filtered = df.dropna(thresh=min_benchmarks, axis=0)
91
+
92
+ # Ensure we have some data
93
+ if len(df_filtered) == 0:
94
+ st.error("No models found with sufficient benchmark data.")
95
+ return pd.DataFrame()
96
+
97
+ return df_filtered
98
 
 
99
  except FileNotFoundError:
100
  st.error("Could not find comprehensive_benchmark_scores.csv. Please ensure the data file exists.")
101
  return pd.DataFrame()
102
+ except Exception as e:
103
+ st.error(f"Error loading data: {str(e)}")
104
+ return pd.DataFrame()
105
 
106
  @st.cache_data
107
  def load_stderr_data():
108
  """Load and clean standard error data."""
109
  try:
110
+ stderr_df = pd.read_csv("benchmark_standard_errors.csv", index_col=0, encoding='utf-8')
111
 
112
  # Clean the data
113
  for col in stderr_df.columns:
 
116
  return np.nan
117
  if isinstance(x, str) and x.startswith('['):
118
  try:
119
+ parsed = ast.literal_eval(x)
120
+ if isinstance(parsed, list) and len(parsed) > 0:
121
+ return float(parsed[0]) # Ensure float type
122
+ else:
123
+ return np.nan
124
+ except (ValueError, SyntaxError):
125
  return np.nan
126
+ try:
127
+ return float(x) # Ensure numeric values are float
128
+ except (ValueError, TypeError):
129
+ return np.nan
130
 
131
  stderr_df[col] = stderr_df[col].apply(extract_value)
132
  stderr_df[col] = pd.to_numeric(stderr_df[col], errors='coerce')
133
 
134
  return stderr_df
135
+
136
  except FileNotFoundError:
137
  return None
138
+ except Exception as e:
139
+ st.warning(f"Error loading standard error data: {str(e)}")
140
+ return None
141
 
142
  def clean_benchmark_name(name):
143
  """Clean benchmark names for consistent display."""
 
330
 
331
  def main():
332
  """Main application."""
333
+ st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
334
+ unsafe_allow_html=True)
 
 
335
 
336
  # Load data
337
+ df = load_comprehensive_data()
338
+ stderr_df = load_stderr_data()
339
+
340
+ # Debug information (hidden in an expander)
341
+ with st.expander("🔧 Debug Information", expanded=False):
342
+ st.write(f"**Data Shape:** {df.shape if not df.empty else 'No data'}")
343
+ st.write(f"**Columns:** {len(df.columns) if not df.empty else 0}")
344
+ st.write(f"**Models:** {len(df.index) if not df.empty else 0}")
345
+ if not df.empty:
346
+ st.write(f"**Sample columns:** {list(df.columns[:5])}")
347
+ st.write(f"**Data types:** {df.dtypes.value_counts().to_dict()}")
348
+ st.write(f"**Missing values per column:** {df.isnull().sum().sum()}")
349
+ st.write(f"**StdErr data available:** {'Yes' if stderr_df is not None else 'No'}")
350
 
351
  if df.empty:
352
+ st.error("No data available. Please check that the CSV files are properly uploaded and accessible.")
353
  return
354
 
355
  # Filter to target benchmarks
356
+ df = filter_target_benchmarks(df)
357
  target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
358
 
359
  # Sidebar
 
381
  for category in selected_categories:
382
  for bench_name in benchmark_categories[category]:
383
  actual_name = target_benchmarks.get(bench_name)
384
+ if actual_name in df.columns:
385
  filtered_benchmarks.append(actual_name)
386
 
387
  if filtered_benchmarks:
388
+ df_display = df[filtered_benchmarks].copy()
389
  else:
390
+ df_display = df.copy()
391
 
392
  # Zero filtering
393
  filter_zeros = st.sidebar.checkbox("Filter out zero/near-zero values", value=False)
 
466
  target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
467
 
468
  for col in df.columns:
469
+ coverage = int(df[col].notna().sum()) # Ensure integer type
470
  category = col_to_category.get(col, 'Unknown')
471
  clean_name = clean_benchmark_name(col)
472
+
473
+ # Ensure we have valid data
474
+ if coverage >= 0: # Only include valid coverage counts
475
+ coverage_data.append({
476
+ 'Benchmark': str(clean_name), # Ensure string type
477
+ 'Coverage': coverage,
478
+ 'Percentage': float(coverage / len(df) * 100), # Ensure float type
479
+ 'Category': str(category) # Ensure string type
480
+ })
481
+
482
+ if coverage_data: # Only create plot if we have data
483
+ coverage_df = pd.DataFrame(coverage_data).sort_values('Coverage', ascending=True)
484
+
485
+ # Ensure data types are correct
486
+ coverage_df['Coverage'] = coverage_df['Coverage'].astype(int)
487
+ coverage_df['Percentage'] = coverage_df['Percentage'].astype(float)
488
+ coverage_df['Benchmark'] = coverage_df['Benchmark'].astype(str)
489
+ coverage_df['Category'] = coverage_df['Category'].astype(str)
490
+
491
+ # Create bar plot with explicit parameters
492
+ fig = px.bar(coverage_df,
493
+ x='Coverage',
494
+ y='Benchmark',
495
+ color='Category',
496
+ color_discrete_map=colors,
497
+ title="Model Coverage by Benchmark",
498
+ labels={'Coverage': 'Number of Models'},
499
+ orientation='h',
500
+ text='Coverage') # Add text labels to bars
501
+
502
+ # Update layout for better visibility
503
+ fig.update_traces(texttemplate='%{text}', textposition='outside')
504
+ fig.update_layout(
505
+ height=max(400, len(coverage_df) * 25), # Dynamic height based on data
506
+ showlegend=True,
507
+ xaxis_title="Number of Models",
508
+ yaxis_title="Benchmark"
509
+ )
510
+
511
+ st.plotly_chart(fig, use_container_width=True)
512
+ else:
513
+ st.warning("No coverage data available to display.")
514
 
515
  # Quick correlation insights
516
  st.subheader("Quick Correlation Insights")
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
  streamlit>=1.28.0
2
- pandas>=2.0.0
3
- numpy>=1.24.0
4
- plotly>=5.15.0
5
- scipy>=1.10.0
6
- matplotlib>=3.7.0
7
- seaborn>=0.12.0
 
1
  streamlit>=1.28.0
2
+ pandas>=2.0.0,<2.3.0
3
+ numpy>=1.24.0,<2.0.0
4
+ plotly>=5.15.0,<6.0.0
5
+ scipy>=1.10.0,<2.0.0
6
+ matplotlib>=3.7.0,<4.0.0
7
+ seaborn>=0.12.0,<1.0.0