Fix bar plot rendering issues for cloud deployment - add data validation, type conversion, and robust error handling
Browse files
- app.py +101 -43
- requirements.txt +6 -6
app.py
CHANGED
|
@@ -60,7 +60,8 @@ st.markdown("""
|
|
| 60 |
def load_comprehensive_data():
|
| 61 |
"""Load and clean the comprehensive benchmark data."""
|
| 62 |
try:
|
| 63 |
-
|
|
|
|
| 64 |
|
| 65 |
# Clean the data - handle list-like values stored as strings
|
| 66 |
for col in df.columns:
|
|
@@ -69,28 +70,44 @@ def load_comprehensive_data():
|
|
| 69 |
return np.nan
|
| 70 |
if isinstance(x, str) and x.startswith('['):
|
| 71 |
try:
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
return np.nan
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
df[col] = df[col].apply(extract_value)
|
| 78 |
df[col] = pd.to_numeric(df[col], errors='coerce')
|
| 79 |
|
| 80 |
# Filter to only models that have data for at least a few benchmarks
|
| 81 |
min_benchmarks = 3
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
-
return df
|
| 85 |
except FileNotFoundError:
|
| 86 |
st.error("Could not find comprehensive_benchmark_scores.csv. Please ensure the data file exists.")
|
| 87 |
return pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
@st.cache_data
|
| 90 |
def load_stderr_data():
|
| 91 |
"""Load and clean standard error data."""
|
| 92 |
try:
|
| 93 |
-
stderr_df = pd.read_csv("benchmark_standard_errors.csv", index_col=0)
|
| 94 |
|
| 95 |
# Clean the data
|
| 96 |
for col in stderr_df.columns:
|
|
@@ -99,17 +116,28 @@ def load_stderr_data():
|
|
| 99 |
return np.nan
|
| 100 |
if isinstance(x, str) and x.startswith('['):
|
| 101 |
try:
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
return np.nan
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
stderr_df[col] = stderr_df[col].apply(extract_value)
|
| 108 |
stderr_df[col] = pd.to_numeric(stderr_df[col], errors='coerce')
|
| 109 |
|
| 110 |
return stderr_df
|
|
|
|
| 111 |
except FileNotFoundError:
|
| 112 |
return None
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
def clean_benchmark_name(name):
|
| 115 |
"""Clean benchmark names for consistent display."""
|
|
@@ -302,22 +330,30 @@ def filter_target_benchmarks(df):
|
|
| 302 |
|
| 303 |
def main():
|
| 304 |
"""Main application."""
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
st.markdown('<div class="main-header">🔬 OpenThoughts Evalchemy Benchmark Explorer</div>', unsafe_allow_html=True)
|
| 308 |
-
st.markdown("**Explore correlations and relationships between OpenThoughts model performance across different benchmarks**")
|
| 309 |
|
| 310 |
# Load data
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
if df.empty:
|
| 316 |
-
st.error("No data available. Please check that the
|
| 317 |
return
|
| 318 |
|
| 319 |
# Filter to target benchmarks
|
| 320 |
-
|
| 321 |
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
|
| 322 |
|
| 323 |
# Sidebar
|
|
@@ -345,13 +381,13 @@ def main():
|
|
| 345 |
for category in selected_categories:
|
| 346 |
for bench_name in benchmark_categories[category]:
|
| 347 |
actual_name = target_benchmarks.get(bench_name)
|
| 348 |
-
if actual_name in
|
| 349 |
filtered_benchmarks.append(actual_name)
|
| 350 |
|
| 351 |
if filtered_benchmarks:
|
| 352 |
-
df_display =
|
| 353 |
else:
|
| 354 |
-
df_display =
|
| 355 |
|
| 356 |
# Zero filtering
|
| 357 |
filter_zeros = st.sidebar.checkbox("Filter out zero/near-zero values", value=False)
|
|
@@ -430,29 +466,51 @@ def show_overview_dashboard(df, stderr_df):
|
|
| 430 |
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
|
| 431 |
|
| 432 |
for col in df.columns:
|
| 433 |
-
coverage = df[col].notna().sum()
|
| 434 |
category = col_to_category.get(col, 'Unknown')
|
| 435 |
clean_name = clean_benchmark_name(col)
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
|
| 457 |
# Quick correlation insights
|
| 458 |
st.subheader("Quick Correlation Insights")
|
|
|
|
| 60 |
def load_comprehensive_data():
|
| 61 |
"""Load and clean the comprehensive benchmark data."""
|
| 62 |
try:
|
| 63 |
+
# Use explicit encoding and error handling
|
| 64 |
+
df = pd.read_csv("comprehensive_benchmark_scores.csv", index_col=0, encoding='utf-8')
|
| 65 |
|
| 66 |
# Clean the data - handle list-like values stored as strings
|
| 67 |
for col in df.columns:
|
|
|
|
| 70 |
return np.nan
|
| 71 |
if isinstance(x, str) and x.startswith('['):
|
| 72 |
try:
|
| 73 |
+
parsed = ast.literal_eval(x)
|
| 74 |
+
if isinstance(parsed, list) and len(parsed) > 0:
|
| 75 |
+
return float(parsed[0]) # Ensure float type
|
| 76 |
+
else:
|
| 77 |
+
return np.nan
|
| 78 |
+
except (ValueError, SyntaxError):
|
| 79 |
return np.nan
|
| 80 |
+
try:
|
| 81 |
+
return float(x) # Ensure numeric values are float
|
| 82 |
+
except (ValueError, TypeError):
|
| 83 |
+
return np.nan
|
| 84 |
|
| 85 |
df[col] = df[col].apply(extract_value)
|
| 86 |
df[col] = pd.to_numeric(df[col], errors='coerce')
|
| 87 |
|
| 88 |
# Filter to only models that have data for at least a few benchmarks
|
| 89 |
min_benchmarks = 3
|
| 90 |
+
df_filtered = df.dropna(thresh=min_benchmarks, axis=0)
|
| 91 |
+
|
| 92 |
+
# Ensure we have some data
|
| 93 |
+
if len(df_filtered) == 0:
|
| 94 |
+
st.error("No models found with sufficient benchmark data.")
|
| 95 |
+
return pd.DataFrame()
|
| 96 |
+
|
| 97 |
+
return df_filtered
|
| 98 |
|
|
|
|
| 99 |
except FileNotFoundError:
|
| 100 |
st.error("Could not find comprehensive_benchmark_scores.csv. Please ensure the data file exists.")
|
| 101 |
return pd.DataFrame()
|
| 102 |
+
except Exception as e:
|
| 103 |
+
st.error(f"Error loading data: {str(e)}")
|
| 104 |
+
return pd.DataFrame()
|
| 105 |
|
| 106 |
@st.cache_data
|
| 107 |
def load_stderr_data():
|
| 108 |
"""Load and clean standard error data."""
|
| 109 |
try:
|
| 110 |
+
stderr_df = pd.read_csv("benchmark_standard_errors.csv", index_col=0, encoding='utf-8')
|
| 111 |
|
| 112 |
# Clean the data
|
| 113 |
for col in stderr_df.columns:
|
|
|
|
| 116 |
return np.nan
|
| 117 |
if isinstance(x, str) and x.startswith('['):
|
| 118 |
try:
|
| 119 |
+
parsed = ast.literal_eval(x)
|
| 120 |
+
if isinstance(parsed, list) and len(parsed) > 0:
|
| 121 |
+
return float(parsed[0]) # Ensure float type
|
| 122 |
+
else:
|
| 123 |
+
return np.nan
|
| 124 |
+
except (ValueError, SyntaxError):
|
| 125 |
return np.nan
|
| 126 |
+
try:
|
| 127 |
+
return float(x) # Ensure numeric values are float
|
| 128 |
+
except (ValueError, TypeError):
|
| 129 |
+
return np.nan
|
| 130 |
|
| 131 |
stderr_df[col] = stderr_df[col].apply(extract_value)
|
| 132 |
stderr_df[col] = pd.to_numeric(stderr_df[col], errors='coerce')
|
| 133 |
|
| 134 |
return stderr_df
|
| 135 |
+
|
| 136 |
except FileNotFoundError:
|
| 137 |
return None
|
| 138 |
+
except Exception as e:
|
| 139 |
+
st.warning(f"Error loading standard error data: {str(e)}")
|
| 140 |
+
return None
|
| 141 |
|
| 142 |
def clean_benchmark_name(name):
|
| 143 |
"""Clean benchmark names for consistent display."""
|
|
|
|
| 330 |
|
| 331 |
def main():
|
| 332 |
"""Main application."""
|
| 333 |
+
st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
|
| 334 |
+
unsafe_allow_html=True)
|
|
|
|
|
|
|
| 335 |
|
| 336 |
# Load data
|
| 337 |
+
df = load_comprehensive_data()
|
| 338 |
+
stderr_df = load_stderr_data()
|
| 339 |
+
|
| 340 |
+
# Debug information (hidden in an expander)
|
| 341 |
+
with st.expander("🔧 Debug Information", expanded=False):
|
| 342 |
+
st.write(f"**Data Shape:** {df.shape if not df.empty else 'No data'}")
|
| 343 |
+
st.write(f"**Columns:** {len(df.columns) if not df.empty else 0}")
|
| 344 |
+
st.write(f"**Models:** {len(df.index) if not df.empty else 0}")
|
| 345 |
+
if not df.empty:
|
| 346 |
+
st.write(f"**Sample columns:** {list(df.columns[:5])}")
|
| 347 |
+
st.write(f"**Data types:** {df.dtypes.value_counts().to_dict()}")
|
| 348 |
+
st.write(f"**Missing values per column:** {df.isnull().sum().sum()}")
|
| 349 |
+
st.write(f"**StdErr data available:** {'Yes' if stderr_df is not None else 'No'}")
|
| 350 |
|
| 351 |
if df.empty:
|
| 352 |
+
st.error("No data available. Please check that the CSV files are properly uploaded and accessible.")
|
| 353 |
return
|
| 354 |
|
| 355 |
# Filter to target benchmarks
|
| 356 |
+
df = filter_target_benchmarks(df)
|
| 357 |
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
|
| 358 |
|
| 359 |
# Sidebar
|
|
|
|
| 381 |
for category in selected_categories:
|
| 382 |
for bench_name in benchmark_categories[category]:
|
| 383 |
actual_name = target_benchmarks.get(bench_name)
|
| 384 |
+
if actual_name in df.columns:
|
| 385 |
filtered_benchmarks.append(actual_name)
|
| 386 |
|
| 387 |
if filtered_benchmarks:
|
| 388 |
+
df_display = df[filtered_benchmarks].copy()
|
| 389 |
else:
|
| 390 |
+
df_display = df.copy()
|
| 391 |
|
| 392 |
# Zero filtering
|
| 393 |
filter_zeros = st.sidebar.checkbox("Filter out zero/near-zero values", value=False)
|
|
|
|
| 466 |
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
|
| 467 |
|
| 468 |
for col in df.columns:
|
| 469 |
+
coverage = int(df[col].notna().sum()) # Ensure integer type
|
| 470 |
category = col_to_category.get(col, 'Unknown')
|
| 471 |
clean_name = clean_benchmark_name(col)
|
| 472 |
+
|
| 473 |
+
# Ensure we have valid data
|
| 474 |
+
if coverage >= 0: # Only include valid coverage counts
|
| 475 |
+
coverage_data.append({
|
| 476 |
+
'Benchmark': str(clean_name), # Ensure string type
|
| 477 |
+
'Coverage': coverage,
|
| 478 |
+
'Percentage': float(coverage / len(df) * 100), # Ensure float type
|
| 479 |
+
'Category': str(category) # Ensure string type
|
| 480 |
+
})
|
| 481 |
+
|
| 482 |
+
if coverage_data: # Only create plot if we have data
|
| 483 |
+
coverage_df = pd.DataFrame(coverage_data).sort_values('Coverage', ascending=True)
|
| 484 |
+
|
| 485 |
+
# Ensure data types are correct
|
| 486 |
+
coverage_df['Coverage'] = coverage_df['Coverage'].astype(int)
|
| 487 |
+
coverage_df['Percentage'] = coverage_df['Percentage'].astype(float)
|
| 488 |
+
coverage_df['Benchmark'] = coverage_df['Benchmark'].astype(str)
|
| 489 |
+
coverage_df['Category'] = coverage_df['Category'].astype(str)
|
| 490 |
+
|
| 491 |
+
# Create bar plot with explicit parameters
|
| 492 |
+
fig = px.bar(coverage_df,
|
| 493 |
+
x='Coverage',
|
| 494 |
+
y='Benchmark',
|
| 495 |
+
color='Category',
|
| 496 |
+
color_discrete_map=colors,
|
| 497 |
+
title="Model Coverage by Benchmark",
|
| 498 |
+
labels={'Coverage': 'Number of Models'},
|
| 499 |
+
orientation='h',
|
| 500 |
+
text='Coverage') # Add text labels to bars
|
| 501 |
+
|
| 502 |
+
# Update layout for better visibility
|
| 503 |
+
fig.update_traces(texttemplate='%{text}', textposition='outside')
|
| 504 |
+
fig.update_layout(
|
| 505 |
+
height=max(400, len(coverage_df) * 25), # Dynamic height based on data
|
| 506 |
+
showlegend=True,
|
| 507 |
+
xaxis_title="Number of Models",
|
| 508 |
+
yaxis_title="Benchmark"
|
| 509 |
+
)
|
| 510 |
+
|
| 511 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 512 |
+
else:
|
| 513 |
+
st.warning("No coverage data available to display.")
|
| 514 |
|
| 515 |
# Quick correlation insights
|
| 516 |
st.subheader("Quick Correlation Insights")
|
requirements.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
streamlit>=1.28.0
|
| 2 |
-
pandas>=2.0.0
|
| 3 |
-
numpy>=1.24.0
|
| 4 |
-
plotly>=5.15.0
|
| 5 |
-
scipy>=1.10.0
|
| 6 |
-
matplotlib>=3.7.0
|
| 7 |
-
seaborn>=0.12.0
|
|
|
|
| 1 |
streamlit>=1.28.0
|
| 2 |
+
pandas>=2.0.0,<2.3.0
|
| 3 |
+
numpy>=1.24.0,<2.0.0
|
| 4 |
+
plotly>=5.15.0,<6.0.0
|
| 5 |
+
scipy>=1.10.0,<2.0.0
|
| 6 |
+
matplotlib>=3.7.0,<4.0.0
|
| 7 |
+
seaborn>=0.12.0,<1.0.0
|