Spaces:

mlfoundations
/

OpenThoughts_data_explorer

Running

App Files Files Community

jmercat committed on Jun 2, 2025

Commit

8daa4df

1 Parent(s): 8a4c76f

Initial commit

Browse files

Files changed (6) hide show

.gitattributes +5 -35
README.md +75 -13
app.py +946 -0
benchmark_standard_errors.csv +3 -0
comprehensive_benchmark_scores.csv +3 -0
requirements.txt +16 -3

.gitattributes CHANGED Viewed

@@ -1,35 +1,5 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.csv filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,20 +1,82 @@
 ---
-title: OpenThoughts Data Explorer
-emoji: 🚀
-colorFrom: red
 colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
 pinned: false
-short_description: Exploring correlations  between LLMs performance
-license: apache-2.0
 ---
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

 ---
+title: OpenThoughts Model Benchmark Explorer
+emoji: 📊
+colorFrom: blue
 colorTo: red
+sdk: streamlit
+sdk_version: 1.28.0
+app_file: benchmark_explorer_app.py
 pinned: false
+license: mit
 ---
+# 🔬 OpenThoughts Evalchemy Benchmark Explorer
+Exploring correlations and relationships in LLM performance across different reasoning benchmarks.
+This explorer is built on top of the [OpenThoughts](https://github.com/open-thoughts/open-thoughts) project to explore the models that we have trained and evaluated, as well as external models that we have evaluated.
+All evaluation results were produced and logged using [Evalchemy](https://github.com/mlfoundations/evalchemy).
+## Features
+### 📊 Overview Dashboard
+- Key metrics and dataset statistics
+- Benchmark coverage visualization
+- Quick correlation insights
+- Category-based analysis
+### 🔥 Interactive Heatmap
+- Multiple correlation methods (Pearson, Spearman, Kendall)
+- Interactive hover tooltips
+- Real-time correlation statistics
+- Distribution analysis
+### 📈 Scatter Plot Explorer
+- Dynamic benchmark selection
+- Interactive scatter plots with regression lines
+- Multiple correlation coefficients
+- Data point exploration
+### 🎯 Model Performance Analysis
+- Model search and filtering
+- Performance rankings
+- Radar chart comparisons
+- Side-by-side model analysis
+### 📋 Statistical Summary
+- Comprehensive dataset statistics
+- Benchmark-wise analysis
+- Export capabilities
+- Correlation summaries
+### 🔬 Uncertainty Analysis
+- Measurement precision analysis
+- Error bar visualizations with 95% CI
+- Signal-to-noise ratios
+- Uncertainty-aware correlations
+## Benchmark Categories
+- **Math** (red): AIME24, AIME25, AMC23, MATH500
+- **Code** (blue): CodeElo, CodeForces, LiveCodeBench v2 & v5
+- **Science** (green): GPQADiamond, JEEBench
+- **General** (orange): MMLUPro, HLE
+## Data Filtering Options
+- Category-based filtering
+- Zero-value filtering with threshold
+- Minimum coverage requirements
+- Dynamic slider ranges based on actual data
+## Architecture
+- **Frontend**: Streamlit with Plotly interactive visualizations
+- **Backend**: Pandas/NumPy for data processing, SciPy for statistics
+- **Caching**: Smart caching for performance optimization
+- **Real-time**: On-the-fly correlation computation for dynamic filtering
+## Usage
+The application automatically loads benchmark data and provides six specialized analysis modules. Use the sidebar controls to filter data and customize the analysis based on your needs.
+Perfect for researchers, practitioners, and anyone interested in understanding the relationships between different AI evaluation benchmarks.

app.py ADDED Viewed

	@@ -0,0 +1,946 @@

+#!/usr/bin/env python3
+"""
+Interactive Benchmark Explorer
+A comprehensive web application for exploring OpenThoughts benchmark correlations and model performance
+"""
+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import seaborn as sns
+import matplotlib.pyplot as plt
+from scipy.stats import pearsonr, spearmanr, kendalltau
+from scipy.optimize import minimize
+import ast
+import io
+import base64
+from itertools import combinations
+import warnings
+warnings.filterwarnings('ignore')
+# Configure the Streamlit page: browser-tab title and icon, wide layout,
+# and a sidebar that starts expanded.
+st.set_page_config(
+    page_title="OpenThoughts Evalchemy Benchmark Explorer",
+    page_icon="📊",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Inject custom CSS classes (main header, metric card, correlation-strength
+# colors, per-category colors). unsafe_allow_html=True is passed so the
+# <style> element is emitted as HTML.
+st.markdown("""
+<style>
+    .main-header {
+        font-size: 2.5rem;
+        font-weight: bold;
+        color: #1f77b4;
+        text-align: center;
+        margin-bottom: 2rem;
+    }
+    .metric-card {
+        background-color: #f8f9fa;
+        padding: 1rem;
+        border-radius: 0.5rem;
+        border-left: 4px solid #1f77b4;
+        margin: 0.5rem 0;
+    }
+    .correlation-high { color: #d73027; font-weight: bold; }
+    .correlation-medium { color: #fdae61; font-weight: bold; }
+    .correlation-low { color: #4575b4; font-weight: bold; }
+    .category-math { color: #d73027; font-weight: bold; }
+    .category-code { color: #1f78b4; font-weight: bold; }
+    .category-science { color: #33a02c; font-weight: bold; }
+    .category-general { color: #ff7f00; font-weight: bold; }
+</style>
+""", unsafe_allow_html=True)
+@st.cache_data
+def load_comprehensive_data():
+    """Load and clean the comprehensive benchmark data."""
+    try:
+        df = pd.read_csv("comprehensive_benchmark_scores.csv", index_col=0)
+        # Clean the data - handle list-like values stored as strings
+        for col in df.columns:
+            def extract_value(x):
+                if pd.isna(x):
+                    return np.nan
+                if isinstance(x, str) and x.startswith('['):
+                    try:
+                        return ast.literal_eval(x)[0]
+                    except:
+                        return np.nan
+                return x
+            df[col] = df[col].apply(extract_value)
+            df[col] = pd.to_numeric(df[col], errors='coerce')
+        # Filter to only models that have data for at least a few benchmarks
+        min_benchmarks = 3
+        df = df.dropna(thresh=min_benchmarks, axis=0)
+        return df
+    except FileNotFoundError:
+        st.error("Could not find comprehensive_benchmark_scores.csv. Please ensure the data file exists.")
+        return pd.DataFrame()
+@st.cache_data
+def load_stderr_data():
+    """Load and clean standard error data."""
+    try:
+        stderr_df = pd.read_csv("benchmark_standard_errors.csv", index_col=0)
+        # Clean the data
+        for col in stderr_df.columns:
+            def extract_value(x):
+                if pd.isna(x):
+                    return np.nan
+                if isinstance(x, str) and x.startswith('['):
+                    try:
+                        return ast.literal_eval(x)[0]
+                    except:
+                        return np.nan
+                return x
+            stderr_df[col] = stderr_df[col].apply(extract_value)
+            stderr_df[col] = pd.to_numeric(stderr_df[col], errors='coerce')
+        return stderr_df
+    except FileNotFoundError:
+        return None
+def clean_benchmark_name(name):
+    """Clean benchmark names for consistent display."""
+    return (name.replace("LiveCodeBench_accuracy_avg", "LiveCodeBenchv2")
+            .replace('_accuracy_avg', '')
+            .replace('_accuracy', '')
+            .replace('LiveCodeBench', 'LCB')
+            .replace('GPQADiamond', 'GPQAD')
+            )
+def get_focused_benchmark_mapping():
+    """Define the target benchmarks and categories."""
+    target_benchmarks = {
+        # Math benchmarks
+        'AIME24': 'AIME24_accuracy_avg',
+        'AIME25': 'AIME25_accuracy_avg',
+        'AMC23': 'AMC23_accuracy_avg',
+        'MATH500': 'MATH500_accuracy',
+        # Code benchmarks
+        'CodeElo': 'CodeElo_accuracy_avg',
+        'CodeForces': 'CodeForces_accuracy_avg',
+        'LCBv2': 'LiveCodeBench_accuracy_avg',
+        'LCBv5': 'LiveCodeBenchv5_accuracy_avg',
+        # Science benchmarks
+        'GPQADiamond': 'GPQADiamond_accuracy_avg',
+        'JEEBench': 'JEEBench_accuracy_avg',
+        # General benchmarks
+        'MMLUPro': 'MMLUPro_accuracy_avg',
+        'HLE': 'HLE_accuracy_avg'
+    }
+    benchmark_categories = {
+        'Math': ['AIME24', 'AIME25', 'AMC23', 'MATH500'],
+        'Code': ['CodeElo', 'CodeForces', 'LCBv2', 'LCBv5'],
+        'Science': ['GPQADiamond', 'JEEBench'],
+        'General': ['MMLUPro', 'HLE']
+    }
+    colors = {'Math': '#d73027', 'Code': '#1f78b4', 'Science': '#33a02c', 'General': '#ff7f00'}
+    # Create reverse mapping
+    col_to_category = {}
+    for category, bench_list in benchmark_categories.items():
+        for bench_name in bench_list:
+            actual_name = target_benchmarks.get(bench_name)
+            if actual_name:
+                col_to_category[actual_name] = category
+    return target_benchmarks, benchmark_categories, colors, col_to_category
+def compute_correlations(df, method='pearson'):
+    """Compute correlation matrix with the specified method."""
+    if method == 'pearson':
+        return df.corr(method='pearson')
+    elif method == 'spearman':
+        return df.corr(method='spearman')
+    elif method == 'kendall':
+        return df.corr(method='kendall')
+def create_interactive_heatmap(corr_matrix, title="Correlation Heatmap"):
+    """Create an interactive correlation heatmap using Plotly."""
+    target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
+    # Get clean names for display
+    clean_names = [clean_benchmark_name(name) for name in corr_matrix.columns]
+    # Convert to percentages for display
+    corr_matrix_pct = (corr_matrix * 100).round(1)
+    # Create hover text
+    hover_text = []
+    for i, bench1 in enumerate(corr_matrix.columns):
+        hover_row = []
+        for j, bench2 in enumerate(corr_matrix.columns):
+            if i == j:
+                hover_row.append(f"{clean_names[i]}<br>Reliability: 100%")
+            else:
+                corr_val = corr_matrix_pct.iloc[i, j]
+                if pd.isna(corr_val):
+                    hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>No data")
+                else:
+                    hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>Correlation: {corr_val:.1f}%")
+        hover_text.append(hover_row)
+    # Create the heatmap
+    fig = go.Figure(data=go.Heatmap(
+        z=corr_matrix.values,
+        x=clean_names,
+        y=clean_names,
+        colorscale='RdBu_r',
+        zmid=0,
+        text=corr_matrix_pct.values,
+        texttemplate="%{text}",
+        textfont={"size": 10},
+        hoverinfo='text',
+        hovertext=hover_text,
+        colorbar=dict(title="Correlation", tickformat=".2f")
+    ))
+    # Update layout
+    fig.update_layout(
+        title=title,
+        xaxis_title="",
+        yaxis_title="",
+        width=800,
+        height=800,
+        font=dict(size=12)
+    )
+    # Color the axis labels by category
+    for i, bench in enumerate(corr_matrix.columns):
+        category = col_to_category.get(bench, 'Unknown')
+        color = colors.get(category, 'black')
+    return fig
+def create_scatter_plot(df, x_bench, y_bench, stderr_df=None):
+    """Create an interactive scatter plot between two benchmarks."""
+    if x_bench not in df.columns or y_bench not in df.columns:
+        return None
+    # Get common data
+    common_data = df[[x_bench, y_bench]].dropna()
+    if len(common_data) < 3:
+        return None
+    x_vals = common_data[x_bench]
+    y_vals = common_data[y_bench]
+    # Calculate correlation
+    corr, p_val = pearsonr(x_vals, y_vals)
+    # Create figure
+    fig = go.Figure()
+    # Add scatter points
+    fig.add_trace(go.Scatter(
+        x=x_vals,
+        y=y_vals,
+        mode='markers',
+        text=common_data.index,
+        hovertemplate=(
+            "<b>%{text}</b><br>" +
+            f"{clean_benchmark_name(x_bench)}: %{{x:.3f}}<br>" +
+            f"{clean_benchmark_name(y_bench)}: %{{y:.3f}}<br>" +
+            "<extra></extra>"
+        ),
+        marker=dict(size=8, opacity=0.7, color='steelblue')
+    ))
+    # Add regression line
+    z = np.polyfit(x_vals, y_vals, 1)
+    p = np.poly1d(z)
+    x_line = np.linspace(x_vals.min(), x_vals.max(), 100)
+    fig.add_trace(go.Scatter(
+        x=x_line,
+        y=p(x_line),
+        mode='lines',
+        name=f'r = {corr:.3f}, p = {p_val:.3f}',
+        line=dict(color='red', dash='dash')
+    ))
+    # Update layout
+    fig.update_layout(
+        title=f"{clean_benchmark_name(y_bench)} vs {clean_benchmark_name(x_bench)}",
+        xaxis_title=clean_benchmark_name(x_bench),
+        yaxis_title=clean_benchmark_name(y_bench),
+        showlegend=True,
+        width=600,
+        height=500
+    )
+    return fig
+def filter_target_benchmarks(df):
+    """Filter dataframe to only include target benchmarks."""
+    target_benchmarks, _, _, _ = get_focused_benchmark_mapping()
+    available_benchmarks = []
+    for display_name, actual_name in target_benchmarks.items():
+        if actual_name in df.columns:
+            available_benchmarks.append(actual_name)
+    return df[available_benchmarks].copy()
+def main():
+    """Main application."""
+    # Header
+    st.markdown('<div class="main-header">🔬 OpenThoughts Evalchemy Benchmark Explorer</div>', unsafe_allow_html=True)
+    st.markdown("**Explore correlations and relationships between OpenThoughts model performance across different benchmarks**")
+    # Load data
+    with st.spinner("Loading benchmark data..."):
+        df = load_comprehensive_data()
+        stderr_df = load_stderr_data()
+    if df.empty:
+        st.error("No data available. Please check that the data files exist.")
+        return
+    # Filter to target benchmarks
+    df_filtered = filter_target_benchmarks(df)
+    target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
+    # Sidebar
+    st.sidebar.header("🎛️ Controls")
+    # Analysis mode selection
+    analysis_mode = st.sidebar.selectbox(
+        "Choose Analysis Mode",
+        ["📊 Overview Dashboard", "🔥 Interactive Heatmap", "📈 Scatter Plot Explorer",
+         "🎯 Model Performance", "📋 Statistical Summary", "🔬 Uncertainty Analysis"]
+    )
+    # Data filtering options
+    st.sidebar.subheader("Data Filters")
+    # Category filter
+    selected_categories = st.sidebar.multiselect(
+        "Select Benchmark Categories",
+        list(benchmark_categories.keys()),
+        default=list(benchmark_categories.keys())
+    )
+    # Filter benchmarks based on selected categories
+    filtered_benchmarks = []
+    for category in selected_categories:
+        for bench_name in benchmark_categories[category]:
+            actual_name = target_benchmarks.get(bench_name)
+            if actual_name in df_filtered.columns:
+                filtered_benchmarks.append(actual_name)
+    if filtered_benchmarks:
+        df_display = df_filtered[filtered_benchmarks].copy()
+    else:
+        df_display = df_filtered.copy()
+    # Zero filtering
+    filter_zeros = st.sidebar.checkbox("Filter out zero/near-zero values", value=False)
+    if filter_zeros:
+        for col in df_display.columns:
+            df_display.loc[(df_display[col] == 0) | (df_display[col] < 0.01), col] = np.nan
+    # Minimum data points filter
+    coverage_counts = [df_display[col].notna().sum() for col in df_display.columns]
+    if coverage_counts:
+        min_coverage = min(coverage_counts)
+        max_coverage = max(coverage_counts)
+        default_min = max(10, min_coverage)  # Default to at least 10 or minimum available
+        min_models = st.sidebar.slider(
+            "Minimum models per benchmark",
+            min_value=min_coverage,
+            max_value=max_coverage,
+            value=default_min,
+            help=f"Range: {min_coverage} to {max_coverage} models"
+        )
+    else:
+        min_models = 10
+    # Apply the minimum models filter
+    valid_benchmarks = []
+    for col in df_display.columns:
+        if df_display[col].notna().sum() >= min_models:
+            valid_benchmarks.append(col)
+    df_display = df_display[valid_benchmarks]
+    # Main content based on analysis mode
+    if analysis_mode == "📊 Overview Dashboard":
+        show_overview_dashboard(df_display, stderr_df)
+    elif analysis_mode == "🔥 Interactive Heatmap":
+        show_interactive_heatmap(df_display)
+    elif analysis_mode == "📈 Scatter Plot Explorer":
+        show_scatter_explorer(df_display, stderr_df)
+    elif analysis_mode == "🎯 Model Performance":
+        show_model_performance(df_display)
+    elif analysis_mode == "📋 Statistical Summary":
+        show_statistical_summary(df_display)
+    elif analysis_mode == "🔬 Uncertainty Analysis":
+        show_uncertainty_analysis(df_display, stderr_df)
+def show_overview_dashboard(df, stderr_df):
+    """Render the overview page.
+
+    Shows four headline metrics, a horizontal bar chart of how many models
+    have a score for each benchmark, the five strongest pairwise Pearson
+    correlations, and the mean correlation within and across categories.
+
+    Args:
+        df: Score table; rows are counted as models and columns as benchmarks.
+        stderr_df: Not read by this function.
+    """
+    st.header("📊 Overview Dashboard")
+    # Key metrics
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        st.metric("Models", len(df))
+    with col2:
+        st.metric("Benchmarks", len(df.columns))
+    with col3:
+        # Number of non-missing cells in the whole table
+        total_evals = df.notna().sum().sum()
+        st.metric("Total Evaluations", f"{total_evals:,}")
+    with col4:
+        # Mean over benchmarks of the fraction of models that have a score
+        avg_coverage = (df.notna().sum() / len(df)).mean() * 100
+        st.metric("Avg Coverage", f"{avg_coverage:.1f}%")
+    # Benchmark coverage chart
+    st.subheader("Benchmark Coverage")
+    coverage_data = []
+    target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
+    for col in df.columns:
+        coverage = df[col].notna().sum()
+        category = col_to_category.get(col, 'Unknown')
+        clean_name = clean_benchmark_name(col)
+        coverage_data.append({
+            'Benchmark': clean_name,
+            'Coverage': coverage,
+            'Percentage': coverage / len(df) * 100,
+            'Category': category
+        })
+    coverage_df = pd.DataFrame(coverage_data).sort_values('Coverage', ascending=True)
+    fig = px.bar(coverage_df,
+                 x='Coverage',
+                 y='Benchmark',
+                 color='Category',
+                 color_discrete_map=colors,
+                 title="Model Coverage by Benchmark",
+                 labels={'Coverage': 'Number of Models'},
+                 orientation='h')
+    fig.update_layout(height=400)
+    st.plotly_chart(fig, use_container_width=True)
+    # Quick correlation insights
+    st.subheader("Quick Correlation Insights")
+    corr_matrix = compute_correlations(df, 'pearson')
+    # Collect each unordered benchmark pair once (j starts at i+1), skipping
+    # pairs whose correlation is NaN
+    pairs = []
+    for i, bench1 in enumerate(corr_matrix.columns):
+        for j, bench2 in enumerate(corr_matrix.columns[i+1:], i+1):
+            if not pd.isna(corr_matrix.iloc[i, j]):
+                cat1 = col_to_category.get(bench1, 'Unknown')
+                cat2 = col_to_category.get(bench2, 'Unknown')
+                pairs.append((bench1, bench2, corr_matrix.iloc[i, j], cat1, cat2))
+    # Rank by absolute value, so strong negative correlations rank high too
+    pairs.sort(key=lambda x: abs(x[2]), reverse=True)
+    col1, col2 = st.columns(2)
+    with col1:
+        st.markdown("**🔥 Top 5 Highest Correlations**")
+        for i, (bench1, bench2, corr, cat1, cat2) in enumerate(pairs[:5]):
+            # ✅ marks a same-category pair, 🔀 a cross-category pair
+            same_cat = "✅" if cat1 == cat2 else "🔀"
+            st.write(f"{i+1}. {clean_benchmark_name(bench1)} ↔ {clean_benchmark_name(bench2)}")
+            st.write(f"   r = {corr:.3f} {same_cat}")
+    with col2:
+        st.markdown("**📊 Category Analysis**")
+        # Signed correlations (not absolute values) are averaged here
+        within_cat = [p[2] for p in pairs if p[3] == p[4]]
+        across_cat = [p[2] for p in pairs if p[3] != p[4]]
+        if within_cat:
+            st.write(f"Within-category avg: {np.mean(within_cat):.3f}")
+        if across_cat:
+            st.write(f"Across-category avg: {np.mean(across_cat):.3f}")
+        st.write(f"Total pairs analyzed: {len(pairs)}")
+def show_interactive_heatmap(df):
+    """Render the heatmap page.
+
+    Lets the user pick a correlation method, draws the correlation heatmap,
+    then shows summary metrics and a histogram of the pairwise correlations.
+
+    Args:
+        df: Score table whose columns are correlated against each other.
+    """
+    st.header("🔥 Interactive Correlation Heatmap")
+    # Correlation method selection; the selector goes in the narrower right
+    # column of a 3:1 split (col1 is not used in this function)
+    col1, col2 = st.columns([3, 1])
+    with col2:
+        corr_method = st.selectbox(
+            "Correlation Method",
+            ["pearson", "spearman", "kendall"]
+        )
+    # Compute correlation matrix
+    corr_matrix = compute_correlations(df, corr_method)
+    # Create and display heatmap
+    fig = create_interactive_heatmap(corr_matrix, f"{corr_method.capitalize()} Correlation Matrix")
+    st.plotly_chart(fig, use_container_width=True)
+    # Correlation statistics
+    st.subheader("Correlation Statistics")
+    # Keep only the strict upper triangle (k=1 excludes the diagonal) so each
+    # pair is counted once, then drop pairs without a value
+    mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
+    corr_values = corr_matrix.where(mask).stack().dropna()
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        st.metric("Mean Correlation", f"{corr_values.mean():.3f}")
+    with col2:
+        st.metric("Median Correlation", f"{corr_values.median():.3f}")
+    with col3:
+        st.metric("Max Correlation", f"{corr_values.max():.3f}")
+    with col4:
+        st.metric("Min Correlation", f"{corr_values.min():.3f}")
+    # Distribution of correlations
+    st.subheader("Correlation Distribution")
+    fig = px.histogram(corr_values,
+                       nbins=20,
+                       title="Distribution of Pairwise Correlations",
+                       labels={'value': 'Correlation Coefficient', 'count': 'Frequency'})
+    st.plotly_chart(fig, use_container_width=True)
+def show_scatter_explorer(df, stderr_df):
+    """Render the scatter-plot page for a user-selected benchmark pair.
+
+    Shows the scatter plot, Pearson / Spearman / Kendall coefficients with
+    their p-values, and the table of models that have both scores.
+
+    Args:
+        df: Score table; its columns populate both benchmark selectors.
+        stderr_df: Passed through to create_scatter_plot.
+    """
+    st.header("📈 Scatter Plot Explorer")
+    # Benchmark selection
+    col1, col2 = st.columns(2)
+    with col1:
+        x_benchmark = st.selectbox(
+            "X-axis Benchmark",
+            df.columns,
+            format_func=clean_benchmark_name
+        )
+    with col2:
+        # Preselect the second column when there is one, so the two
+        # selectors start on different benchmarks
+        y_benchmark = st.selectbox(
+            "Y-axis Benchmark",
+            df.columns,
+            index=1 if len(df.columns) > 1 else 0,
+            format_func=clean_benchmark_name
+        )
+    if x_benchmark and y_benchmark and x_benchmark != y_benchmark:
+        # Create scatter plot; a None result means the pair could not be plotted
+        fig = create_scatter_plot(df, x_benchmark, y_benchmark, stderr_df)
+        if fig:
+            st.plotly_chart(fig, use_container_width=True)
+            # Additional statistics on rows where both benchmarks have a value
+            common_data = df[[x_benchmark, y_benchmark]].dropna()
+            if len(common_data) >= 3:
+                col1, col2, col3 = st.columns(3)
+                # Correlation metrics
+                pearson_r, pearson_p = pearsonr(common_data[x_benchmark], common_data[y_benchmark])
+                spearman_r, spearman_p = spearmanr(common_data[x_benchmark], common_data[y_benchmark])
+                kendall_r, kendall_p = kendalltau(common_data[x_benchmark], common_data[y_benchmark])
+                with col1:
+                    st.metric("Pearson r", f"{pearson_r:.3f}")
+                    st.caption(f"p = {pearson_p:.3f}")
+                with col2:
+                    st.metric("Spearman ρ", f"{spearman_r:.3f}")
+                    st.caption(f"p = {spearman_p:.3f}")
+                with col3:
+                    st.metric("Kendall τ", f"{kendall_r:.3f}")
+                    st.caption(f"p = {kendall_p:.3f}")
+                # Show data table
+                st.subheader("Data Points")
+                display_data = common_data.copy()
+                display_data.columns = [clean_benchmark_name(col) for col in display_data.columns]
+                st.dataframe(display_data, use_container_width=True)
+        else:
+            st.warning("Insufficient data for the selected benchmark pair.")
+    else:
+        st.info("Please select two different benchmarks to compare.")
+def show_model_performance(df):
+    """Show model performance analysis."""
+    st.header("🎯 Model Performance Analysis")
+    # Model search
+    search_term = st.text_input("🔍 Search for models", placeholder="Enter model name or part of name")
+    if search_term:
+        matching_models = df.index[df.index.str.contains(search_term, case=False, na=False)]
+        if len(matching_models) > 0:
+            df_display = df.loc[matching_models]
+        else:
+            st.warning(f"No models found matching '{search_term}'")
+            df_display = df
+    else:
+        df_display = df
+    # Performance ranking
+    st.subheader("Model Rankings")
+    # Calculate average performance (excluding NaN)
+    model_avg_scores = df_display.mean(axis=1, skipna=True).sort_values(ascending=False)
+    # Top performers
+    col1, col2 = st.columns(2)
+    with col1:
+        st.markdown("**🏆 Top 10 Models (by average score)**")
+        for i, (model, score) in enumerate(model_avg_scores.head(10).items()):
+            st.write(f"{i+1}. {model.split('/')[-1]}: {score:.3f}")
+    with col2:
+        st.markdown("**📊 Performance Distribution**")
+        fig = px.histogram(model_avg_scores,
+                          nbins=20,
+                          title="Distribution of Average Model Scores")
+        st.plotly_chart(fig, use_container_width=True)
+    # Model comparison
+    st.subheader("Model Comparison")
+    selected_models = st.multiselect(
+        "Select models to compare",
+        df_display.index.tolist(),
+        default=model_avg_scores.head(3).index.tolist()
+    )
+    if selected_models:
+        comparison_data = df_display.loc[selected_models].T
+        comparison_data.index = [clean_benchmark_name(idx) for idx in comparison_data.index]
+        # Radar chart
+        if len(selected_models) <= 5:  # Only for manageable number of models
+            fig = go.Figure()
+            for model in selected_models:
+                model_data = df_display.loc[model].dropna()
+                benchmarks = [clean_benchmark_name(b) for b in model_data.index]
+                values = model_data.values.tolist()
+                # Close the radar chart
+                values += values[:1]
+                benchmarks += benchmarks[:1]
+                fig.add_trace(go.Scatterpolar(
+                    r=values,
+                    theta=benchmarks,
+                    fill='toself',
+                    name=model.split('/')[-1]
+                ))
+            fig.update_layout(
+                polar=dict(
+                    radialaxis=dict(
+                        visible=True,
+                        range=[0, 1]
+                    )),
+                showlegend=True,
+                title="Model Performance Radar Chart"
+            )
+            st.plotly_chart(fig, use_container_width=True)
+        # Detailed comparison table
+        st.subheader("Detailed Comparison")
+        st.dataframe(comparison_data, use_container_width=True)
+def show_statistical_summary(df):
+    """Render the statistical-summary page.
+
+    Shows overall coverage and score statistics, a per-benchmark statistics
+    table, and mean/median/max/min of the pairwise correlations for each of
+    the Pearson, Spearman and Kendall methods.
+
+    Args:
+        df: Score table; rows are counted as models and columns as benchmarks.
+    """
+    st.header("📋 Statistical Summary")
+    # Overall statistics
+    st.subheader("Dataset Statistics")
+    col1, col2 = st.columns(2)
+    with col1:
+        st.markdown("**Data Coverage**")
+        # Every cell of the table counts as one possible evaluation
+        total_possible = len(df) * len(df.columns)
+        total_actual = df.notna().sum().sum()
+        coverage_pct = (total_actual / total_possible) * 100
+        st.write(f"Total possible evaluations: {total_possible:,}")
+        st.write(f"Actual evaluations: {total_actual:,}")
+        st.write(f"Overall coverage: {coverage_pct:.1f}%")
+    with col2:
+        st.markdown("**Score Statistics**")
+        # Pool every non-missing score across all benchmarks
+        all_scores = df.values.flatten()
+        all_scores = all_scores[~pd.isna(all_scores)]
+        st.write(f"Mean score: {np.mean(all_scores):.3f}")
+        st.write(f"Median score: {np.median(all_scores):.3f}")
+        # NOTE: np.std defaults to ddof=0, whereas the per-benchmark 'Std'
+        # below uses pandas Series.std, which defaults to ddof=1.
+        st.write(f"Std deviation: {np.std(all_scores):.3f}")
+    # Benchmark-wise statistics
+    st.subheader("Benchmark Statistics")
+    benchmark_stats = []
+    target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping()
+    for col in df.columns:
+        scores = df[col].dropna()
+        # Benchmarks with no scores at all are left out of the table
+        if len(scores) > 0:
+            benchmark_stats.append({
+                'Benchmark': clean_benchmark_name(col),
+                'Category': col_to_category.get(col, 'Unknown'),
+                'Count': len(scores),
+                'Mean': scores.mean(),
+                'Median': scores.median(),
+                'Std': scores.std(),
+                'Min': scores.min(),
+                'Max': scores.max(),
+                'Range': scores.max() - scores.min()
+            })
+    stats_df = pd.DataFrame(benchmark_stats)
+    st.dataframe(stats_df, use_container_width=True)
+    # Correlation summary
+    st.subheader("Correlation Analysis Summary")
+    for method in ['pearson', 'spearman', 'kendall']:
+        corr_matrix = compute_correlations(df, method)
+        # Strict upper triangle (k=1 excludes the diagonal): each pair once
+        mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
+        corr_values = corr_matrix.where(mask).stack().dropna()
+        st.write(f"**{method.capitalize()} Correlations:**")
+        col1, col2, col3, col4 = st.columns(4)
+        with col1:
+            st.metric("Mean", f"{corr_values.mean():.3f}")
+        with col2:
+            st.metric("Median", f"{corr_values.median():.3f}")
+        with col3:
+            st.metric("Max", f"{corr_values.max():.3f}")
+        with col4:
+            st.metric("Min", f"{corr_values.min():.3f}")
+def show_uncertainty_analysis(df, stderr_df):
+    """Show uncertainty analysis if standard error data is available."""
+    st.header("🔬 Uncertainty Analysis")
+    if stderr_df is None:
+        st.warning("Standard error data not available. This analysis requires benchmark_standard_errors.csv")
+        return
+    st.info("This section analyzes measurement uncertainty and reliability of benchmark evaluations.")
+    # Match benchmarks with standard errors
+    matched_benchmarks = []
+    for score_col in df.columns:
+        # Try to find matching stderr column
+        potential_stderr_cols = [
+            f"{score_col}_std_err",
+            f"{score_col.replace('_accuracy', '_accuracy_std_err')}",
+            f"{score_col.replace('_accuracy_avg', '_accuracy_std_err')}"
+        ]
+        for stderr_col in potential_stderr_cols:
+            if stderr_col in stderr_df.columns:
+                matched_benchmarks.append((score_col, stderr_col))
+                break
+    if not matched_benchmarks:
+        st.warning("No matching standard error data found for the selected benchmarks.")
+        return
+    st.success(f"Found standard error data for {len(matched_benchmarks)} benchmarks.")
+    # Measurement precision analysis
+    st.subheader("Measurement Precision")
+    precision_data = []
+    for score_col, stderr_col in matched_benchmarks:
+        scores = df[score_col].dropna()
+        stderrs = stderr_df[stderr_col].dropna()
+        if len(stderrs) > 0:
+            mean_stderr = stderrs.mean()
+            median_stderr = stderrs.median()
+            # Signal-to-noise ratio
+            if len(scores) > 0:
+                signal_std = scores.std()
+                snr = signal_std / mean_stderr if mean_stderr > 0 else float('inf')
+            else:
+                snr = 0
+            precision_data.append({
+                'Benchmark': clean_benchmark_name(score_col),
+                'Mean StdErr': mean_stderr,
+                'Median StdErr': median_stderr,
+                'Signal/Noise': snr,
+                'N Models': len(stderrs)
+            })
+    if precision_data:
+        precision_df = pd.DataFrame(precision_data)
+        st.dataframe(precision_df, use_container_width=True)
+        # Visualization
+        fig = px.scatter(precision_df,
+                        x='Mean StdErr',
+                        y='Signal/Noise',
+                        size='N Models',
+                        hover_name='Benchmark',
+                        title="Measurement Precision: Signal-to-Noise vs Standard Error",
+                        labels={'Signal/Noise': 'Signal-to-Noise Ratio'})
+        st.plotly_chart(fig, use_container_width=True)
+    # Uncertainty-aware scatter plot
+    st.subheader("Uncertainty-Aware Scatter Plot")
+    # Let user select benchmarks with stderr data
+    available_benchmarks = [score_col for score_col, _ in matched_benchmarks]
+    col1, col2 = st.columns(2)
+    with col1:
+        x_bench = st.selectbox(
+            "X-axis Benchmark (with uncertainty)",
+            available_benchmarks,
+            format_func=clean_benchmark_name
+        )
+    with col2:
+        y_bench = st.selectbox(
+            "Y-axis Benchmark (with uncertainty)",
+            available_benchmarks,
+            index=1 if len(available_benchmarks) > 1 else 0,
+            format_func=clean_benchmark_name
+        )
+    if x_bench and y_bench and x_bench != y_bench:
+        # Find corresponding stderr columns
+        x_stderr_col = None
+        y_stderr_col = None
+        for score_col, stderr_col in matched_benchmarks:
+            if score_col == x_bench:
+                x_stderr_col = stderr_col
+            if score_col == y_bench:
+                y_stderr_col = stderr_col
+        if x_stderr_col and y_stderr_col:
+            # Get data
+            x_scores = df[x_bench]
+            y_scores = df[y_bench]
+            x_err = stderr_df[x_stderr_col]
+            y_err = stderr_df[y_stderr_col]
+            # Find common valid data
+            valid_mask = ~(x_scores.isna() | y_scores.isna() | x_err.isna() | y_err.isna())
+            if valid_mask.sum() >= 3:
+                x_clean = x_scores[valid_mask]
+                y_clean = y_scores[valid_mask]
+                x_err_clean = x_err[valid_mask]
+                y_err_clean = y_err[valid_mask]
+                # Create uncertainty scatter plot
+                fig = go.Figure()
+                # Add error bars
+                fig.add_trace(go.Scatter(
+                    x=x_clean,
+                    y=y_clean,
+                    error_x=dict(
+                        type='data',
+                        array=1.96 * x_err_clean,  # 95% CI
+                        visible=True
+                    ),
+                    error_y=dict(
+                        type='data',
+                        array=1.96 * y_err_clean,  # 95% CI
+                        visible=True
+                    ),
+                    mode='markers',
+                    text=x_clean.index,
+                    hovertemplate=(
+                        "<b>%{text}</b><br>" +
+                        f"{clean_benchmark_name(x_bench)}: %{{x:.3f}} ± %{{error_x:.3f}}<br>" +
+                        f"{clean_benchmark_name(y_bench)}: %{{y:.3f}} ± %{{error_y:.3f}}<br>" +
+                        "<extra></extra>"
+                    ),
+                    marker=dict(size=8, opacity=0.7),
+                    name='Models'
+                ))
+                # Add regression line
+                corr, p_val = pearsonr(x_clean, y_clean)
+                z = np.polyfit(x_clean, y_clean, 1)
+                p = np.poly1d(z)
+                x_line = np.linspace(x_clean.min(), x_clean.max(), 100)
+                fig.add_trace(go.Scatter(
+                    x=x_line,
+                    y=p(x_line),
+                    mode='lines',
+                    name=f'r = {corr:.3f}, p = {p_val:.3f}',
+                    line=dict(color='red', dash='dash')
+                ))
+                fig.update_layout(
+                    title=f"Uncertainty-Aware Correlation: {clean_benchmark_name(y_bench)} vs {clean_benchmark_name(x_bench)}",
+                    xaxis_title=f"{clean_benchmark_name(x_bench)} (±95% CI)",
+                    yaxis_title=f"{clean_benchmark_name(y_bench)} (±95% CI)",
+                    showlegend=True
+                )
+                st.plotly_chart(fig, use_container_width=True)
+                st.info(f"Showing {len(x_clean)} models with both score and uncertainty data. Error bars represent 95% confidence intervals.")
+            else:
+                st.warning("Insufficient data with uncertainty estimates for the selected benchmark pair.")
+# Script entry point: call main() only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()

benchmark_standard_errors.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e01a3a6ba4b029de038b88c663b83609a744697ac12ed08cc217096c2f8fda18
+size 630831

comprehensive_benchmark_scores.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5c2fa13f14167b5169d4d56d635629c158789b5c23a3673f1a94529a8ee0de0
+size 415701

requirements.txt CHANGED Viewed

@@ -1,3 +1,16 @@
-altair
-pandas
-streamlit

+fastapi
+uvicorn
+requests
+sqlalchemy
+asyncpg
+aiohttp
+python-json-logger
+psycopg2-binary
+antlr4-python3-runtime==4.11
+streamlit>=1.28.0
+pandas>=2.0.0
+numpy>=1.24.0
+plotly>=5.15.0
+scipy>=1.10.0
+matplotlib>=3.7.0
+seaborn>=0.12.0