Upload leaderboard files
- leaderboard.py +474 -0
- requirements.txt +3 -0
leaderboard.py
ADDED
@@ -0,0 +1,474 @@
import gradio as gr
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots

# Default sample data (will be replaced when a CSV is uploaded)
default_data = pd.DataFrame({
    'model': ['L1_Sentiment_Analysis'] * 24 + ['L2_Advanced_Classifier'] * 24,
    'partition': (['inference'] * 8 + ['test'] * 8 + ['train'] * 8) * 2,
    'topic': (['OVERALL', 'Earnings_Ratings', 'Entertainment', 'Financial_Funds', 'Investment_Banking',
               'Mechanical_Transportation', 'Pharmaceutical', 'Technology'] * 3) * 2,
    'FPR': [0.7603, 0.7831, 0.6161, 0.7772, 0.7010, 0.6869, 0.7979, 0.8701,
            0.7664, 0.8374, 0.6022, 0.8635, 0.6505, 0.6567, 0.7614, 0.8711,
            0.7333, 0.7534, 0.6299, 0.7594, 0.6468, 0.6164, 0.7575, 0.8825] +
           [0.8103, 0.8331, 0.6661, 0.8272, 0.7510, 0.7369, 0.8479, 0.9201,
            0.8164, 0.8874, 0.6522, 0.9135, 0.7005, 0.7067, 0.8114, 0.9211,
            0.7833, 0.8034, 0.6799, 0.8094, 0.6968, 0.6664, 0.8075, 0.9325],
    'Confidence': [0.2397, 0.2169, 0.3839, 0.2228, 0.2990, 0.3131, 0.2021, 0.1299,
                   0.2336, 0.1626, 0.3978, 0.1365, 0.3495, 0.3433, 0.2386, 0.1289,
                   0.2667, 0.2466, 0.3701, 0.2406, 0.3532, 0.3836, 0.2425, 0.1175] +
                  [0.1897, 0.1669, 0.3339, 0.1728, 0.2490, 0.2631, 0.1521, 0.0799,
                   0.1836, 0.1126, 0.3478, 0.0865, 0.2995, 0.2933, 0.1886, 0.0789,
                   0.2167, 0.1966, 0.3201, 0.1906, 0.3032, 0.3336, 0.1925, 0.0675],
    'FDR': [0.3812, 0.3916, 0.4233, 0.3421, 0.3886, 0.3487, 0.4363, 0.3631,
            0.4867, 0.4326, 0.5000, 0.4899, 0.4845, 0.4903, 0.5217, 0.4767,
            0.4653, 0.4592, 0.4652, 0.4615, 0.4672, 0.4749, 0.4727, 0.4607] +
           [0.3312, 0.3416, 0.3733, 0.2921, 0.3386, 0.2987, 0.3863, 0.3131,
            0.4367, 0.3826, 0.4500, 0.4399, 0.4345, 0.4403, 0.4717, 0.4267,
            0.4153, 0.4092, 0.4152, 0.4115, 0.4172, 0.4249, 0.4227, 0.4107],
    'Precision': [0.6188, 0.6084, 0.5767, 0.6579, 0.6114, 0.6513, 0.5637, 0.6369,
                  0.5133, 0.5674, 0.5000, 0.5101, 0.5155, 0.5097, 0.4783, 0.5233,
                  0.5347, 0.5408, 0.5348, 0.5385, 0.5328, 0.5251, 0.5273, 0.5393] +
                 [0.6688, 0.6584, 0.6267, 0.7079, 0.6614, 0.7013, 0.6137, 0.6869,
                  0.5633, 0.6174, 0.5500, 0.5601, 0.5655, 0.5597, 0.5283, 0.5733,
                  0.5847, 0.5908, 0.5848, 0.5885, 0.5828, 0.5751, 0.5773, 0.5893],
    'Recall_Power': [0.7715, 0.7014, 0.6225, 0.8112, 0.6948, 0.6865, 0.8189, 0.9073,
                     0.7914, 0.8321, 0.6680, 0.8550, 0.6623, 0.7439, 0.7534, 0.9049,
                     0.7427, 0.7582, 0.6250, 0.7760, 0.6491, 0.6336, 0.7650, 0.8897] +
                    [0.8215, 0.7514, 0.6725, 0.8612, 0.7448, 0.7365, 0.8689, 0.9573,
                     0.8414, 0.8821, 0.7180, 0.9050, 0.7123, 0.7939, 0.8034, 0.9549,
                     0.7927, 0.8082, 0.6750, 0.8260, 0.6991, 0.6836, 0.8150, 0.9397],
    'Accuracy': [0.5670, 0.5242, 0.5209, 0.6042, 0.5418, 0.5563, 0.5459, 0.6174,
                 0.5155, 0.5435, 0.5259, 0.5048, 0.5093, 0.5350, 0.4862, 0.5276,
                 0.5197, 0.5225, 0.5069, 0.5260, 0.5106, 0.5131, 0.5167, 0.5324] +
                [0.6170, 0.5742, 0.5709, 0.6542, 0.5918, 0.6063, 0.5959, 0.6674,
                 0.5655, 0.5935, 0.5759, 0.5548, 0.5593, 0.5850, 0.5362, 0.5776,
                 0.5697, 0.5725, 0.5569, 0.5760, 0.5606, 0.5631, 0.5667, 0.5824],
    'G_mean': [0.430033, 0.390043, 0.488854, 0.425130, 0.455791, 0.463620, 0.406817, 0.343305,
               0.429966, 0.367831, 0.515490, 0.341625, 0.481117, 0.505352, 0.423983, 0.341528,
               0.445060, 0.432403, 0.480950, 0.432094, 0.478813, 0.493000, 0.430712, 0.323326] +
              [0.480033, 0.440043, 0.538854, 0.475130, 0.505791, 0.513620, 0.456817, 0.393305,
               0.479966, 0.417831, 0.565490, 0.391625, 0.531117, 0.555352, 0.473983, 0.391528,
               0.495060, 0.482403, 0.530950, 0.482094, 0.528813, 0.543000, 0.480712, 0.373326]
})

def load_csv_data(file):
    """Load and validate CSV data."""
    if file is None:
        return default_data, "Using default sample data"

    try:
        df = pd.read_csv(file.name)

        # Validate required columns
        required_cols = ['model', 'partition', 'topic', 'FPR', 'Confidence', 'FDR',
                         'Precision', 'Recall_Power', 'Accuracy', 'G_mean']
        missing_cols = [col for col in required_cols if col not in df.columns]

        if missing_cols:
            return default_data, f"❌ Missing columns: {missing_cols}. Using default data."

        # Drop rows with missing values
        df = df.dropna()

        return df, f"✅ Successfully loaded {len(df)} records with {df['model'].nunique()} models"

    except Exception as e:
        return default_data, f"❌ Error loading CSV: {str(e)}. Using default data."

def create_model_leaderboard(df, partition_filter='all', topic_filter='OVERALL'):
    """Create a leaderboard comparing all models."""
    filtered_df = df.copy()

    if partition_filter != 'all':
        filtered_df = filtered_df[filtered_df['partition'] == partition_filter]

    if topic_filter != 'all':
        filtered_df = filtered_df[filtered_df['topic'] == topic_filter]

    # Average metrics per model
    metrics = ['Precision', 'Recall_Power', 'Accuracy', 'G_mean']
    leaderboard = filtered_df.groupby('model')[metrics].mean().reset_index()

    # Overall score: mean of the three headline metrics
    leaderboard['Overall_Score'] = leaderboard[['Precision', 'Recall_Power', 'Accuracy']].mean(axis=1)
    leaderboard = leaderboard.sort_values('Overall_Score', ascending=False)

    # One subplot per metric, plus one for the overall score
    fig = make_subplots(
        rows=1, cols=len(metrics) + 1,
        subplot_titles=metrics + ['Overall Score']
    )

    palette = px.colors.qualitative.Set3

    for i, metric in enumerate(metrics + ['Overall_Score']):
        for j, (_, row) in enumerate(leaderboard.iterrows()):
            fig.add_trace(
                go.Bar(
                    x=[row['model']],
                    y=[row[metric]],
                    name=row['model'],
                    marker_color=palette[j % len(palette)],  # cycle if more models than colors
                    showlegend=(i == 0),  # legend entries only once, on the first subplot
                    text=f"{row[metric]:.3f}",
                    textposition="outside"
                ),
                row=1, col=i + 1
            )

    fig.update_layout(
        title=f"Model Leaderboard - {partition_filter.title()} | {topic_filter}",
        height=500,
        showlegend=True
    )

    # Constrain all y-axes to [0, 1]
    for i in range(1, len(metrics) + 2):
        fig.update_yaxes(range=[0, 1], row=1, col=i)

    return fig

def create_topic_comparison(df, models_selected=None, metric='Accuracy', partition_filter='all'):
    """Compare selected models across topics."""
    if models_selected is None or len(models_selected) == 0:
        models_selected = df['model'].unique()[:3]  # default to the first 3 models

    # Filter data
    filtered_df = df[df['model'].isin(models_selected)].copy()

    if partition_filter != 'all':
        filtered_df = filtered_df[filtered_df['partition'] == partition_filter]

    # Average across partitions for each model-topic combination
    topic_performance = filtered_df.groupby(['model', 'topic'])[metric].mean().reset_index()

    # Grouped bar chart, one trace per model
    fig = go.Figure()

    palette = px.colors.qualitative.Set3

    for i, model in enumerate(models_selected):
        model_data = topic_performance[topic_performance['model'] == model]
        fig.add_trace(go.Bar(
            name=model,
            # Use the model's own topic labels so bars stay aligned even if
            # a model is missing some topics
            x=model_data['topic'],
            y=model_data[metric],
            marker_color=palette[i % len(palette)],
            text=[f"{val:.3f}" for val in model_data[metric]],
            textposition='outside'
        ))

    fig.update_layout(
        title=f"Model Comparison Across Topics ({metric}) - {partition_filter.title()}",
        xaxis_title="Topics",
        yaxis_title=metric,
        barmode='group',
        height=500,
        xaxis_tickangle=-45,
        yaxis=dict(range=[0, 1])
    )

    return fig

def create_partition_analysis(df, models_selected=None):
    """Analyze model performance across partitions."""
    if models_selected is None or len(models_selected) == 0:
        models_selected = df['model'].unique()[:3]

    filtered_df = df[df['model'].isin(models_selected)].copy()

    # Average across topics for each model-partition combination
    metrics = ['FPR', 'Confidence', 'FDR', 'Precision', 'Recall_Power', 'Accuracy', 'G_mean']
    partition_performance = filtered_df.groupby(['model', 'partition'])[metrics].mean().reset_index()

    # Order partitions train -> test -> inference for consistent display
    partition_order = ['train', 'test', 'inference']
    partition_performance['partition'] = pd.Categorical(
        partition_performance['partition'], categories=partition_order, ordered=True
    )

    # 2x4 grid holding the 7 metric subplots; the 8th cell stays empty
    fig = make_subplots(
        rows=2, cols=4,
        subplot_titles=metrics,
        specs=[[{}, {}, {}, {}],
               [{}, {}, {}, None]]
    )

    palette = px.colors.qualitative.Set3

    # Plot each metric
    for i, metric in enumerate(metrics):
        row = 1 if i < 4 else 2
        col = (i % 4) + 1

        for j, model in enumerate(models_selected):
            model_data = partition_performance[partition_performance['model'] == model]
            model_data = model_data.sort_values('partition')  # train, test, inference

            fig.add_trace(
                go.Bar(
                    name=model,
                    x=model_data['partition'].astype(str),
                    y=model_data[metric],
                    marker_color=palette[j % len(palette)],
                    showlegend=(i == 0),  # legend entries only once
                    text=[f"{val:.3f}" for val in model_data[metric]],
                    textposition='outside'
                ),
                row=row, col=col
            )

    fig.update_layout(
        title="Model Performance Across Partitions - All Metrics",
        height=800,
        barmode='group'
    )

    # Constrain the y-axis of every populated subplot to [0, 1]
    for i in range(len(metrics)):
        fig.update_yaxes(range=[0, 1], row=1 if i < 4 else 2, col=(i % 4) + 1)

    return fig

def create_performance_summary_table(df):
    """Create a summary table with key statistics per model."""
    summary_stats = []

    for model in df['model'].unique():
        model_data = df[df['model'] == model]

        stats = {
            'Model': model,
            'Avg_Accuracy': model_data['Accuracy'].mean(),
            'Avg_Precision': model_data['Precision'].mean(),
            'Avg_Recall': model_data['Recall_Power'].mean(),
            'Avg_G_mean': model_data['G_mean'].mean(),
            'Best_Topic_Accuracy': model_data.loc[model_data['Accuracy'].idxmax(), 'topic'],
            'Best_Topic_Score': model_data['Accuracy'].max(),
            'Worst_Topic_Accuracy': model_data.loc[model_data['Accuracy'].idxmin(), 'topic'],
            'Worst_Topic_Score': model_data['Accuracy'].min(),
            'Performance_Variance': model_data['Accuracy'].var()
        }
        summary_stats.append(stats)

    summary_df = pd.DataFrame(summary_stats)
    summary_df = summary_df.round(4)
    summary_df = summary_df.sort_values('Avg_Accuracy', ascending=False)

    return summary_df

# Build the Gradio interface
with gr.Blocks(title="Multi-Model Classifier Dashboard", theme=gr.themes.Soft()) as demo:
    gr.HTML("<h1 style='text-align: center; color: #2E86AB;'>📊 Multi-Model Classifier Dashboard</h1>")

    # Data loading section
    with gr.Row():
        with gr.Column():
            csv_file = gr.File(
                label="📁 Upload CSV File",
                file_types=['.csv']
            )
            data_status = gr.Textbox(
                label="Data Status",
                value="Using default sample data with 2 models",
                interactive=False
            )

    # Holds the currently loaded dataset across events
    current_data = gr.State(value=default_data)

    with gr.Tabs():
        with gr.TabItem("🏆 Model Leaderboard"):
            with gr.Row():
                with gr.Column(scale=1):
                    partition_filter = gr.Dropdown(
                        choices=['all', 'inference', 'test', 'train'],
                        value='all',
                        label="Filter by Partition"
                    )
                    topic_filter = gr.Dropdown(
                        choices=['all', 'OVERALL'],
                        value='OVERALL',
                        label="Filter by Topic"
                    )

                with gr.Column(scale=3):
                    leaderboard_chart = gr.Plot()

        with gr.TabItem("📊 Topic Comparison"):
            with gr.Row():
                with gr.Column(scale=1):
                    models_selector = gr.CheckboxGroup(
                        choices=[],
                        label="Select Models to Compare",
                        value=[]
                    )
                    metric_selector = gr.Dropdown(
                        choices=['FPR', 'Confidence', 'FDR', 'Precision', 'Recall_Power', 'Accuracy', 'G_mean'],
                        value='Accuracy',
                        label="Select Metric"
                    )
                    partition_filter_topic = gr.Dropdown(
                        choices=['all', 'inference', 'test', 'train'],
                        value='all',
                        label="Filter by Partition"
                    )

                with gr.Column(scale=3):
                    topic_comparison_chart = gr.Plot()

        with gr.TabItem("📈 Partition Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    models_selector_partition = gr.CheckboxGroup(
                        choices=[],
                        label="Select Models to Analyze",
                        value=[]
                    )

                with gr.Column(scale=3):
                    partition_analysis_chart = gr.Plot()

        with gr.TabItem("📋 Performance Summary"):
            summary_table = gr.DataFrame(
                label="Model Performance Summary",
                interactive=False
            )

        with gr.TabItem("📄 Raw Data"):
            raw_data_table = gr.DataFrame(
                label="Complete Dataset",
                interactive=True
            )

    def update_dashboard(file):
        """Update all dashboard components when new data is loaded."""
        df, status = load_csv_data(file)

        # Refresh model and topic choices from the loaded data
        model_choices = sorted(df['model'].unique())
        topic_choices = ['all'] + sorted(df['topic'].unique())

        # Build the initial plots
        leaderboard = create_model_leaderboard(df)
        topic_comp = create_topic_comparison(df, model_choices[:3])
        partition_analysis = create_partition_analysis(df, model_choices[:3])
        summary = create_performance_summary_table(df)

        return (
            df, status,
            gr.update(choices=topic_choices, value='OVERALL'),
            gr.update(choices=model_choices, value=model_choices[:3]),
            gr.update(choices=model_choices, value=model_choices[:3]),
            leaderboard, topic_comp, partition_analysis, summary, df
        )

    # Event handlers
    csv_file.change(
        fn=update_dashboard,
        inputs=[csv_file],
        outputs=[
            current_data, data_status, topic_filter,
            models_selector, models_selector_partition,
            leaderboard_chart, topic_comparison_chart,
            partition_analysis_chart, summary_table, raw_data_table
        ]
    )

    # Redraw the leaderboard when either filter changes
    def update_leaderboard(data, partition, topic):
        return create_model_leaderboard(data, partition, topic)

    partition_filter.change(
        fn=update_leaderboard,
        inputs=[current_data, partition_filter, topic_filter],
        outputs=leaderboard_chart
    )

    topic_filter.change(
        fn=update_leaderboard,
        inputs=[current_data, partition_filter, topic_filter],
        outputs=leaderboard_chart
    )

    # Redraw the topic comparison when models, metric, or partition change
    def update_topic_comparison(data, selected_models, metric, partition):
        return create_topic_comparison(data, selected_models, metric, partition)

    models_selector.change(
        fn=update_topic_comparison,
        inputs=[current_data, models_selector, metric_selector, partition_filter_topic],
        outputs=topic_comparison_chart
    )

    metric_selector.change(
        fn=update_topic_comparison,
        inputs=[current_data, models_selector, metric_selector, partition_filter_topic],
        outputs=topic_comparison_chart
    )

    partition_filter_topic.change(
        fn=update_topic_comparison,
        inputs=[current_data, models_selector, metric_selector, partition_filter_topic],
        outputs=topic_comparison_chart
    )

    # Redraw the partition analysis when the model selection changes
    def update_partition_analysis(data, selected_models):
        return create_partition_analysis(data, selected_models)

    models_selector_partition.change(
        fn=update_partition_analysis,
        inputs=[current_data, models_selector_partition],
        outputs=partition_analysis_chart
    )

    # Initialize the dashboard with the default data
    demo.load(
        fn=lambda: update_dashboard(None),
        outputs=[
            current_data, data_status, topic_filter,
            models_selector, models_selector_partition,
            leaderboard_chart, topic_comparison_chart,
            partition_analysis_chart, summary_table, raw_data_table
        ]
    )

    gr.Markdown("""
    ### 💡 Dashboard Features

    **📁 Data Loading**: Upload a CSV file with classifier results. The app automatically detects all models and builds the comparisons.

    **🏆 Model Leaderboard**:
    - Compare all models side by side across key metrics
    - Filter by partition and topic for specific comparisons
    - Overall score is the mean of precision, recall, and accuracy

    **📊 Topic Comparison**:
    - Select specific models to compare across all topics
    - Choose any metric (FPR, Confidence, FDR, Precision, Recall_Power, Accuracy, G_mean)
    - Filter by partition to focus on specific evaluation splits
    - Visual comparison across business categories

    **📈 Partition Analysis**:
    - Analyze all metrics across train/test/inference partitions
    - Compare multiple models across different evaluation splits
    - Monitor generalization and detect overfitting
    - Comprehensive view of all 7 performance metrics

    **📋 Performance Summary**:
    - Statistical overview of each model's performance
    - Best- and worst-performing topics for each model
    - Performance variance analysis

    **CSV Format**: The file must have the columns `model`, `partition`, `topic`, `FPR`, `Confidence`, `FDR`, `Precision`, `Recall_Power`, `Accuracy`, `G_mean`.
    """)

if __name__ == "__main__":
    demo.launch()
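
A quick way to sanity-check the functions above outside the Space is sketched below. It assumes the file is saved as leaderboard.py and that gradio is installed locally; the sample_results.csv name, the two illustrative rows, and the SimpleNamespace stand-in for Gradio's uploaded-file object are assumptions for the example, not part of the app.

import pandas as pd
from types import SimpleNamespace

from leaderboard import load_csv_data, create_model_leaderboard

# Two illustrative rows covering every column the loader requires
sample = pd.DataFrame({
    'model': ['demo_model'] * 2,
    'partition': ['train', 'test'],
    'topic': ['OVERALL', 'OVERALL'],
    'FPR': [0.70, 0.72], 'Confidence': [0.30, 0.28], 'FDR': [0.40, 0.42],
    'Precision': [0.60, 0.58], 'Recall_Power': [0.75, 0.73],
    'Accuracy': [0.55, 0.54], 'G_mean': [0.45, 0.44],
})
sample.to_csv('sample_results.csv', index=False)

# load_csv_data only touches the .name attribute of the uploaded-file
# object, so a SimpleNamespace is enough to drive it without Gradio
df, status = load_csv_data(SimpleNamespace(name='sample_results.csv'))
print(status)  # expect the "Successfully loaded 2 records ..." message

fig = create_model_leaderboard(df)          # plotly Figure, not yet rendered
fig.write_html('leaderboard_preview.html')  # inspect without launching the app
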
requirements.txt
ADDED
@@ -0,0 +1,3 @@
numpy==1.24.4
plotly==6.0.1
pandas==1.5.3
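
Note that requirements.txt does not pin gradio itself; on a Hugging Face Space the Gradio SDK provides it, so a local run would presumably also need gradio installed (e.g. pip install gradio) before running python leaderboard.py.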