Spaces:

akera
/

leaderboard

Sleeping

App Files Files Community

akera commited on Jun 12, 2025

Commit

cfbcff1

verified ·

1 Parent(s): 944a871

Create plotting.py

Browse files

Files changed (1) hide show

src/plotting.py +529 -0

src/plotting.py ADDED Viewed

	@@ -0,0 +1,529 @@

+# src/plotting.py
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import matplotlib.colors as mcolors
+from colorsys import rgb_to_hls, hls_to_rgb
+import plotly.graph_objects as go
+import plotly.express as px
+from plotly.subplots import make_subplots
+import pandas as pd
+import numpy as np
+from collections import defaultdict
+from typing import Dict, List, Optional, Union
+from config import LANGUAGE_NAMES, ALL_UG40_LANGUAGES, GOOGLE_SUPPORTED_LANGUAGES, METRICS_CONFIG
+plt.style.use('default')
+plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['axes.facecolor'] = 'white'
+def create_leaderboard_ranking_plot(df: pd.DataFrame, metric: str = 'quality_score', top_n: int = 15) -> go.Figure:
+    """Create interactive leaderboard ranking plot using Plotly."""
+    if df.empty:
+        fig = go.Figure()
+        fig.add_annotation(
+            text="No data available",
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
+            font=dict(size=16)
+        )
+        return fig
+    # Get top N models
+    top_models = df.head(top_n)
+    # Create color scale based on scores
+    colors = px.colors.qualitative.Set3[:len(top_models)]
+    # Create horizontal bar chart
+    fig = go.Figure(data=[
+        go.Bar(
+            y=top_models['model_name'],
+            x=top_models[metric],
+            orientation='h',
+            marker=dict(
+                color=top_models[metric],
+                colorscale='Viridis',
+                showscale=True,
+                colorbar=dict(title=metric.replace('_', ' ').title())
+            ),
+            text=[f"{score:.3f}" for score in top_models[metric]],
+            textposition='auto',
+            hovertemplate=(
+                "<b>%{y}</b><br>" +
+                f"{metric.replace('_', ' ').title()}: %{{x:.4f}}<br>" +
+                "Author: %{customdata[0]}<br>" +
+                "Coverage: %{customdata[1]:.1%}<br>" +
+                "<extra></extra>"
+            ),
+            customdata=list(zip(top_models['author'], top_models['coverage_rate']))
+        )
+    ])
+    fig.update_layout(
+        title=f"🏆 SALT Translation Leaderboard - {metric.replace('_', ' ').title()}",
+        xaxis_title=f"{metric.replace('_', ' ').title()} Score",
+        yaxis_title="Models",
+        height=max(400, len(top_models) * 30 + 100),
+        margin=dict(l=20, r=20, t=60, b=20),
+        plot_bgcolor='white',
+        paper_bgcolor='white'
+    )
+    # Reverse y-axis to show best model at top
+    fig.update_yaxes(autorange="reversed")
+    return fig
+def create_metrics_comparison_plot(df: pd.DataFrame, models: List[str] = None, max_models: int = 8) -> go.Figure:
+    """Create radar chart comparing multiple metrics across models."""
+    if df.empty:
+        return go.Figure().add_annotation(text="No data available", x=0.5, y=0.5)
+    # Select models to compare
+    if models is None:
+        selected_models = df.head(max_models)
+    else:
+        selected_models = df[df['model_name'].isin(models)].head(max_models)
+    if len(selected_models) == 0:
+        return go.Figure().add_annotation(text="No models found", x=0.5, y=0.5)
+    # Metrics to include in radar chart
+    metrics = ['quality_score', 'bleu', 'chrf', 'rouge1', 'rougeL']
+    metric_labels = ['Quality Score', 'BLEU (/100)', 'ChrF', 'ROUGE-1', 'ROUGE-L']
+    fig = go.Figure()
+    colors = px.colors.qualitative.Set1[:len(selected_models)]
+    for i, (_, model) in enumerate(selected_models.iterrows()):
+        # Normalize BLEU to 0-1 scale for radar chart
+        values = []
+        for metric in metrics:
+            value = model[metric]
+            if metric == 'bleu':
+                value = value / 100.0  # Normalize BLEU
+            values.append(value)
+        # Close the radar chart
+        values += values[:1]
+        metric_labels_closed = metric_labels + [metric_labels[0]]
+        fig.add_trace(go.Scatterpolar(
+            r=values,
+            theta=metric_labels_closed,
+            fill='toself',
+            name=model['model_name'],
+            line_color=colors[i % len(colors)],
+            fillcolor=colors[i % len(colors)],
+            opacity=0.6
+        ))
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                range=[0, 1]
+            )
+        ),
+        showlegend=True,
+        title="📊 Multi-Metric Model Comparison",
+        height=600
+    )
+    return fig
+def create_language_pair_heatmap(results_dict: Dict, metric: str = 'quality_score') -> go.Figure:
+    """Create heatmap showing performance across language pairs."""
+    if not results_dict or 'pair_metrics' not in results_dict:
+        return go.Figure().add_annotation(text="No language pair data available", x=0.5, y=0.5)
+    pair_metrics = results_dict['pair_metrics']
+    # Create matrix for heatmap
+    languages = ALL_UG40_LANGUAGES
+    matrix = np.zeros((len(languages), len(languages)))
+    for i, src_lang in enumerate(languages):
+        for j, tgt_lang in enumerate(languages):
+            if src_lang != tgt_lang:
+                pair_key = f"{src_lang}_to_{tgt_lang}"
+                if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
+                    matrix[i, j] = pair_metrics[pair_key][metric]
+                else:
+                    matrix[i, j] = np.nan
+            else:
+                matrix[i, j] = np.nan
+    # Create language labels
+    lang_labels = [LANGUAGE_NAMES.get(lang, lang) for lang in languages]
+    fig = go.Figure(data=go.Heatmap(
+        z=matrix,
+        x=lang_labels,
+        y=lang_labels,
+        colorscale='Viridis',
+        showscale=True,
+        colorbar=dict(title=metric.replace('_', ' ').title()),
+        hoverinfotemplate=(
+            "Source: %{y}<br>" +
+            "Target: %{x}<br>" +
+            f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>" +
+            "<extra></extra>"
+        )
+    ))
+    fig.update_layout(
+        title=f"🗺️ Language Pair Performance - {metric.replace('_', ' ').title()}",
+        xaxis_title="Target Language",
+        yaxis_title="Source Language",
+        height=600,
+        width=700
+    )
+    return fig
+def create_coverage_analysis_plot(df: pd.DataFrame) -> go.Figure:
+    """Create plot analyzing test set coverage across submissions."""
+    if df.empty:
+        return go.Figure().add_annotation(text="No data available", x=0.5, y=0.5)
+    fig = make_subplots(
+        rows=2, cols=2,
+        subplot_titles=(
+            "Coverage Distribution",
+            "Language Pairs Covered",
+            "Sample Count vs Quality",
+            "Google Comparable Coverage"
+        ),
+        specs=[[{"type": "bar"}, {"type": "scatter"}],
+               [{"type": "scatter"}, {"type": "bar"}]]
+    )
+    # Coverage distribution
+    coverage_bins = pd.cut(df['coverage_rate'],
+                          bins=[0, 0.5, 0.8, 0.9, 0.95, 1.0],
+                          labels=['<50%', '50-80%', '80-90%', '90-95%', '95-100%'])
+    coverage_counts = coverage_bins.value_counts()
+    fig.add_trace(
+        go.Bar(x=coverage_counts.index, y=coverage_counts.values, name="Coverage"),
+        row=1, col=1
+    )
+    # Language pairs covered vs quality
+    fig.add_trace(
+        go.Scatter(
+            x=df['language_pairs_covered'],
+            y=df['quality_score'],
+            mode='markers',
+            text=df['model_name'],
+            name="Quality vs Coverage"
+        ),
+        row=1, col=2
+    )
+    # Sample count vs quality
+    fig.add_trace(
+        go.Scatter(
+            x=df['total_samples'],
+            y=df['quality_score'],
+            mode='markers',
+            text=df['model_name'],
+            name="Quality vs Samples"
+        ),
+        row=2, col=1
+    )
+    # Google comparable coverage
+    google_coverage = df['google_pairs_covered'].value_counts().sort_index()
+    fig.add_trace(
+        go.Bar(x=google_coverage.index, y=google_coverage.values, name="Google Coverage"),
+        row=2, col=2
+    )
+    fig.update_layout(
+        title="📈 Test Set Coverage Analysis",
+        height=800,
+        showlegend=False
+    )
+    return fig
+def create_model_performance_timeline(df: pd.DataFrame) -> go.Figure:
+    """Create timeline showing model performance over time."""
+    if df.empty:
+        return go.Figure().add_annotation(text="No data available", x=0.5, y=0.5)
+    # Convert submission_date to datetime
+    df_copy = df.copy()
+    df_copy['submission_date'] = pd.to_datetime(df_copy['submission_date'])
+    df_copy = df_copy.sort_values('submission_date')
+    fig = go.Figure()
+    # Add scatter plot for each submission
+    fig.add_trace(go.Scatter(
+        x=df_copy['submission_date'],
+        y=df_copy['quality_score'],
+        mode='markers+lines',
+        marker=dict(
+            size=10,
+            color=df_copy['quality_score'],
+            colorscale='Viridis',
+            showscale=True,
+            colorbar=dict(title="Quality Score")
+        ),
+        text=df_copy['model_name'],
+        hovertemplate=(
+            "<b>%{text}</b><br>" +
+            "Date: %{x}<br>" +
+            "Quality Score: %{y:.4f}<br>" +
+            "<extra></extra>"
+        ),
+        name="Models"
+    ))
+    # Add trend line
+    if len(df_copy) > 1:
+        z = np.polyfit(range(len(df_copy)), df_copy['quality_score'], 1)
+        trend_line = np.poly1d(z)(range(len(df_copy)))
+        fig.add_trace(go.Scatter(
+            x=df_copy['submission_date'],
+            y=trend_line,
+            mode='lines',
+            line=dict(dash='dash', color='red'),
+            name="Trend",
+            hoverinfo='skip'
+        ))
+    fig.update_layout(
+        title="📅 Model Performance Timeline",
+        xaxis_title="Submission Date",
+        yaxis_title="Quality Score",
+        height=500
+    )
+    return fig
+def create_google_comparison_plot(df: pd.DataFrame) -> go.Figure:
+    """Create plot comparing models on Google Translate-comparable language pairs."""
+    # Filter models that have Google comparable results
+    google_models = df[df['google_pairs_covered'] > 0].copy()
+    if google_models.empty:
+        return go.Figure().add_annotation(
+            text="No models with Google Translate comparable results",
+            x=0.5, y=0.5
+        )
+    fig = go.Figure()
+    # Create scatter plot
+    fig.add_trace(go.Scatter(
+        x=google_models['google_bleu'],
+        y=google_models['google_quality_score'],
+        mode='markers+text',
+        marker=dict(
+            size=12,
+            color=google_models['google_chrf'],
+            colorscale='Plasma',
+            showscale=True,
+            colorbar=dict(title="ChrF Score")
+        ),
+        text=google_models['model_name'],
+        textposition="top center",
+        hovertemplate=(
+            "<b>%{text}</b><br>" +
+            "BLEU: %{x:.2f}<br>" +
+            "Quality: %{y:.4f}<br>" +
+            "ChrF: %{marker.color:.4f}<br>" +
+            "<extra></extra>"
+        ),
+        name="Models"
+    ))
+    fig.update_layout(
+        title="🤖 Google Translate Comparable Performance",
+        xaxis_title="BLEU Score",
+        yaxis_title="Quality Score",
+        height=500
+    )
+    return fig
+def create_detailed_model_analysis(model_results: Dict, model_name: str) -> go.Figure:
+    """Create detailed analysis plot for a specific model."""
+    if not model_results or 'pair_metrics' not in model_results:
+        return go.Figure().add_annotation(text="No detailed results available", x=0.5, y=0.5)
+    pair_metrics = model_results['pair_metrics']
+    # Extract language pair data
+    pairs = []
+    bleu_scores = []
+    quality_scores = []
+    sample_counts = []
+    google_comparable = []
+    for pair_key, metrics in pair_metrics.items():
+        if 'sample_count' in metrics and metrics['sample_count'] > 0:
+            src, tgt = pair_key.split('_to_')
+            pair_label = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"
+            pairs.append(pair_label)
+            bleu_scores.append(metrics.get('bleu', 0))
+            quality_scores.append(metrics.get('quality_score', 0))
+            sample_counts.append(metrics.get('sample_count', 0))
+            is_google = (src in GOOGLE_SUPPORTED_LANGUAGES and tgt in GOOGLE_SUPPORTED_LANGUAGES)
+            google_comparable.append(is_google)
+    if not pairs:
+        return go.Figure().add_annotation(text="No language pair data found", x=0.5, y=0.5)
+    # Create subplot
+    fig = make_subplots(
+        rows=2, cols=1,
+        subplot_titles=(
+            f"{model_name} - BLEU Scores by Language Pair",
+            f"{model_name} - Quality Scores by Language Pair"
+        ),
+        vertical_spacing=0.1
+    )
+    # Color code by Google comparable
+    colors = ['#1f77b4' if gc else '#ff7f0e' for gc in google_comparable]
+    # BLEU scores
+    fig.add_trace(
+        go.Bar(
+            x=pairs,
+            y=bleu_scores,
+            marker_color=colors,
+            name="BLEU",
+            text=[f"{score:.1f}" for score in bleu_scores],
+            textposition='auto'
+        ),
+        row=1, col=1
+    )
+    # Quality scores
+    fig.add_trace(
+        go.Bar(
+            x=pairs,
+            y=quality_scores,
+            marker_color=colors,
+            name="Quality",
+            text=[f"{score:.3f}" for score in quality_scores],
+            textposition='auto',
+            showlegend=False
+        ),
+        row=2, col=1
+    )
+    fig.update_layout(
+        height=800,
+        title=f"📊 Detailed Analysis: {model_name}",
+        showlegend=True
+    )
+    # Rotate x-axis labels
+    fig.update_xaxes(tickangle=45)
+    # Add legend for colors
+    fig.add_trace(
+        go.Scatter(
+            x=[None], y=[None],
+            mode='markers',
+            marker=dict(size=10, color='#1f77b4'),
+            name="Google Comparable",
+            showlegend=True
+        )
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=[None], y=[None],
+            mode='markers',
+            marker=dict(size=10, color='#ff7f0e'),
+            name="UG40 Only",
+            showlegend=True
+        )
+    )
+    return fig
+def create_submission_summary_plot(validation_info: Dict, evaluation_results: Dict) -> go.Figure:
+    """Create summary plot for a new submission."""
+    fig = make_subplots(
+        rows=2, cols=2,
+        subplot_titles=(
+            "Coverage by Language Pair",
+            "Primary Metrics",
+            "Error Analysis",
+            "Sample Distribution"
+        ),
+        specs=[[{"type": "bar"}, {"type": "bar"}],
+               [{"type": "bar"}, {"type": "pie"}]]
+    )
+    # Coverage by language pair
+    if 'pair_coverage' in validation_info:
+        pair_data = validation_info['pair_coverage']
+        pairs = list(pair_data.keys())[:10]  # Top 10 pairs
+        coverage_rates = [pair_data[p]['coverage_rate'] for p in pairs]
+        fig.add_trace(
+            go.Bar(x=pairs, y=coverage_rates, name="Coverage"),
+            row=1, col=1
+        )
+    # Primary metrics
+    if 'summary' in evaluation_results:
+        metrics_data = evaluation_results['summary']['primary_metrics']
+        metric_names = list(metrics_data.keys())
+        metric_values = list(metrics_data.values())
+        fig.add_trace(
+            go.Bar(x=metric_names, y=metric_values, name="Metrics"),
+            row=1, col=2
+        )
+    # Error analysis (CER, WER)
+    if 'averages' in evaluation_results:
+        error_metrics = ['cer', 'wer']
+        error_values = [evaluation_results['averages'].get(m, 0) for m in error_metrics]
+        fig.add_trace(
+            go.Bar(x=error_metrics, y=error_values, name="Errors"),
+            row=2, col=1
+        )
+    # Sample distribution (placeholder)
+    fig.add_trace(
+        go.Pie(
+            labels=["Evaluated", "Missing"],
+            values=[validation_info.get('coverage', 0.8) * 100,
+                   (1 - validation_info.get('coverage', 0.8)) * 100],
+            name="Samples"
+        ),
+        row=2, col=2
+    )
+    fig.update_layout(
+        title="📋 Submission Summary",
+        height=700,
+        showlegend=False
+    )
+    return fig