Spaces:

danielrosehill
/

Single-Podcast-ASR-Eval

Sleeping

File size: 9,547 Bytes

import json
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR / "data" / "results"


def _load_json_file(filename: str) -> dict:
    """Load a JSON file from the local results directory."""
    path = RESULTS_DIR / filename
    if not path.exists():
        raise FileNotFoundError(f"Required data file not found: {path}")
    with path.open("r", encoding="utf-8") as file:
        return json.load(file)


# Load data from local JSON files
def load_benchmark_data():
    return _load_json_file("benchmark_results.json")


def load_punctuation_data():
    return _load_json_file("punctuation_results.json")

# Create leaderboard dataframe from benchmark results
def create_leaderboard_df(benchmark_data):
    rows = []
    for result in benchmark_data["results"]:
        row = {
            "Provider": result["provider"],
            "Model": result["model"],
            "Type": result["run_type"],
            "WER (%)": result["metrics"]["wer"],
            "CER (%)": result["metrics"]["cer"],
            "Word Accuracy (%)": result["metrics"]["word_accuracy"],
            "Insertions": result["metrics"]["insertions"],
            "Deletions": result["metrics"]["deletions"],
            "Substitutions": result["metrics"]["substitutions"],
            "Hits": result["metrics"]["hits"]
        }
        rows.append(row)

    df = pd.DataFrame(rows)
    # Remove duplicate model/provider entries so each model shows once in the charts
    df = df.drop_duplicates(subset=["Provider", "Model", "Type"], keep="first")
    # Sort by WER (lower is better)
    df = df.sort_values("WER (%)", ascending=True)
    return df

# Create punctuation dataframe
def create_punctuation_df(punct_data):
    rows = []
    for result in punct_data["results"]:
        row = {
            "Provider": result["provider"],
            "Model": result["model"],
            "Overall Score (%)": result["metrics"]["overall_punctuation_score"],
            "Context Match (%)": result["metrics"]["context_match_accuracy"],
            "Total Punctuation": result["metrics"]["total_punctuation"]["hypothesis"],
            "Reference": result["metrics"]["total_punctuation"]["reference"],
            "Difference": result["metrics"]["total_punctuation"]["difference"]
        }
        rows.append(row)

    df = pd.DataFrame(rows)
    df = df.drop_duplicates(subset=["Provider", "Model"], keep="first")
    # Sort by overall score (higher is better)
    df = df.sort_values("Overall Score (%)", ascending=False)
    return df

# Create WER comparison chart
def create_wer_chart(df):
    fig = go.Figure()

    colors = ['#2E86AB' if t == 'local-stt' else '#A23B72' for t in df['Type']]

    fig.add_trace(go.Bar(
        x=df['Model'],
        y=df['WER (%)'],
        text=df['WER (%)'].round(2),
        textposition='outside',
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>WER: %{y:.2f}%<extra></extra>'
    ))

    fig.update_layout(
        title="Word Error Rate (WER) Comparison - Lower is Better",
        xaxis_title="Model",
        yaxis_title="WER (%)",
        height=500,
        showlegend=False,
        template="plotly_white"
    )

    return fig

# Create accuracy comparison chart
def create_accuracy_chart(df):
    fig = go.Figure()

    fig.add_trace(go.Bar(
        name='Word Accuracy',
        x=df['Model'],
        y=df['Word Accuracy (%)'],
        text=df['Word Accuracy (%)'].round(2),
        textposition='outside',
        marker_color='#06A77D'
    ))

    fig.update_layout(
        title="Word Accuracy Comparison - Higher is Better",
        xaxis_title="Model",
        yaxis_title="Accuracy (%)",
        height=500,
        template="plotly_white"
    )

    return fig

# Create error breakdown chart
def create_error_breakdown_chart(df):
    fig = go.Figure()

    fig.add_trace(go.Bar(name='Insertions', x=df['Model'], y=df['Insertions'], marker_color='#F18F01'))
    fig.add_trace(go.Bar(name='Deletions', x=df['Model'], y=df['Deletions'], marker_color='#C73E1D'))
    fig.add_trace(go.Bar(name='Substitutions', x=df['Model'], y=df['Substitutions'], marker_color='#6A4C93'))

    fig.update_layout(
        title="Error Type Breakdown by Model",
        xaxis_title="Model",
        yaxis_title="Count",
        barmode='group',
        height=500,
        template="plotly_white"
    )

    return fig

# Create punctuation score chart
def create_punctuation_chart(punct_df):
    fig = go.Figure()

    fig.add_trace(go.Bar(
        name='Overall Punctuation Score',
        x=punct_df['Model'],
        y=punct_df['Overall Score (%)'],
        text=punct_df['Overall Score (%)'].round(2),
        textposition='outside',
        marker_color='#5F0F40'
    ))

    fig.add_trace(go.Bar(
        name='Context Match Accuracy',
        x=punct_df['Model'],
        y=punct_df['Context Match (%)'],
        text=punct_df['Context Match (%)'].round(2),
        textposition='outside',
        marker_color='#9A031E'
    ))

    fig.update_layout(
        title="Punctuation Performance Comparison",
        xaxis_title="Model",
        yaxis_title="Score (%)",
        barmode='group',
        height=500,
        template="plotly_white"
    )

    return fig

# Load data
benchmark_data = load_benchmark_data()
punct_data = load_punctuation_data()
leaderboard_df = create_leaderboard_df(benchmark_data)
punct_df = create_punctuation_df(punct_data)

# Create Gradio interface
with gr.Blocks(title="Podcast ASR Evaluation Results", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ Podcast ASR Evaluation Results

    Comprehensive evaluation of Automatic Speech Recognition (ASR) systems on podcast audio.
    This benchmark compares both local and cloud-based ASR models across multiple metrics.

    **Evaluated on:** Single podcast episode ground truth

    **Models Tested:** 8 ASR systems (3 local Whisper models, 5 cloud services)
    """)

    with gr.Tabs():
        with gr.Tab("📊 Overall Leaderboard"):
            gr.Markdown("### Transcription Accuracy Leaderboard")
            gr.Markdown("Sorted by Word Error Rate (WER) - lower is better")
            gr.Dataframe(
                value=leaderboard_df,
                interactive=False,
                wrap=True
            )

        with gr.Tab("📈 Accuracy Metrics"):
            gr.Markdown("### Word Error Rate (WER) Comparison")
            gr.Plot(create_wer_chart(leaderboard_df))

            gr.Markdown("### Word Accuracy Comparison")
            gr.Plot(create_accuracy_chart(leaderboard_df))

        with gr.Tab("🔍 Error Analysis"):
            gr.Markdown("### Error Type Breakdown")
            gr.Markdown("""
            - **Insertions**: Words added that weren't in the original
            - **Deletions**: Words missed from the original
            - **Substitutions**: Words incorrectly transcribed
            """)
            gr.Plot(create_error_breakdown_chart(leaderboard_df))

            gr.Markdown("### Detailed Error Metrics")
            error_df = leaderboard_df[['Provider', 'Model', 'Insertions', 'Deletions', 'Substitutions', 'Hits']]
            gr.Dataframe(value=error_df, interactive=False)

        with gr.Tab("✏️ Punctuation Analysis"):
            gr.Markdown("### Punctuation Performance")
            gr.Markdown("""
            Evaluates how well each model handles punctuation marks.
            - **Overall Score**: Combined punctuation accuracy metric
            - **Context Match**: Punctuation placed in correct context
            """)
            gr.Plot(create_punctuation_chart(punct_df))

            gr.Markdown("### Detailed Punctuation Metrics")
            gr.Dataframe(value=punct_df, interactive=False)

        with gr.Tab("📖 About"):
            gr.Markdown("""
            ## About This Evaluation

            This benchmark evaluates ASR systems on long-form podcast audio, measuring:

            ### Accuracy Metrics
            - **WER (Word Error Rate)**: Primary accuracy metric - percentage of word errors
            - **CER (Character Error Rate)**: Character-level accuracy
            - **Word Accuracy**: Percentage of correctly transcribed words

            ### Error Types
            - **Insertions**: Extra words added
            - **Deletions**: Missing words
            - **Substitutions**: Incorrect word substitutions

            ### Punctuation Metrics
            - Overall punctuation accuracy
            - Context-based punctuation matching
            - Per-mark accuracy (periods, commas, quotes, etc.)

            ### Models Tested

            **Local Models (via Buzz):**
            - Whisper Base
            - Whisper Tiny

            **Cloud Services:**
            - Gladia (Solaria-1)
            - Deepgram (Nova-3)
            - AssemblyAI (Best)
            - Speechmatics (SLAM-1 Global English)
            - OpenAI (Whisper-1)

            ### Dataset

            Full evaluation data available at: [danielrosehill/Podcast-ASR-Evaluation](https://huggingface.co/datasets/danielrosehill/Podcast-ASR-Evaluation)

            ### Key Findings

            1. **Best Overall Accuracy**: Local Whisper Base achieved lowest WER (17.52%)
            2. **Best Punctuation**: Deepgram Nova-3 scored highest (51.17%)
            3. **Local vs Cloud**: Local models competitive on accuracy, cloud better on punctuation
            """)

if __name__ == "__main__":
    demo.launch()