import json
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR / "data" / "results"


def _load_json_file(filename: str) -> dict:
    """Load a JSON file from the local results directory."""
    path = RESULTS_DIR / filename
    if not path.exists():
        raise FileNotFoundError(f"Required data file not found: {path}")
    with path.open("r", encoding="utf-8") as file:
        return json.load(file)


# Load data from local JSON files
def load_benchmark_data():
    return _load_json_file("benchmark_results.json")


def load_punctuation_data():
    return _load_json_file("punctuation_results.json")


# Create leaderboard dataframe from benchmark results
def create_leaderboard_df(benchmark_data):
    rows = []
    for result in benchmark_data["results"]:
        row = {
            "Provider": result["provider"],
            "Model": result["model"],
            "Type": result["run_type"],
            "WER (%)": result["metrics"]["wer"],
            "CER (%)": result["metrics"]["cer"],
            "Word Accuracy (%)": result["metrics"]["word_accuracy"],
            "Insertions": result["metrics"]["insertions"],
            "Deletions": result["metrics"]["deletions"],
            "Substitutions": result["metrics"]["substitutions"],
            "Hits": result["metrics"]["hits"]
        }
        rows.append(row)

    df = pd.DataFrame(rows)
    # Remove duplicate model/provider entries so each model shows once in the charts
    df = df.drop_duplicates(subset=["Provider", "Model", "Type"], keep="first")
    # Sort by WER (lower is better)
    df = df.sort_values("WER (%)", ascending=True)
    return df


# Create punctuation dataframe
def create_punctuation_df(punct_data):
    rows = []
    for result in punct_data["results"]:
        row = {
            "Provider": result["provider"],
            "Model": result["model"],
            "Overall Score (%)": result["metrics"]["overall_punctuation_score"],
            "Context Match (%)": result["metrics"]["context_match_accuracy"],
            "Total Punctuation": result["metrics"]["total_punctuation"]["hypothesis"],
            "Reference": result["metrics"]["total_punctuation"]["reference"],
            "Difference": result["metrics"]["total_punctuation"]["difference"]
        }
        rows.append(row)

    df = pd.DataFrame(rows)
    df = df.drop_duplicates(subset=["Provider", "Model"], keep="first")
    # Sort by overall score (higher is better)
    df = df.sort_values("Overall Score (%)", ascending=False)
    return df


# Create WER comparison chart
def create_wer_chart(df):
    fig = go.Figure()
    colors = ['#2E86AB' if t == 'local-stt' else '#A23B72' for t in df['Type']]
    fig.add_trace(go.Bar(
        x=df['Model'],
        y=df['WER (%)'],
        text=df['WER (%)'].round(2),
        textposition='outside',
        marker_color=colors,
        hovertemplate='%{x}<br>WER: %{y:.2f}%'
    ))
    fig.update_layout(
        title="Word Error Rate (WER) Comparison - Lower is Better",
        xaxis_title="Model",
        yaxis_title="WER (%)",
        height=500,
        showlegend=False,
        template="plotly_white"
    )
    return fig


# Create accuracy comparison chart
def create_accuracy_chart(df):
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name='Word Accuracy',
        x=df['Model'],
        y=df['Word Accuracy (%)'],
        text=df['Word Accuracy (%)'].round(2),
        textposition='outside',
        marker_color='#06A77D'
    ))
    fig.update_layout(
        title="Word Accuracy Comparison - Higher is Better",
        xaxis_title="Model",
        yaxis_title="Accuracy (%)",
        height=500,
        template="plotly_white"
    )
    return fig


# Create error breakdown chart
def create_error_breakdown_chart(df):
    fig = go.Figure()
    fig.add_trace(go.Bar(name='Insertions', x=df['Model'], y=df['Insertions'], marker_color='#F18F01'))
    fig.add_trace(go.Bar(name='Deletions', x=df['Model'], y=df['Deletions'], marker_color='#C73E1D'))
    fig.add_trace(go.Bar(name='Substitutions', x=df['Model'], y=df['Substitutions'], marker_color='#6A4C93'))
    fig.update_layout(
        title="Error Type Breakdown by Model",
        xaxis_title="Model",
        yaxis_title="Count",
        barmode='group',
        height=500,
        template="plotly_white"
    )
    return fig


# Create punctuation score chart
def create_punctuation_chart(punct_df):
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name='Overall Punctuation Score',
        x=punct_df['Model'],
        y=punct_df['Overall Score (%)'],
        text=punct_df['Overall Score (%)'].round(2),
        textposition='outside',
        marker_color='#5F0F40'
    ))
    fig.add_trace(go.Bar(
        name='Context Match Accuracy',
        x=punct_df['Model'],
        y=punct_df['Context Match (%)'],
        text=punct_df['Context Match (%)'].round(2),
        textposition='outside',
        marker_color='#9A031E'
    ))
    fig.update_layout(
        title="Punctuation Performance Comparison",
        xaxis_title="Model",
        yaxis_title="Score (%)",
        barmode='group',
        height=500,
        template="plotly_white"
    )
    return fig


# Load data
benchmark_data = load_benchmark_data()
punct_data = load_punctuation_data()
leaderboard_df = create_leaderboard_df(benchmark_data)
punct_df = create_punctuation_df(punct_data)

# Create Gradio interface
with gr.Blocks(title="Podcast ASR Evaluation Results", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ Podcast ASR Evaluation Results

    Comprehensive evaluation of Automatic Speech Recognition (ASR) systems on podcast audio.
    This benchmark compares both local and cloud-based ASR models across multiple metrics.

    **Evaluated on:** Single podcast episode with ground-truth transcript
    **Models Tested:** 8 ASR systems (3 local Whisper models, 5 cloud services)
    """)

    with gr.Tabs():
        with gr.Tab("📊 Overall Leaderboard"):
            gr.Markdown("### Transcription Accuracy Leaderboard")
            gr.Markdown("Sorted by Word Error Rate (WER) - lower is better")
            gr.Dataframe(
                value=leaderboard_df,
                interactive=False,
                wrap=True
            )

        with gr.Tab("📈 Accuracy Metrics"):
            gr.Markdown("### Word Error Rate (WER) Comparison")
            gr.Plot(create_wer_chart(leaderboard_df))

            gr.Markdown("### Word Accuracy Comparison")
            gr.Plot(create_accuracy_chart(leaderboard_df))

        with gr.Tab("🔍 Error Analysis"):
            gr.Markdown("### Error Type Breakdown")
            gr.Markdown("""
            - **Insertions**: Words added that weren't in the original
            - **Deletions**: Words missed from the original
            - **Substitutions**: Words incorrectly transcribed
            """)
            gr.Plot(create_error_breakdown_chart(leaderboard_df))

            gr.Markdown("### Detailed Error Metrics")
            error_df = leaderboard_df[['Provider', 'Model', 'Insertions', 'Deletions', 'Substitutions', 'Hits']]
            gr.Dataframe(value=error_df, interactive=False)

        with gr.Tab("✏️ Punctuation Analysis"):
            gr.Markdown("### Punctuation Performance")
            gr.Markdown("""
            Evaluates how well each model handles punctuation marks.

            - **Overall Score**: Combined punctuation accuracy metric
            - **Context Match**: Punctuation placed in correct context
            """)
            gr.Plot(create_punctuation_chart(punct_df))

            gr.Markdown("### Detailed Punctuation Metrics")
            gr.Dataframe(value=punct_df, interactive=False)

        with gr.Tab("📖 About"):
            gr.Markdown("""
            ## About This Evaluation

            This benchmark evaluates ASR systems on long-form podcast audio, measuring:

            ### Accuracy Metrics
            - **WER (Word Error Rate)**: Primary accuracy metric - percentage of word errors
            - **CER (Character Error Rate)**: Character-level accuracy
            - **Word Accuracy**: Percentage of correctly transcribed words

            ### Error Types
            - **Insertions**: Extra words added
            - **Deletions**: Missing words
            - **Substitutions**: Words replaced with incorrect words

            ### Punctuation Metrics
            - Overall punctuation accuracy
            - Context-based punctuation matching
            - Per-mark accuracy (periods, commas, quotes, etc.)

            ### Models Tested

            **Local Models (via Buzz):**
            - Whisper Base
            - Whisper Tiny

            **Cloud Services:**
            - Gladia (Solaria-1)
            - Deepgram (Nova-3)
            - AssemblyAI (Best)
            - Speechmatics (SLAM-1 Global English)
            - OpenAI (Whisper-1)

            ### Dataset
            Full evaluation data available at: [danielrosehill/Podcast-ASR-Evaluation](https://huggingface.co/datasets/danielrosehill/Podcast-ASR-Evaluation)

            ### Key Findings
            1. **Best Overall Accuracy**: Local Whisper Base achieved the lowest WER (17.52%)
            2. **Best Punctuation**: Deepgram Nova-3 scored highest (51.17%)
            3. **Local vs Cloud**: Local models were competitive on accuracy; cloud services handled punctuation better
            """)

if __name__ == "__main__":
    demo.launch()
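
# Note: the JSON files under data/results/ are expected to follow the layout sketched
# below. This is a minimal sketch inferred from the keys accessed in
# create_leaderboard_df() and create_punctuation_df(); the values shown are
# placeholders, not real results.
#
# benchmark_results.json:
# {
#   "results": [
#     {
#       "provider": "example-provider",
#       "model": "example-model",
#       "run_type": "local-stt",
#       "metrics": {
#         "wer": 0.0, "cer": 0.0, "word_accuracy": 0.0,
#         "insertions": 0, "deletions": 0, "substitutions": 0, "hits": 0
#       }
#     }
#   ]
# }
#
# punctuation_results.json:
# {
#   "results": [
#     {
#       "provider": "example-provider",
#       "model": "example-model",
#       "metrics": {
#         "overall_punctuation_score": 0.0,
#         "context_match_accuracy": 0.0,
#         "total_punctuation": {"hypothesis": 0, "reference": 0, "difference": 0}
#       }
#     }
#   ]
# }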