import json
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR / "data" / "results"
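# The app expects two result files in this directory:
#   benchmark_results.json   - per-run transcription accuracy metrics
#   punctuation_results.json - per-run punctuation accuracy metrics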
def _load_json_file(filename: str) -> dict:
"""Load a JSON file from the local results directory."""
path = RESULTS_DIR / filename
if not path.exists():
raise FileNotFoundError(f"Required data file not found: {path}")
with path.open("r", encoding="utf-8") as file:
return json.load(file)
# Load data from local JSON files
def load_benchmark_data():
    return _load_json_file("benchmark_results.json")

def load_punctuation_data():
    return _load_json_file("punctuation_results.json")
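# Illustrative sketch of the JSON layout the loaders above assume, inferred from the fields
# accessed below (the real files may carry additional keys):
#
#   benchmark_results.json:
#     {"results": [{"provider": "...", "model": "...", "run_type": "...",
#                   "metrics": {"wer": ..., "cer": ..., "word_accuracy": ...,
#                               "insertions": ..., "deletions": ..., "substitutions": ..., "hits": ...}}]}
#
#   punctuation_results.json:
#     {"results": [{"provider": "...", "model": "...",
#                   "metrics": {"overall_punctuation_score": ..., "context_match_accuracy": ...,
#                               "total_punctuation": {"hypothesis": ..., "reference": ..., "difference": ...}}}]}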
# Create leaderboard dataframe from benchmark results
def create_leaderboard_df(benchmark_data):
    rows = []
    for result in benchmark_data["results"]:
        row = {
            "Provider": result["provider"],
            "Model": result["model"],
            "Type": result["run_type"],
            "WER (%)": result["metrics"]["wer"],
            "CER (%)": result["metrics"]["cer"],
            "Word Accuracy (%)": result["metrics"]["word_accuracy"],
            "Insertions": result["metrics"]["insertions"],
            "Deletions": result["metrics"]["deletions"],
            "Substitutions": result["metrics"]["substitutions"],
            "Hits": result["metrics"]["hits"]
        }
        rows.append(row)
    df = pd.DataFrame(rows)
    # Remove duplicate provider/model/type entries so each run appears only once in the tables and charts
    df = df.drop_duplicates(subset=["Provider", "Model", "Type"], keep="first")
    # Sort by WER (lower is better)
    df = df.sort_values("WER (%)", ascending=True)
    return df
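# Illustrative note: if benchmark_results.json lists the same provider/model/run_type combination
# more than once, only the first occurrence is kept, and the returned frame is ordered from
# lowest to highest WER.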
# Create punctuation dataframe
def create_punctuation_df(punct_data):
    rows = []
    for result in punct_data["results"]:
        row = {
            "Provider": result["provider"],
            "Model": result["model"],
            "Overall Score (%)": result["metrics"]["overall_punctuation_score"],
            "Context Match (%)": result["metrics"]["context_match_accuracy"],
            "Total Punctuation": result["metrics"]["total_punctuation"]["hypothesis"],
            "Reference": result["metrics"]["total_punctuation"]["reference"],
            "Difference": result["metrics"]["total_punctuation"]["difference"]
        }
        rows.append(row)
    df = pd.DataFrame(rows)
    # Remove duplicate provider/model entries so each model appears only once
    df = df.drop_duplicates(subset=["Provider", "Model"], keep="first")
    # Sort by overall score (higher is better)
    df = df.sort_values("Overall Score (%)", ascending=False)
    return df
# Create WER comparison chart
def create_wer_chart(df):
    fig = go.Figure()
    # Color bars by run type: blue for local-stt runs, magenta for everything else (cloud runs)
    colors = ['#2E86AB' if t == 'local-stt' else '#A23B72' for t in df['Type']]
    fig.add_trace(go.Bar(
        x=df['Model'],
        y=df['WER (%)'],
        text=df['WER (%)'].round(2),
        textposition='outside',
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>WER: %{y:.2f}%<extra></extra>'
    ))
    fig.update_layout(
        title="Word Error Rate (WER) Comparison - Lower is Better",
        xaxis_title="Model",
        yaxis_title="WER (%)",
        height=500,
        showlegend=False,
        template="plotly_white"
    )
    return fig
# Create accuracy comparison chart
def create_accuracy_chart(df):
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name='Word Accuracy',
        x=df['Model'],
        y=df['Word Accuracy (%)'],
        text=df['Word Accuracy (%)'].round(2),
        textposition='outside',
        marker_color='#06A77D'
    ))
    fig.update_layout(
        title="Word Accuracy Comparison - Higher is Better",
        xaxis_title="Model",
        yaxis_title="Accuracy (%)",
        height=500,
        template="plotly_white"
    )
    return fig
# Create error breakdown chart
def create_error_breakdown_chart(df):
    fig = go.Figure()
    fig.add_trace(go.Bar(name='Insertions', x=df['Model'], y=df['Insertions'], marker_color='#F18F01'))
    fig.add_trace(go.Bar(name='Deletions', x=df['Model'], y=df['Deletions'], marker_color='#C73E1D'))
    fig.add_trace(go.Bar(name='Substitutions', x=df['Model'], y=df['Substitutions'], marker_color='#6A4C93'))
    fig.update_layout(
        title="Error Type Breakdown by Model",
        xaxis_title="Model",
        yaxis_title="Count",
        barmode='group',
        height=500,
        template="plotly_white"
    )
    return fig
# Create punctuation score chart
def create_punctuation_chart(punct_df):
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name='Overall Punctuation Score',
        x=punct_df['Model'],
        y=punct_df['Overall Score (%)'],
        text=punct_df['Overall Score (%)'].round(2),
        textposition='outside',
        marker_color='#5F0F40'
    ))
    fig.add_trace(go.Bar(
        name='Context Match Accuracy',
        x=punct_df['Model'],
        y=punct_df['Context Match (%)'],
        text=punct_df['Context Match (%)'].round(2),
        textposition='outside',
        marker_color='#9A031E'
    ))
    fig.update_layout(
        title="Punctuation Performance Comparison",
        xaxis_title="Model",
        yaxis_title="Score (%)",
        barmode='group',
        height=500,
        template="plotly_white"
    )
    return fig
# Load data
benchmark_data = load_benchmark_data()
punct_data = load_punctuation_data()
leaderboard_df = create_leaderboard_df(benchmark_data)
punct_df = create_punctuation_df(punct_data)
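# Note: the result files are read once at import time, so a missing file stops the app at startup
# (FileNotFoundError from _load_json_file) rather than failing later when a tab is rendered.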
# Create Gradio interface
with gr.Blocks(title="Podcast ASR Evaluation Results", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# πŸŽ™οΈ Podcast ASR Evaluation Results
Comprehensive evaluation of Automatic Speech Recognition (ASR) systems on podcast audio.
This benchmark compares both local and cloud-based ASR models across multiple metrics.
**Evaluated on:** Single podcast episode ground truth
**Models Tested:** 8 ASR systems (3 local Whisper models, 5 cloud services)
""")
with gr.Tabs():
with gr.Tab("πŸ“Š Overall Leaderboard"):
gr.Markdown("### Transcription Accuracy Leaderboard")
gr.Markdown("Sorted by Word Error Rate (WER) - lower is better")
gr.Dataframe(
value=leaderboard_df,
interactive=False,
wrap=True
)
with gr.Tab("πŸ“ˆ Accuracy Metrics"):
gr.Markdown("### Word Error Rate (WER) Comparison")
gr.Plot(create_wer_chart(leaderboard_df))
gr.Markdown("### Word Accuracy Comparison")
gr.Plot(create_accuracy_chart(leaderboard_df))
with gr.Tab("πŸ” Error Analysis"):
gr.Markdown("### Error Type Breakdown")
gr.Markdown("""
- **Insertions**: Words added that weren't in the original
- **Deletions**: Words missed from the original
- **Substitutions**: Words incorrectly transcribed
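
            For example, if the reference is "the cat sat" and a model transcribes "the black cat sit",
            that counts as one insertion ("black"), one substitution ("sit" for "sat"), and no deletions.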
""")
gr.Plot(create_error_breakdown_chart(leaderboard_df))
gr.Markdown("### Detailed Error Metrics")
error_df = leaderboard_df[['Provider', 'Model', 'Insertions', 'Deletions', 'Substitutions', 'Hits']]
gr.Dataframe(value=error_df, interactive=False)
with gr.Tab("✏️ Punctuation Analysis"):
gr.Markdown("### Punctuation Performance")
gr.Markdown("""
Evaluates how well each model handles punctuation marks.
- **Overall Score**: Combined punctuation accuracy metric
- **Context Match**: Punctuation placed in correct context
""")
gr.Plot(create_punctuation_chart(punct_df))
gr.Markdown("### Detailed Punctuation Metrics")
gr.Dataframe(value=punct_df, interactive=False)
with gr.Tab("πŸ“– About"):
gr.Markdown("""
## About This Evaluation
This benchmark evaluates ASR systems on long-form podcast audio, measuring:
### Accuracy Metrics
- **WER (Word Error Rate)**: Primary accuracy metric - percentage of word errors
- **CER (Character Error Rate)**: Character-level accuracy
- **Word Accuracy**: Percentage of correctly transcribed words
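
            WER is computed as (Substitutions + Deletions + Insertions) divided by the number of words
            in the reference transcript, expressed as a percentage.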
            ### Error Types
            - **Insertions**: Extra words added
            - **Deletions**: Missing words
            - **Substitutions**: Words replaced with incorrect words

            ### Punctuation Metrics
            - Overall punctuation accuracy
            - Context-based punctuation matching
            - Per-mark accuracy (periods, commas, quotes, etc.)

            ### Models Tested
            **Local Models (via Buzz):**
            - Whisper Base
            - Whisper Tiny

            **Cloud Services:**
            - Gladia (Solaria-1)
            - Deepgram (Nova-3)
            - AssemblyAI (Best)
            - Speechmatics (SLAM-1 Global English)
            - OpenAI (Whisper-1)

            ### Dataset
            Full evaluation data is available at [danielrosehill/Podcast-ASR-Evaluation](https://huggingface.co/datasets/danielrosehill/Podcast-ASR-Evaluation).

            ### Key Findings
            1. **Best Overall Accuracy**: Local Whisper Base achieved the lowest WER (17.52%)
            2. **Best Punctuation**: Deepgram Nova-3 had the highest punctuation score (51.17%)
            3. **Local vs Cloud**: Local models are competitive on accuracy, while cloud services handle punctuation better
            """)
if __name__ == "__main__":
    demo.launch()