import json
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR / "data" / "results"


def _load_json_file(filename: str) -> dict:
    """Load a JSON file from the local results directory."""
    path = RESULTS_DIR / filename
    if not path.exists():
        raise FileNotFoundError(f"Required data file not found: {path}")
    with path.open("r", encoding="utf-8") as file:
        return json.load(file)


def load_benchmark_data():
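    """Return the transcription benchmark results loaded from benchmark_results.json."""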
    return _load_json_file("benchmark_results.json")


def load_punctuation_data():
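    """Return the punctuation evaluation results loaded from punctuation_results.json."""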
    return _load_json_file("punctuation_results.json")


def create_leaderboard_df(benchmark_data):
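    """Flatten benchmark results into a leaderboard DataFrame sorted by WER (ascending)."""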
    rows = []
    for result in benchmark_data["results"]:
        metrics = result["metrics"]
        rows.append({
            "Provider": result["provider"],
            "Model": result["model"],
            "Type": result["run_type"],
            "WER (%)": metrics["wer"],
            "CER (%)": metrics["cer"],
            "Word Accuracy (%)": metrics["word_accuracy"],
            "Insertions": metrics["insertions"],
            "Deletions": metrics["deletions"],
            "Substitutions": metrics["substitutions"],
            "Hits": metrics["hits"],
        })

    df = pd.DataFrame(rows)
    df = df.drop_duplicates(subset=["Provider", "Model", "Type"], keep="first")
    df = df.sort_values("WER (%)", ascending=True)
    return df


def create_punctuation_df(punct_data):
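    """Flatten punctuation results into a DataFrame sorted by overall score (descending)."""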
    rows = []
    for result in punct_data["results"]:
        metrics = result["metrics"]
        rows.append({
            "Provider": result["provider"],
            "Model": result["model"],
            "Overall Score (%)": metrics["overall_punctuation_score"],
            "Context Match (%)": metrics["context_match_accuracy"],
            "Hypothesis Count": metrics["total_punctuation"]["hypothesis"],
            "Reference Count": metrics["total_punctuation"]["reference"],
            "Difference": metrics["total_punctuation"]["difference"],
        })

    df = pd.DataFrame(rows)
    df = df.drop_duplicates(subset=["Provider", "Model"], keep="first")
    df = df.sort_values("Overall Score (%)", ascending=False)
    return df


def create_wer_chart(df):
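    """Build a bar chart of WER per model, colored by run type (local vs. cloud)."""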
    fig = go.Figure()

    # Color bars by run type: blue for local models, magenta for cloud services.
    colors = ['#2E86AB' if t == 'local-stt' else '#A23B72' for t in df['Type']]

    fig.add_trace(go.Bar(
        x=df['Model'],
        y=df['WER (%)'],
        text=df['WER (%)'].round(2),
        textposition='outside',
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>WER: %{y:.2f}%<extra></extra>'
    ))

    fig.update_layout(
        title="Word Error Rate (WER) Comparison - Lower is Better",
        xaxis_title="Model",
        yaxis_title="WER (%)",
        height=500,
        showlegend=False,
        template="plotly_white"
    )

    return fig


def create_accuracy_chart(df):
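    """Build a bar chart of word accuracy per model (higher is better)."""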
    fig = go.Figure()

    fig.add_trace(go.Bar(
        name='Word Accuracy',
        x=df['Model'],
        y=df['Word Accuracy (%)'],
        text=df['Word Accuracy (%)'].round(2),
        textposition='outside',
        marker_color='#06A77D'
    ))

    fig.update_layout(
        title="Word Accuracy Comparison - Higher is Better",
        xaxis_title="Model",
        yaxis_title="Accuracy (%)",
        height=500,
        template="plotly_white"
    )

    return fig


def create_error_breakdown_chart(df):
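    """Build a grouped bar chart of insertion, deletion, and substitution counts."""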
    fig = go.Figure()

    fig.add_trace(go.Bar(name='Insertions', x=df['Model'], y=df['Insertions'], marker_color='#F18F01'))
    fig.add_trace(go.Bar(name='Deletions', x=df['Model'], y=df['Deletions'], marker_color='#C73E1D'))
    fig.add_trace(go.Bar(name='Substitutions', x=df['Model'], y=df['Substitutions'], marker_color='#6A4C93'))

    fig.update_layout(
        title="Error Type Breakdown by Model",
        xaxis_title="Model",
        yaxis_title="Count",
        barmode='group',
        height=500,
        template="plotly_white"
    )

    return fig


def create_punctuation_chart(punct_df):
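    """Build a grouped bar chart of overall punctuation score and context-match accuracy."""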
    fig = go.Figure()

    fig.add_trace(go.Bar(
        name='Overall Punctuation Score',
        x=punct_df['Model'],
        y=punct_df['Overall Score (%)'],
        text=punct_df['Overall Score (%)'].round(2),
        textposition='outside',
        marker_color='#5F0F40'
    ))

    fig.add_trace(go.Bar(
        name='Context Match Accuracy',
        x=punct_df['Model'],
        y=punct_df['Context Match (%)'],
        text=punct_df['Context Match (%)'].round(2),
        textposition='outside',
        marker_color='#9A031E'
    ))

    fig.update_layout(
        title="Punctuation Performance Comparison",
        xaxis_title="Model",
        yaxis_title="Score (%)",
        barmode='group',
        height=500,
        template="plotly_white"
    )

    return fig


# Load the result files and build the display tables once at import time.
benchmark_data = load_benchmark_data()
punct_data = load_punctuation_data()
leaderboard_df = create_leaderboard_df(benchmark_data)
punct_df = create_punctuation_df(punct_data)


with gr.Blocks(title="Podcast ASR Evaluation Results", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Podcast ASR Evaluation Results

    Comprehensive evaluation of Automatic Speech Recognition (ASR) systems on podcast audio.
    This benchmark compares both local and cloud-based ASR models across multiple metrics.

    **Evaluated on:** A single podcast episode with a ground-truth transcript

    **Models Tested:** 7 ASR systems (2 local Whisper models, 5 cloud services)
    """)

    with gr.Tabs():
        with gr.Tab("Overall Leaderboard"):
            gr.Markdown("### Transcription Accuracy Leaderboard")
            gr.Markdown("Sorted by Word Error Rate (WER); lower is better.")
            gr.Dataframe(
                value=leaderboard_df,
                interactive=False,
                wrap=True
            )

        with gr.Tab("Accuracy Metrics"):
            gr.Markdown("### Word Error Rate (WER) Comparison")
            gr.Plot(create_wer_chart(leaderboard_df))

            gr.Markdown("### Word Accuracy Comparison")
            gr.Plot(create_accuracy_chart(leaderboard_df))

        with gr.Tab("Error Analysis"):
            gr.Markdown("### Error Type Breakdown")
            gr.Markdown("""
            - **Insertions**: Words added that are not in the reference
            - **Deletions**: Reference words the model missed
            - **Substitutions**: Reference words transcribed as different words
            """)
            gr.Plot(create_error_breakdown_chart(leaderboard_df))

            gr.Markdown("### Detailed Error Metrics")
            error_df = leaderboard_df[['Provider', 'Model', 'Insertions', 'Deletions', 'Substitutions', 'Hits']]
            gr.Dataframe(value=error_df, interactive=False)

        with gr.Tab("Punctuation Analysis"):
            gr.Markdown("### Punctuation Performance")
            gr.Markdown("""
            Evaluates how well each model reproduces punctuation marks.
            - **Overall Score**: Combined punctuation accuracy metric
            - **Context Match**: Punctuation placed in the correct context
            """)
            gr.Plot(create_punctuation_chart(punct_df))

            gr.Markdown("### Detailed Punctuation Metrics")
            gr.Dataframe(value=punct_df, interactive=False)
        with gr.Tab("About"):
            gr.Markdown("""
            ## About This Evaluation

            This benchmark evaluates ASR systems on long-form podcast audio, measuring:

            ### Accuracy Metrics
            - **WER (Word Error Rate)**: The primary accuracy metric; the percentage of word-level errors (see the worked example below)
            - **CER (Character Error Rate)**: Character-level accuracy
            - **Word Accuracy**: Percentage of correctly transcribed words
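
            For intuition, WER can be computed with the open-source `jiwer` library. A minimal sketch (illustrative only; not necessarily the exact tooling used for this benchmark):

            ```python
            import jiwer  # pip install jiwer

            reference = "the quick brown fox"
            hypothesis = "the quick brown fax"

            # WER = (substitutions + deletions + insertions) / words in reference
            print(jiwer.wer(reference, hypothesis))  # 0.25 -> one substitution in four words
            ```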

            ### Error Types
            - **Insertions**: Extra words that were added
            - **Deletions**: Words that are missing
            - **Substitutions**: Words replaced by different words

            ### Punctuation Metrics
            - Overall punctuation accuracy
            - Context-based punctuation matching
            - Per-mark accuracy (periods, commas, quotes, etc.); see the counting sketch below
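
            The punctuation count columns in the tables compare mark totals between the reference and each hypothesis. A minimal sketch of that counting idea, assuming simple character-level tallying (an illustration, not the benchmark's exact method):

            ```python
            from collections import Counter

            PUNCTUATION = set(".,!?;:")

            def count_marks(text: str) -> Counter:
                # Tally each punctuation character in a transcript.
                return Counter(ch for ch in text if ch in PUNCTUATION)

            reference = "Well, it works. Right?"
            hypothesis = "Well it works, right?"
            print(count_marks(reference))   # Counter({',': 1, '.': 1, '?': 1})
            print(count_marks(hypothesis))  # Counter({',': 1, '?': 1})
            ```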

            ### Models Tested

            **Local Models (via Buzz):**
            - Whisper Base
            - Whisper Tiny

            **Cloud Services:**
            - Gladia (Solaria-1)
            - Deepgram (Nova-3)
            - AssemblyAI (Best)
            - Speechmatics (SLAM-1 Global English)
            - OpenAI (Whisper-1)

            ### Dataset

            The full evaluation data is available at [danielrosehill/Podcast-ASR-Evaluation](https://huggingface.co/datasets/danielrosehill/Podcast-ASR-Evaluation).

            ### Key Findings

            1. **Best Overall Accuracy**: Local Whisper Base achieved the lowest WER (17.52%)
            2. **Best Punctuation**: Deepgram Nova-3 scored highest (51.17%)
            3. **Local vs. Cloud**: Local models are competitive on raw accuracy, while cloud services handle punctuation better
            """)


if __name__ == "__main__":
    demo.launch()