import json
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

BASE_DIR = Path(__file__).parent
RESULTS_DIR = BASE_DIR / "data" / "results"


def _load_json_file(filename: str) -> dict:
    """Load a JSON file from the local results directory."""
    path = RESULTS_DIR / filename
    if not path.exists():
        raise FileNotFoundError(f"Required data file not found: {path}")
    with path.open("r", encoding="utf-8") as file:
        return json.load(file)


def load_benchmark_data():
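    """Return the transcription benchmark results loaded from benchmark_results.json."""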
    return _load_json_file("benchmark_results.json")


def load_punctuation_data():
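    """Return the punctuation evaluation results loaded from punctuation_results.json."""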
    return _load_json_file("punctuation_results.json")


def create_leaderboard_df(benchmark_data):
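    """Flatten benchmark results into a leaderboard DataFrame sorted by WER (ascending)."""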
    rows = []
    for result in benchmark_data["results"]:
        metrics = result["metrics"]
        rows.append({
            "Provider": result["provider"],
            "Model": result["model"],
            "Type": result["run_type"],
            "WER (%)": metrics["wer"],
            "CER (%)": metrics["cer"],
            "Word Accuracy (%)": metrics["word_accuracy"],
            "Insertions": metrics["insertions"],
            "Deletions": metrics["deletions"],
            "Substitutions": metrics["substitutions"],
            "Hits": metrics["hits"],
        })

    df = pd.DataFrame(rows)
    df = df.drop_duplicates(subset=["Provider", "Model", "Type"], keep="first")
    df = df.sort_values("WER (%)", ascending=True)
    return df


def create_punctuation_df(punct_data):
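    """Flatten punctuation results into a DataFrame sorted by overall score (descending)."""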
    rows = []
    for result in punct_data["results"]:
        metrics = result["metrics"]
        rows.append({
            "Provider": result["provider"],
            "Model": result["model"],
            "Overall Score (%)": metrics["overall_punctuation_score"],
            "Context Match (%)": metrics["context_match_accuracy"],
            "Hypothesis Count": metrics["total_punctuation"]["hypothesis"],
            "Reference Count": metrics["total_punctuation"]["reference"],
            "Difference": metrics["total_punctuation"]["difference"],
        })

    df = pd.DataFrame(rows)
    df = df.drop_duplicates(subset=["Provider", "Model"], keep="first")
    df = df.sort_values("Overall Score (%)", ascending=False)
    return df


def create_wer_chart(df):
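    """Build a bar chart of WER per model, colored by run type (local vs. cloud)."""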
    fig = go.Figure()

    # Color bars by run type: blue for local models, magenta for cloud services.
    colors = ['#2E86AB' if t == 'local-stt' else '#A23B72' for t in df['Type']]

    fig.add_trace(go.Bar(
        x=df['Model'],
        y=df['WER (%)'],
        text=df['WER (%)'].round(2),
        textposition='outside',
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>WER: %{y:.2f}%<extra></extra>'
    ))

    fig.update_layout(
        title="Word Error Rate (WER) Comparison - Lower is Better",
        xaxis_title="Model",
        yaxis_title="WER (%)",
        height=500,
        showlegend=False,
        template="plotly_white"
    )

    return fig


def create_accuracy_chart(df):
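    """Build a bar chart of word accuracy per model (higher is better)."""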
    fig = go.Figure()

    fig.add_trace(go.Bar(
        name='Word Accuracy',
        x=df['Model'],
        y=df['Word Accuracy (%)'],
        text=df['Word Accuracy (%)'].round(2),
        textposition='outside',
        marker_color='#06A77D'
    ))

    fig.update_layout(
        title="Word Accuracy Comparison - Higher is Better",
        xaxis_title="Model",
        yaxis_title="Accuracy (%)",
        height=500,
        template="plotly_white"
    )

    return fig


def create_error_breakdown_chart(df):
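    """Build a grouped bar chart of insertion, deletion, and substitution counts."""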
    fig = go.Figure()

    fig.add_trace(go.Bar(name='Insertions', x=df['Model'], y=df['Insertions'], marker_color='#F18F01'))
    fig.add_trace(go.Bar(name='Deletions', x=df['Model'], y=df['Deletions'], marker_color='#C73E1D'))
    fig.add_trace(go.Bar(name='Substitutions', x=df['Model'], y=df['Substitutions'], marker_color='#6A4C93'))

    fig.update_layout(
        title="Error Type Breakdown by Model",
        xaxis_title="Model",
        yaxis_title="Count",
        barmode='group',
        height=500,
        template="plotly_white"
    )

    return fig


def create_punctuation_chart(punct_df):
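    """Build a grouped bar chart of overall punctuation score and context-match accuracy."""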
    fig = go.Figure()

    fig.add_trace(go.Bar(
        name='Overall Punctuation Score',
        x=punct_df['Model'],
        y=punct_df['Overall Score (%)'],
        text=punct_df['Overall Score (%)'].round(2),
        textposition='outside',
        marker_color='#5F0F40'
    ))

    fig.add_trace(go.Bar(
        name='Context Match Accuracy',
        x=punct_df['Model'],
        y=punct_df['Context Match (%)'],
        text=punct_df['Context Match (%)'].round(2),
        textposition='outside',
        marker_color='#9A031E'
    ))

    fig.update_layout(
        title="Punctuation Performance Comparison",
        xaxis_title="Model",
        yaxis_title="Score (%)",
        barmode='group',
        height=500,
        template="plotly_white"
    )

    return fig


# Load the result files and build the display tables once at import time.
benchmark_data = load_benchmark_data()
punct_data = load_punctuation_data()
leaderboard_df = create_leaderboard_df(benchmark_data)
punct_df = create_punctuation_df(punct_data)


with gr.Blocks(title="Podcast ASR Evaluation Results", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Podcast ASR Evaluation Results

    Comprehensive evaluation of Automatic Speech Recognition (ASR) systems on podcast audio.
    This benchmark compares both local and cloud-based ASR models across multiple metrics.

    **Evaluated on:** A single podcast episode with a ground-truth transcript

    **Models Tested:** 7 ASR systems (2 local Whisper models, 5 cloud services)
    """)

    with gr.Tabs():
        with gr.Tab("Overall Leaderboard"):
            gr.Markdown("### Transcription Accuracy Leaderboard")
            gr.Markdown("Sorted by Word Error Rate (WER); lower is better.")
            gr.Dataframe(
                value=leaderboard_df,
                interactive=False,
                wrap=True
            )

        with gr.Tab("Accuracy Metrics"):
            gr.Markdown("### Word Error Rate (WER) Comparison")
            gr.Plot(create_wer_chart(leaderboard_df))

            gr.Markdown("### Word Accuracy Comparison")
            gr.Plot(create_accuracy_chart(leaderboard_df))

        with gr.Tab("Error Analysis"):
            gr.Markdown("### Error Type Breakdown")
            gr.Markdown("""
            - **Insertions**: Words added that are not in the reference
            - **Deletions**: Reference words the model missed
            - **Substitutions**: Reference words transcribed as different words
            """)
            gr.Plot(create_error_breakdown_chart(leaderboard_df))

            gr.Markdown("### Detailed Error Metrics")
            error_df = leaderboard_df[['Provider', 'Model', 'Insertions', 'Deletions', 'Substitutions', 'Hits']]
            gr.Dataframe(value=error_df, interactive=False)

        with gr.Tab("Punctuation Analysis"):
            gr.Markdown("### Punctuation Performance")
            gr.Markdown("""
            Evaluates how well each model reproduces punctuation marks.
            - **Overall Score**: Combined punctuation accuracy metric
            - **Context Match**: Punctuation placed in the correct context
            """)
            gr.Plot(create_punctuation_chart(punct_df))

            gr.Markdown("### Detailed Punctuation Metrics")
            gr.Dataframe(value=punct_df, interactive=False)
        with gr.Tab("About"):
            gr.Markdown("""
            ## About This Evaluation

            This benchmark evaluates ASR systems on long-form podcast audio, measuring:

            ### Accuracy Metrics
            - **WER (Word Error Rate)**: The primary accuracy metric; the percentage of word-level errors (see the worked example below)
            - **CER (Character Error Rate)**: Character-level accuracy
            - **Word Accuracy**: Percentage of correctly transcribed words
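
            For intuition, WER can be computed with the open-source `jiwer` library. A minimal sketch (illustrative only; not necessarily the exact tooling used for this benchmark):

            ```python
            import jiwer  # pip install jiwer

            reference = "the quick brown fox"
            hypothesis = "the quick brown fax"

            # WER = (substitutions + deletions + insertions) / words in reference
            print(jiwer.wer(reference, hypothesis))  # 0.25 -> one substitution in four words
            ```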

            ### Error Types
            - **Insertions**: Extra words that were added
            - **Deletions**: Words that are missing
            - **Substitutions**: Words replaced by different words

            ### Punctuation Metrics
            - Overall punctuation accuracy
            - Context-based punctuation matching
            - Per-mark accuracy (periods, commas, quotes, etc.); see the counting sketch below
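
            The punctuation count columns in the tables compare mark totals between the reference and each hypothesis. A minimal sketch of that counting idea, assuming simple character-level tallying (an illustration, not the benchmark's exact method):

            ```python
            from collections import Counter

            PUNCTUATION = set(".,!?;:")

            def count_marks(text: str) -> Counter:
                # Tally each punctuation character in a transcript.
                return Counter(ch for ch in text if ch in PUNCTUATION)

            reference = "Well, it works. Right?"
            hypothesis = "Well it works, right?"
            print(count_marks(reference))   # Counter({',': 1, '.': 1, '?': 1})
            print(count_marks(hypothesis))  # Counter({',': 1, '?': 1})
            ```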

            ### Models Tested

            **Local Models (via Buzz):**
            - Whisper Base
            - Whisper Tiny

            **Cloud Services:**
            - Gladia (Solaria-1)
            - Deepgram (Nova-3)
            - AssemblyAI (Best)
            - Speechmatics (SLAM-1 Global English)
            - OpenAI (Whisper-1)

            ### Dataset

            The full evaluation data is available at [danielrosehill/Podcast-ASR-Evaluation](https://huggingface.co/datasets/danielrosehill/Podcast-ASR-Evaluation).

            ### Key Findings

            1. **Best Overall Accuracy**: Local Whisper Base achieved the lowest WER (17.52%)
            2. **Best Punctuation**: Deepgram Nova-3 scored highest (51.17%)
            3. **Local vs. Cloud**: Local models are competitive on raw accuracy, while cloud services handle punctuation better
            """)


if __name__ == "__main__":
    demo.launch()