from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Ground-truth transcript displayed in the Overview tab. When the file is not
# present, a placeholder string is shown instead of failing at import time.
reference_text_path = Path("text/reference.txt")
if reference_text_path.exists():
    # Decode explicitly so the result does not depend on the host's locale
    # default encoding.
    # NOTE(review): assumes the transcript is stored as UTF-8 — confirm.
    reference_text = reference_text_path.read_text(encoding="utf-8")
else:
    reference_text = "Reference text not available"

# Test clip offered for playback in the Overview tab. The existence flag is
# computed once here and decides whether the UI shows a player or a note.
audio_path = Path("audio/001.wav")
audio_exists = audio_path.exists()

# Per-model benchmark results, one entry per Whisper model size.
#   "WER (%)"    - word error rate in percent (lower is better)
#   "Speed (s)"  - inference time in seconds for the test clip
#   "Model Size" - parameter count label shown in chart hover text
wer_data = {
    "Model": ["tiny", "base", "small", "medium", "large-v3-turbo"],
    "WER (%)": [15.05, 9.95, 11.17, 6.07, 7.04],
    "Speed (s)": [2.73, 5.01, 5.14, 19.42, 33.08],
    "Model Size": ["39M", "74M", "244M", "769M", "809M"],
}
df_wer = pd.DataFrame(wer_data)

# Results per inference engine; the chart built from this table is titled
# "WER by Engine (Base Model)".
# NOTE(review): the speeds here (faster-whisper 4.87s, openai-whisper 6.51s)
# do not match the figures quoted in the Questions & Answers tab text
# (5.01s and 6.17s) — confirm which measurements are correct.
engine_data = {
    "Engine": ["faster-whisper", "openai-whisper", "distil-whisper"],
    "WER (%)": [9.95, 9.95, 21.6],
    "Speed (s)": [4.87, 6.51, 38.49],
}
df_engine = pd.DataFrame(engine_data)

# Bar chart of WER per model. Each bar gets its own colour, the rounded WER is
# printed on the bar, and the model-size label is passed through `customdata`
# so the hover template can display it.
fig_wer = go.Figure()
fig_wer.add_trace(go.Bar(
    x=df_wer["Model"],
    y=df_wer["WER (%)"],
    text=df_wer["WER (%)"].round(2),
    textposition='auto',
    marker_color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7'],
    hovertemplate='<b>%{x}</b><br>WER: %{y:.2f}%<br>Size: %{customdata}<extra></extra>',
    customdata=df_wer["Model Size"],
))
fig_wer.update_layout(
    title="Word Error Rate by Model Size",
    xaxis_title="Model",
    yaxis_title="WER (%)",
    template="plotly_white",
    height=400,
)

# Scatter plot of inference time (x) against WER (y), one labelled marker per
# model, using the same colour order as the WER bar chart.
fig_scatter = go.Figure()
fig_scatter.add_trace(go.Scatter(
    x=df_wer["Speed (s)"],
    y=df_wer["WER (%)"],
    mode='markers+text',
    marker=dict(size=15, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']),
    text=df_wer["Model"],
    textposition="top center",
    hovertemplate='<b>%{text}</b><br>Speed: %{x:.2f}s<br>WER: %{y:.2f}%<extra></extra>',
))
fig_scatter.update_layout(
    title="Speed vs Accuracy Tradeoff",
    xaxis_title="Inference Time (seconds)",
    yaxis_title="WER (%)",
    template="plotly_white",
    height=400,
)

# Bar chart of WER per inference engine, with the rounded WER printed on each
# bar. Only WER is plotted; the "Speed (s)" column of df_engine is not used.
fig_engine = go.Figure()
fig_engine.add_trace(go.Bar(
    x=df_engine["Engine"],
    y=df_engine["WER (%)"],
    name="WER (%)",
    marker_color='#4ECDC4',
    text=df_engine["WER (%)"].round(2),
    textposition='auto',
))
fig_engine.update_layout(
    title="WER by Engine (Base Model)",
    xaxis_title="Engine",
    yaxis_title="WER (%)",
    template="plotly_white",
    height=400,
)

# Stylesheet handed to gr.Blocks(css=...).
# NOTE(review): no component in this file is given the `limitation-box` or
# `question-box` class, so those two rules currently appear to style nothing —
# confirm whether they should be attached to a component.
custom_css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.limitation-box {
    background-color: #FFF3CD;
    border-left: 4px solid #FFC107;
    padding: 15px;
    margin: 10px 0;
}
.question-box {
    background-color: #E3F2FD;
    border-left: 4px solid #2196F3;
    padding: 15px;
    margin: 15px 0;
}
"""

# Build the dashboard: a header, five tabs (Overview, Results, Questions &
# Answers, Hardware & Setup, About) and a footer.
# NOTE(review): the nesting below was reconstructed from statement syntax and
# blank-line spacing; in particular the footer Markdown/HTML is assumed to sit
# below the tab container rather than inside the last tab — confirm.
# NOTE(review): Markdown bodies are indented along with the code; presumably
# Gradio strips the common indentation before rendering — confirm.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Local ASR/STT Benchmark Evaluation
        ### A Single Sample Evaluation on Local Hardware

        Testing different Whisper model sizes to find the optimal balance between accuracy and speed for daily transcription workflow.
        """
    )

    with gr.Tabs():

        # --- Tab 1: what was tested and the caveats --------------------------
        with gr.Tab("📊 Overview"):
            gr.Markdown(
                """
                ## About This Evaluation

                This was a "back of the envelope" style experiment to determine which Whisper model size works best
                for daily transcription on local hardware, focusing on the tradeoff between accuracy (WER) and inference speed.
                """
            )

            gr.Markdown("### 🎯 Test Sample")

            # Only offer a player when the clip is actually present on disk.
            if audio_exists:
                gr.Audio(
                    value=str(audio_path),
                    label="Test Audio (001.wav)",
                    type="filepath",
                )
            else:
                gr.Markdown("**Note:** Audio file will be added soon.")

            gr.Markdown("### 📝 Reference Text (Ground Truth)")
            gr.Textbox(
                value=reference_text,
                label="Reference Transcription",
                lines=10,
                max_lines=15,
                interactive=False,
            )

            gr.Markdown(
                """
                ### ⚠️ Important Limitations

                - **Quick experiment**: Not a definitive scientific evaluation
                - **Hardware specific**: AMD GPU with ROCm (not ideal for STT), using CPU inference
                - **Single sample**: Results based on one audio clip
                - **Variable conditions**: ASR accuracy depends on mic quality, background noise, speaking style
                - **Personal use case**: Optimized for one user's voice and workflow
                """
            )

        # --- Tab 2: headline numbers, interactive plots, static charts -------
        with gr.Tab("📈 Results"):
            gr.Markdown("## Key Findings")

            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        """
                        ### Best Accuracy
                        **medium** model
                        - 6.07% WER
                        - 19.42s inference

                        ### Fastest
                        **tiny** model
                        - 15.05% WER
                        - 2.73s inference

                        ### Recommended for Daily Use
                        **base** model (faster-whisper)
                        - 9.95% WER
                        - ~5s inference
                        - Good balance
                        """
                    )

                with gr.Column():
                    gr.Markdown(
                        """
                        ### Key Takeaways

                        1. **Biggest jump**: tiny → base (15% → 10% WER)
                        2. **Diminishing returns**: After base, accuracy gains are smaller
                        3. **faster-whisper**: Same accuracy as OpenAI, 1.2x faster
                        4. **distil-whisper**: Unexpectedly slower AND less accurate on this sample
                        """
                    )

            gr.Markdown("## Interactive Visualizations")

            with gr.Row():
                gr.Plot(fig_wer, label="WER by Model Size")

            with gr.Row():
                gr.Plot(fig_scatter, label="Speed vs Accuracy")

            with gr.Row():
                gr.Plot(fig_engine, label="Engine Comparison")

            gr.Markdown("## Original Charts from Benchmark")

            # NOTE(review): unlike the audio clip, these image paths are not
            # checked for existence before use — confirm the PNGs always ship
            # with the app.
            with gr.Row():
                with gr.Column():
                    gr.Image("results/wer_by_size.png", label="WER by Size")
                with gr.Column():
                    gr.Image("results/speed_by_size.png", label="Speed by Size")

            with gr.Row():
                with gr.Column():
                    gr.Image("results/accuracy_speed_tradeoff.png", label="Accuracy vs Speed")
                with gr.Column():
                    gr.Image("results/engine_comparison.png", label="Engine Comparison")

            with gr.Row():
                gr.Image("results/variants_comparison.png", label="All Variants Tested")

        # --- Tab 3: narrative findings ---------------------------------------
        # NOTE(review): the Q2 timings in this text (5.01s / 6.17s) differ from
        # the values in engine_data (4.87s / 6.51s) — confirm which are correct.
        with gr.Tab("❓ Questions & Answers"):
            gr.Markdown(
                """
                # Research Questions & Findings

                ## Q1: How much does model size actually matter for accuracy?

                **Answer:** On my hardware, diminishing returns set in around **medium**.

                The biggest accuracy jump was from tiny (15.05% WER) → base (9.95% WER). After that, improvements are smaller:
                - tiny → base: 5.1% improvement
                - base → medium: 3.88% improvement
                - medium → large-v3-turbo: Actually worse (1% regression)

                The "sweet spot" depends on your use case:
                - **Live transcription**: Even small lags matter → base or small
                - **Batch processing**: Can afford slower → medium or large

                ---

                ## Q2: Is faster-whisper really as good as OpenAI Whisper?

                **Answer:** Yes! On this test, identical accuracy with better speed.

                Testing the base model:
                - **faster-whisper**: 9.95% WER in 5.01s
                - **openai-whisper**: 9.95% WER in 6.17s

                faster-whisper was ~1.2x faster with no accuracy loss. Clear winner for my use case.

                ---

                ## Q3: What's the speed vs. accuracy tradeoff?

                **Answer:** For daily transcription of my own voice, base or small hits the sweet spot.

                - **tiny**: 2.73s but 15% WER is too rough
                - **base**: 5s with 10% WER - acceptable for daily use
                - **small**: Similar to base, slightly slower
                - **medium**: 6% WER but 7x slower than tiny
                - **large-v3-turbo**: 33s for 7% WER - overkill for casual use

                ---

                ## Q4: Which model should I use for my daily STT workflow?

                **My personal answer:** base model with faster-whisper

                **Why it works for me:**
                - ~10% WER is acceptable for dictation (I can quickly fix errors)
                - 5 seconds per clip is fast enough
                - 140MB model size is manageable
                - Good balance for daily workflow

                **When I'd use something else:**
                - **tiny**: Quick tests or very long recordings where speed matters most
                - **medium/large**: Publishing or professional work needing better accuracy

                ---

                ## Bonus Finding: distil-whisper

                I tested distil-whisper expecting it to be faster, but on my sample:
                - **distil-whisper**: 21.6% WER in 38.49s ✗

                Both slower AND less accurate than the standard models. Unexpected, but that's the data.
                """
            )

        # --- Tab 4: environment and how to reproduce -------------------------
        with gr.Tab("💻 Hardware & Setup"):
            gr.Markdown(
                """
                ## Test Environment

                ### Hardware
                - **GPU**: AMD Radeon RX 7700 XT (ROCm available but using CPU inference)
                - **CPU**: Intel Core i7-12700F (12 cores, 20 threads)
                - **RAM**: 64 GB
                - **OS**: Ubuntu 25.04

                ### Why CPU Inference?
                - AMD GPU with ROCm isn't ideal for STT workloads
                - CPU inference provided more consistent results
                - Your performance will differ based on your hardware

                ### Models Tested

                **Whisper model sizes:**
                - tiny (39M params)
                - base (74M params)
                - small (244M params)
                - medium (769M params)
                - large-v3-turbo (809M params)

                **Engines compared:**
                - OpenAI Whisper (original implementation)
                - faster-whisper (optimized CTranslate2)
                - distil-whisper (distilled variant)

                ### Metrics
                - **WER (Word Error Rate)**: Lower is better - percentage of words transcribed incorrectly
                - **Inference Time**: How long it takes to transcribe the audio sample

                ## Running Your Own Tests

                Want to benchmark on your own voice and hardware?

                1. Clone the repository: [github.com/danielrosehill/Local-ASR-STT-Benchmark](https://github.com/danielrosehill/Local-ASR-STT-Benchmark)
                2. Set up the conda environment (see `setup.md`)
                3. Record your own audio and create reference transcriptions
                4. Run the benchmark scripts
                5. Generate visualizations

                Your results will likely differ based on:
                - Your hardware (GPU/CPU)
                - Your voice characteristics
                - Your microphone quality
                - Background noise conditions
                - Speaking style and pace
                """
            )

        # --- Tab 5: project background ---------------------------------------
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """
                ## About This Project

                ### Motivation

                I was tired of guessing which Whisper model size to use for speech-to-text. There are plenty of
                benchmarks out there, but they're often:
                - Run on different hardware than mine
                - Tested on different voice characteristics
                - Using different microphones and conditions

                So I decided to run my own evaluation on my actual setup with my actual voice.

                ### Why This Matters

                If you're doing hours of transcription per day (like I am), optimizing your STT setup is worth it:
                - Faster models = less waiting
                - More accurate models = less editing
                - Finding the sweet spot = better workflow

                ### Next Steps

                For a more robust evaluation, I'd want to:
                - Test on multiple audio samples
                - Include different speaking styles (casual, technical, professional)
                - Test on different microphones
                - Evaluate punctuation and capitalization accuracy
                - Compare ASR (Automatic Speech Recognition) vs traditional STT
                - Test GPU inference on NVIDIA hardware

                ### Repository

                Full benchmark code and results:
                [github.com/danielrosehill/Local-ASR-STT-Benchmark](https://github.com/danielrosehill/Local-ASR-STT-Benchmark)

                ### License

                MIT License - Feel free to use and adapt for your own benchmarks!

                ---

                *Built with Gradio • Whisper models by OpenAI • Hosted on Hugging Face Spaces*
                """
            )

    # --- Footer, shown beneath the tabs --------------------------------------
    gr.Markdown(
        """
        ---
        ### 📧 Questions or feedback?
        Visit the [GitHub repository](https://github.com/danielrosehill/Local-ASR-STT-Benchmark) to open an issue or contribute.
        """
    )

    gr.HTML(
        """
        <div style="text-align: center; margin-top: 20px;">
        <a href="https://danielrosehill.com" target="_blank">
        <img src="/file/badge.png" alt="Daniel Rosehill" style="width: 480px;">
        </a>
        </div>
        """
    )

# Launch the app only when this file is executed as a script; importing the
# module just builds `demo` without starting a server.
if __name__ == "__main__":
    demo.launch()