import os
import re
import time
import warnings
from pathlib import Path

import gradio as gr
import torch
from faster_whisper import WhisperModel
from jiwer import cer, wer

warnings.filterwarnings("ignore")

# Lazily populated model cache and compute device (set on first model load).
WHISPER_MODELS = {}
DEVICE = None

# Model IDs are either faster-whisper built-in sizes ("small", "large-v3", ...)
# or Hugging Face repos with CTranslate2-converted weights (the ivrit-ai models).
AVAILABLE_WHISPER_MODELS = {
    "ivrit-ai/faster-whisper-v2-d4": "Hebrew Faster-Whisper V2-D4 (Recommended)",
    "ivrit-ai/faster-whisper-v2-d3": "Hebrew Faster-Whisper V2-D3",
    "ivrit-ai/faster-whisper-v2-d2": "Hebrew Faster-Whisper V2-D2",
    "large-v3": "OpenAI Whisper Large V3 (Multilingual)",
    "large-v2": "OpenAI Whisper Large V2 (Multilingual)",
    "medium": "OpenAI Whisper Medium (Multilingual)",
    "small": "OpenAI Whisper Small (Multilingual)",
}

DEFAULT_AUDIO = "heb.wav"
DEFAULT_TRANSCRIPTION = "שלום! אנחנו נרגשים להציג לכם את יכולות הדיבור הטבעי שלנו. כאן תוכלו לביים קול, ליצור דיאלוגים מציאותיים ועוד הרבה יותר. ערכו את המקומות הללו כדי להתחיל."

# Bundled sample clips (expected to sit next to this script) with their
# ground-truth transcriptions.
PREDEFINED_AUDIO_FILES = {
    "heb.wav": {
        "file": "heb.wav",
        "description": "Regular quality Hebrew audio",
        "transcription": DEFAULT_TRANSCRIPTION,
    },
    "noise.wav": {
        "file": "noise.wav",
        "description": "Noisy Hebrew audio",
        "transcription": "אז כך, קרנות החיסכון האלה כאילו מנסות לבנות מנדט לכל הסטארט-אפים הפרטיים..",
    },
}
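
# To benchmark another clip, place the wav next to this script and register it
# here, e.g. (hypothetical file):
#   PREDEFINED_AUDIO_FILES["studio.wav"] = {
#       "file": "studio.wav",
#       "description": "Studio-quality Hebrew audio",
#       "transcription": "<ground-truth Hebrew text>",
#   }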


def normalize_hebrew_text(text):
    """Normalize Hebrew text for WER/CER calculation."""
    if not text:
        return ""

    # Strip Hebrew diacritics: niqqud and cantillation marks (U+0591-U+05C7).
    hebrew_diacritics = "".join(chr(i) for i in range(0x0591, 0x05C8))
    text = "".join(c for c in text if c not in hebrew_diacritics)

    # Replace punctuation with spaces (\w matches Hebrew letters in Python 3).
    text = re.sub(r"[^\w\s]", " ", text)

    # Collapse whitespace; lower() is a no-op for Hebrew but normalizes any
    # embedded Latin text.
    text = " ".join(text.split()).strip().lower()

    return text
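
# Illustration (hypothetical strings): niqqud and punctuation are stripped and
# whitespace collapsed:
#   normalize_hebrew_text("שָׁלוֹם, עוֹלָם!")  ->  "שלום עולם"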


def calculate_wer_cer(reference, hypothesis):
    """Calculate WER and CER for Hebrew text."""
    try:
        ref_normalized = normalize_hebrew_text(reference)
        hyp_normalized = normalize_hebrew_text(hypothesis)

        # An empty reference or hypothesis makes the rates meaningless; return
        # infinity so these results sort last and display as FAILED.
        if not ref_normalized or not hyp_normalized:
            return float("inf"), float("inf"), ref_normalized, hyp_normalized

        word_error_rate = wer(ref_normalized, hyp_normalized)
        char_error_rate = cer(ref_normalized, hyp_normalized)

        return word_error_rate, char_error_rate, ref_normalized, hyp_normalized

    except Exception as e:
        print(f"Error calculating WER/CER: {e}")
        return float("inf"), float("inf"), "", ""
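
# Intuition check (hypothetical strings): one substituted word out of three
# gives WER = 1/3 on the normalized texts:
#   calculate_wer_cer("שלום עולם טוב", "שלום עולם רע")[0]  ->  ~0.333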


def initialize_whisper_model(model_id, progress=gr.Progress()):
    """Initialize a specific Whisper model with progress indication."""
    global WHISPER_MODELS, DEVICE

    try:
        if model_id in WHISPER_MODELS and WHISPER_MODELS[model_id] is not None:
            print(f"✅ Model {model_id} already loaded")
            return True

        if DEVICE is None:
            DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

        # float16 on GPU for speed; int8 quantization keeps CPU inference usable.
        compute_type = "float16" if torch.cuda.is_available() else "int8"

        print(f"🔧 Loading Whisper model: {model_id} on {DEVICE}")
        progress(0.3, desc=f"Loading {model_id}...")

        WHISPER_MODELS[model_id] = WhisperModel(
            model_id,
            device=DEVICE,
            compute_type=compute_type,
        )

        progress(1.0, desc=f"Loaded {model_id} successfully!")
        print(f"✅ Model {model_id} initialized successfully!")
        return True

    except Exception as e:
        print(f"❌ Error initializing model {model_id}: {str(e)}")
        WHISPER_MODELS[model_id] = None
        return False
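
# Usage sketch: the cache makes repeat loads cheap (assumes the weights can be
# fetched from the Hugging Face Hub):
#   initialize_whisper_model("small")   # downloads/loads once
#   initialize_whisper_model("small")   # cache hit, returns immediately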


def transcribe_audio_with_model(audio_file, model_id, language="he"):
    """Transcribe audio using a specific Whisper model."""
    try:
        # Lazy-load the model on first use.
        if model_id not in WHISPER_MODELS or WHISPER_MODELS[model_id] is None:
            success = initialize_whisper_model(model_id)
            if not success:
                return "", f"Failed to load model {model_id}"

        model = WHISPER_MODELS[model_id]

        print(f"🎤 Transcribing with {model_id}: {Path(audio_file).name}")

        # Deterministic decoding: temperature 0 with beam search, so WER
        # comparisons are stable across runs.
        segments, info = model.transcribe(
            audio_file,
            language=language,
            beam_size=5,
            best_of=5,
            temperature=0.0,
        )

        # faster-whisper yields segments lazily; consuming the generator runs
        # the actual decoding.
        transcript_text = " ".join(segment.text.strip() for segment in segments)

        print(f"✅ Transcription completed with {model_id}. Length: {len(transcript_text)} characters")
        return transcript_text, f"Success - Duration: {info.duration:.1f}s"

    except Exception as e:
        print(f"❌ Error transcribing with {model_id}: {str(e)}")
        return "", f"Error: {str(e)}"
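
# Example call (assumes heb.wav sits next to this script):
#   text, status = transcribe_audio_with_model("heb.wav", "ivrit-ai/faster-whisper-v2-d4")
#   -> text holds the Hebrew transcript; status reports the audio duration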


def evaluate_all_models(audio_file, reference_text, selected_models, progress=gr.Progress()):
    """Evaluate all selected models and calculate WER/CER."""
    if not audio_file or not reference_text.strip():
        return "❌ Please provide both audio file and reference transcription", []

    if not selected_models:
        return "❌ Please select at least one model to evaluate", []

    results = []

    print(f"🎯 Starting WER evaluation with {len(selected_models)} models...")

    for i, model_id in enumerate(selected_models):
        progress((i + 1) / len(selected_models), desc=f"Evaluating {model_id}...")
        print(f"\n🔄 Evaluating model: {model_id}")

        start_time = time.time()
        transcript, status = transcribe_audio_with_model(audio_file, model_id)
        transcription_time = time.time() - start_time

        if transcript:
            word_error_rate, char_error_rate, ref_norm, hyp_norm = calculate_wer_cer(
                reference_text, transcript
            )

            results.append({
                'model': model_id,
                'model_name': AVAILABLE_WHISPER_MODELS.get(model_id, model_id),
                'transcript': transcript,
                'wer': word_error_rate,
                'cer': char_error_rate,
                'time': transcription_time,
                'status': status,
                'ref_normalized': ref_norm,
                'hyp_normalized': hyp_norm,
            })

            print(f"✅ {model_id}: WER={word_error_rate:.3f}, CER={char_error_rate:.3f}")
        else:
            print(f"❌ {model_id}: Transcription failed")
            results.append({
                'model': model_id,
                'model_name': AVAILABLE_WHISPER_MODELS.get(model_id, model_id),
                'transcript': 'FAILED',
                'wer': float('inf'),
                'cer': float('inf'),
                'time': transcription_time,
                'status': status,
                'ref_normalized': '',
                'hyp_normalized': '',
            })

    # Best model first; failed runs (WER = inf) sort to the bottom.
    results.sort(key=lambda x: x['wer'])

    # Build the Markdown summary report.
    summary_report = "# 📊 WER Evaluation Results\n\n"
    summary_report += f"**Audio File:** {os.path.basename(audio_file)}\n"
    ref_preview = reference_text[:100] + ("..." if len(reference_text) > 100 else "")
    summary_report += f"**Reference Text:** {ref_preview}\n"
    summary_report += f"**Models Tested:** {len(selected_models)}\n"
    summary_report += f"**Device:** {DEVICE}\n\n"

    summary_report += "## Results Summary (sorted by WER)\n\n"
    for i, result in enumerate(results):
        if result['wer'] == float('inf'):
            wer_display = "FAILED"
            cer_display = "FAILED"
        else:
            wer_display = f"{result['wer']:.3f} ({result['wer'] * 100:.1f}%)"
            cer_display = f"{result['cer']:.3f} ({result['cer'] * 100:.1f}%)"

        summary_report += f"**{i + 1}. {result['model_name']}**\n"
        summary_report += f"- WER: {wer_display}\n"
        summary_report += f"- CER: {cer_display}\n"
        summary_report += f"- Processing Time: {result['time']:.2f}s\n\n"

    # Build the comparison table, with the ground truth as the first row.
    table_data = [["Ground Truth", reference_text, "N/A", "N/A"]]

    for result in results:
        if result['wer'] == float('inf'):
            wer_display = "FAILED"
            cer_display = "FAILED"
        else:
            wer_display = f"{result['wer']:.3f}"
            cer_display = f"{result['cer']:.3f}"

        table_data.append([
            result['model_name'],
            result['transcript'],
            wer_display,
            cer_display,
        ])

    print("✅ WER evaluation completed!")
    return summary_report, table_data
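
# Programmatic use outside the UI is possible too; a sketch (assumes heb.wav
# exists, and that this is invoked where the default gr.Progress is usable,
# e.g. from a Gradio event handler):
#   report, table = evaluate_all_models("heb.wav", DEFAULT_TRANSCRIPTION, ["small"])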


def create_gradio_interface():
    """Create and configure the Gradio interface."""
    global DEVICE
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    status_msg = f"""✅ Hebrew STT WER Evaluation Tool Ready!
🔧 Device: {DEVICE}
📱 Available Models: {len(AVAILABLE_WHISPER_MODELS)}
🎯 Purpose: Compare WER performance across Hebrew STT models"""

    with gr.Blocks(
        title="Hebrew STT WER Evaluation",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 1600px !important; }
        .evaluation-section {
            border: 2px solid #e0e0e0;
            border-radius: 10px;
            padding: 15px;
            margin: 10px 0;
        }
        """,
    ) as demo:

        gr.Markdown("""
        # 📊 Hebrew STT WER Evaluation Tool

        Upload an audio file and reference transcription to test the performance of different Whisper models on Hebrew speech-to-text tasks.
        """)

        with gr.Row():
            status_display = gr.Textbox(
                label="🔧 System Status",
                value=status_msg,
                interactive=False,
                lines=4,
            )

        with gr.Row():
            with gr.Column(scale=1, elem_classes=["evaluation-section"]):
                gr.Markdown("### 📁 Evaluation Inputs")

                predefined_audio_dropdown = gr.Dropdown(
                    label="🎵 Select Predefined Audio File",
                    choices=[(f"{k} - {v['description']}", k) for k, v in PREDEFINED_AUDIO_FILES.items()],
                    value=DEFAULT_AUDIO,
                    interactive=True,
                )

                gr.Markdown("**OR**")

                audio_input = gr.Audio(
                    label="🎵 Upload Custom Audio File - Upload a Hebrew audio file for transcription",
                    type="filepath",
                    value=None,
                )

                reference_text = gr.Textbox(
                    label="📝 Reference Transcription (Ground Truth) - The correct transcription for WER calculation",
                    placeholder="Enter the correct transcription of the audio file...",
                    value=DEFAULT_TRANSCRIPTION,
                    lines=5,
                )

                model_selection = gr.CheckboxGroup(
                    label="🤖 Select Models to Test - Choose which models to evaluate (2-4 recommended)",
                    choices=list(AVAILABLE_WHISPER_MODELS.keys()),
                    value=["ivrit-ai/faster-whisper-v2-d4", "large-v3"],
                )

                with gr.Row():
                    load_models_btn = gr.Button(
                        "🔧 Pre-load Selected Models (Optional)",
                        variant="secondary",
                    )

                    evaluate_btn = gr.Button(
                        "🎯 Run WER Evaluation",
                        variant="primary",
                    )

            with gr.Column(scale=1, elem_classes=["evaluation-section"]):
                gr.Markdown("### 📖 How WER Evaluation Works")

                gr.Markdown("""
                **What is WER?**
                Word Error Rate - measures transcription accuracy at the word level

                **How it works:**
                1. Upload a Hebrew audio file
                2. Enter the correct transcription
                3. Select models to test
                4. The tool transcribes with each model
                5. Calculates WER & CER for each model
                6. Ranks models by performance

                **Evaluation Metrics:**
                - **WER**: Word-level errors (%)
                - **CER**: Character-level errors (%)
                - **Processing Time**: Transcription speed

                **Tips:**
                - Use high-quality audio
                - Ensure the reference transcription is accurate
                - Select 2-4 models for comparison
                - Lower WER = better performance
                """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📊 WER Evaluation Results")

                results_output = gr.Markdown(
                    value="Evaluation results will appear here after running the test..."
                )

                results_table = gr.Dataframe(
                    label="Transcription Comparison",
                    headers=["Model", "Transcription", "WER", "CER"],
                    datatype=["str", "str", "str", "str"],
                    col_count=(4, "fixed"),
                )

        # Event handlers

        def load_predefined_audio(selected_file):
            """Load a predefined audio file and its reference transcription."""
            if selected_file and selected_file in PREDEFINED_AUDIO_FILES:
                audio_data = PREDEFINED_AUDIO_FILES[selected_file]
                return audio_data["file"], audio_data["transcription"]
            return None, DEFAULT_TRANSCRIPTION

        def load_selected_models(selected_models, progress=gr.Progress()):
            """Pre-load the selected models."""
            if not selected_models:
                return "❌ No models selected"

            status_msg = f"🔧 Loading {len(selected_models)} models...\n\n"

            for model_id in selected_models:
                try:
                    status_msg += f"⏳ Loading {model_id}...\n"
                    success = initialize_whisper_model(model_id, progress)
                    if success:
                        status_msg += f"✅ {model_id} loaded successfully\n"
                    else:
                        status_msg += f"❌ Error loading {model_id}\n"
                    status_msg += "\n"
                except Exception as e:
                    status_msg += f"❌ Error loading {model_id}: {str(e)}\n\n"

            loaded_count = len([m for m in selected_models if m in WHISPER_MODELS and WHISPER_MODELS[m] is not None])
            status_msg += f"✅ Model loading complete! Available: {loaded_count}/{len(selected_models)}"
            return status_msg

        def run_wer_evaluation(audio_file, reference, selected_models, predefined_file, progress=gr.Progress()):
            """Run the complete WER evaluation."""
            # Fall back to the selected predefined file when no custom upload is given.
            if not audio_file and predefined_file:
                audio_file = PREDEFINED_AUDIO_FILES[predefined_file]["file"]

            if not audio_file:
                return "❌ Please select a predefined audio file or upload a custom one", []

            if not reference or not reference.strip():
                return "❌ Please enter reference transcription", []

            if not selected_models:
                return "❌ Please select at least one model", []

            results, table_data = evaluate_all_models(audio_file, reference, selected_models, progress)
            return results, table_data

        predefined_audio_dropdown.change(
            fn=load_predefined_audio,
            inputs=[predefined_audio_dropdown],
            outputs=[audio_input, reference_text],
        )

        load_models_btn.click(
            fn=load_selected_models,
            inputs=[model_selection],
            outputs=[status_display],
        )

        evaluate_btn.click(
            fn=run_wer_evaluation,
            inputs=[audio_input, reference_text, model_selection, predefined_audio_dropdown],
            outputs=[results_output, results_table],
        )

        gr.Markdown("""
        ---
        ### 🔧 Technical Information
        - **STT Engine**: Faster-Whisper (CTranslate2 backend), with Hebrew-tuned ivrit-ai models
        - **Evaluation Metrics**: WER (Word Error Rate) and CER (Character Error Rate)
        - **Text Normalization**: Removes diacritics, punctuation, and extra whitespace
        - **Purpose**: Compare the performance of different transcription models on Hebrew speech

        ### 📦 Setup Instructions
        ```bash
        # Install dependencies
        pip install gradio faster-whisper torch torchaudio jiwer

        # For GPU support (recommended)
        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
        ```

        ### 📊 Output Format
        The tool displays:
        - Model ranking by WER
        - Detailed results for each model
        - Processing times
        - Normalized transcription comparison
        """)

    return demo


if __name__ == "__main__":
    print("🎯 Launching Hebrew STT WER Evaluation Tool...")
    demo = create_gradio_interface()

    demo.launch(
        share=False,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )