# ASR-finetuning / app.py
# Uploaded by saadmannan (commit b79357c: "app file reviewed")
"""
Gradio Demo for Whisper German ASR - HuggingFace Space
Interactive web interface for audio transcription
"""
import logging
import os

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
# Route INFO-and-above records through the root logger and create the
# logger used by this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared inference state. All three names stay None until load_model()
# assigns them.
model = processor = device = None
def load_model(model_name="openai/whisper-small"):
    """Load a Whisper checkpoint from the HuggingFace Hub into the module globals.

    The processor and model are built in local variables and are only
    published to the ``model`` / ``processor`` / ``device`` globals once every
    step has succeeded. A failure part-way through (download error, device
    transfer error, ...) therefore never leaves the module looking "loaded"
    with a half-initialised model.

    Args:
        model_name: HuggingFace model ID (e.g., 'openai/whisper-small' or
            'YOUR_USERNAME/whisper-small-german').

    Returns:
        A status string naming the device the model was placed on.

    Raises:
        Exception: Whatever the loading steps raised. The error is logged
            together with its traceback and then re-raised unchanged.
    """
    global model, processor, device

    logger.info(f"Loading model from HuggingFace Hub: {model_name}")
    try:
        loaded_processor = WhisperProcessor.from_pretrained(model_name)
        loaded_model = WhisperForConditionalGeneration.from_pretrained(model_name)

        # Set German language conditioning
        loaded_model.config.forced_decoder_ids = loaded_processor.get_decoder_prompt_ids(
            language="german",
            task="transcribe",
        )

        target_device = "cuda" if torch.cuda.is_available() else "cpu"
        loaded_model = loaded_model.to(target_device)
        loaded_model.eval()
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error would drop.
        logger.exception(f"Failed to load model: {e}")
        raise

    # Publish only after everything above succeeded.
    processor, model, device = loaded_processor, loaded_model, target_device

    logger.info(f"✓ Model loaded successfully on {device}")
    return f"Model loaded successfully on {device}"
_TARGET_SAMPLE_RATE = 16000  # sampling rate passed to the Whisper processor


def _prepare_audio(audio_input):
    """Turn a Gradio audio value into mono floating-point samples at 16 kHz."""
    if not isinstance(audio_input, tuple):
        # Anything that is not a (sample_rate, samples) pair is treated as a
        # file path; librosa resamples and downmixes while loading.
        audio, _ = librosa.load(audio_input, sr=_TARGET_SAMPLE_RATE, mono=True)
        return audio

    sr, audio = audio_input
    audio = np.asarray(audio)

    # Normalise integer PCM of any width to floats in [-1, 1).
    if np.issubdtype(audio.dtype, np.integer):
        info = np.iinfo(audio.dtype)
        if info.min < 0:
            # Signed PCM: divide by the magnitude of the most negative value
            # (32768 for int16, 2147483648 for int32).
            audio = audio.astype(np.float32) / float(-info.min)
        else:
            # Unsigned PCM is centred on the midpoint of its range.
            midpoint = (info.max + 1) / 2.0
            audio = (audio.astype(np.float32) - midpoint) / midpoint
    else:
        audio = audio.astype(np.float32, copy=False)

    # Downmix BEFORE resampling: multi-channel data is laid out as
    # (samples, channels) and librosa.resample works on the last axis, so
    # resampling first would resample across channels instead of over time.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    if sr != _TARGET_SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=_TARGET_SAMPLE_RATE)

    return audio


def transcribe_audio(audio_input):
    """Transcribe audio from file upload or microphone.

    Args:
        audio_input: Either a ``(sample_rate, samples)`` tuple as produced by
            a Gradio audio component in numpy mode, a file path, or ``None``
            when nothing was provided.

    Returns:
        A Markdown string: the transcription plus the clip duration on
        success, or a message starting with "❌" when the model is not
        loaded, no audio was supplied, or processing failed. Errors are
        reported in the returned text rather than raised, so the UI always
        has something to display.
    """
    if model is None or processor is None:
        return "❌ Error: Model not loaded. Please wait for model to load."

    if audio_input is None:
        return "❌ No audio provided. Please upload an audio file or record using the microphone."

    try:
        audio = _prepare_audio(audio_input)

        # A zero-length recording has nothing to transcribe.
        if audio.size == 0:
            return "❌ No audio provided. Please upload an audio file or record using the microphone."

        duration = len(audio) / _TARGET_SAMPLE_RATE

        # Process audio
        input_features = processor(
            audio,
            sampling_rate=_TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features.to(device)

        # Generate transcription
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                max_length=448,
                num_beams=5,
                early_stopping=True,
            )

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        logger.info(f"Transcribed {duration:.2f}s audio: {transcription[:50]}...")
        return f"🎤 **Transcription:**\n\n{transcription}\n\n📊 **Duration:** {duration:.2f} seconds"
    except Exception as e:
        # UI boundary: log the full traceback, show a short message to the user.
        logger.exception(f"Transcription error: {e}")
        return f"❌ Error: {str(e)}"
# Load model on startup.
# The checkpoint defaults to the base 'openai/whisper-small'. To serve a
# fine-tuned model (e.g. 'saadmannan/whisper-small-german' after it has been
# uploaded to the HF Hub), set the MODEL_ID environment variable or change the
# default below. An empty MODEL_ID falls back to the default.
MODEL_ID = os.environ.get("MODEL_ID") or "openai/whisper-small"

try:
    load_model(MODEL_ID)
except Exception as e:
    # Best effort: keep the module importable so the UI can still start;
    # transcribe_audio() answers with its "Model not loaded" message while
    # the model global is still None.
    logger.exception(f"Failed to load model: {e}")
    logger.info("Model will need to be loaded manually")
# Create Gradio interface: an audio input with a button on the left, the
# Markdown result on the right, framed by two static Markdown sections.
with gr.Blocks(title="Whisper German ASR", theme=gr.themes.Soft()) as demo:
    # Blank lines separate paragraphs from lists so that "Features" and
    # "Model" are not rendered as continuations of the preceding list items.
    gr.Markdown(
        """
        # 🎙️ Whisper German ASR

        Fine-tuned Whisper model for German speech recognition.

        **How to use:**

        1. Upload an audio file (WAV, MP3, FLAC, etc.) or record using your microphone
        2. Click the "Transcribe" button
        3. Wait for the transcription to appear

        **Features:**

        - Supports multiple audio formats
        - Microphone recording
        - Optimized for German language

        **Model:** Whisper-small fine-tuned on German MINDS14 dataset
        """
    )

    with gr.Row():
        with gr.Column():
            # type="numpy" hands transcribe_audio a (sample_rate, samples) tuple.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="numpy",
                label="Upload Audio or Record",
            )
            transcribe_btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")

        with gr.Column():
            output_text = gr.Markdown(label="Transcription Result")

    # Run the transcription when the button is clicked and show its Markdown result.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=output_text,
    )

    gr.Markdown(
        """
        ---

        ## 📋 About This Model

        This is a fine-tuned version of OpenAI's Whisper-small model,
        specifically optimized for German speech recognition.

        ### Performance

        - **Word Error Rate (WER):** ~13%
        - **Sample Rate:** 16kHz
        - **Max Duration:** 30 seconds
        - **Language:** German (de)

        ### Tips for Best Results

        - Speak clearly and at a moderate pace
        - Minimize background noise
        - Audio should be in German language
        - Best results with 1-30 second clips

        ### Links

        - [GitHub Repository](https://github.com/YOUR_USERNAME/whisper-german-asr)
        - [Model Card](https://huggingface.co/YOUR_USERNAME/whisper-small-german)
        """
    )
# Launch the app only when this file is run as a script; importing the module
# builds `demo` but does not start it.
if __name__ == "__main__":
    demo.launch()