Spaces:
Sleeping
Sleeping
File size: 6,067 Bytes
b79357c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
"""
Gradio Demo for Whisper German ASR - HuggingFace Space
Interactive web interface for audio transcription
"""
import gradio as gr
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa
import numpy as np
import logging
# Configure logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global variables
# All three start as None and are assigned by load_model().
# transcribe_audio() treats `model is None` as "model not loaded yet".
model = None  # Whisper model instance
processor = None  # matching WhisperProcessor (feature extraction + decoding)
device = None  # "cuda" or "cpu", chosen in load_model()
def load_model(model_name="openai/whisper-small"):
    """Load a Whisper checkpoint and publish it through the module globals.

    The processor and model are fetched with ``from_pretrained``, the model is
    conditioned for German transcription, moved to CUDA when available
    (otherwise CPU) and switched to eval mode.  The ``model``, ``processor``
    and ``device`` globals are assigned only after every step has succeeded,
    so a failed load never leaves a half-configured model (or a processor
    that does not match the model) visible to ``transcribe_audio``.

    Args:
        model_name: HuggingFace model ID (e.g., 'openai/whisper-small' or
            'YOUR_USERNAME/whisper-small-german').

    Returns:
        A status string naming the device the model was loaded on.

    Raises:
        Exception: Whatever the failing load step raised; it is logged with
            its traceback and re-raised unchanged.
    """
    global model, processor, device

    logger.info(f"Loading model from HuggingFace Hub: {model_name}")
    try:
        # Work on locals first; the globals are only touched on full success.
        loaded_processor = WhisperProcessor.from_pretrained(model_name)
        loaded_model = WhisperForConditionalGeneration.from_pretrained(model_name)

        # Set German language conditioning
        loaded_model.config.forced_decoder_ids = loaded_processor.get_decoder_prompt_ids(
            language="german",
            task="transcribe",
        )

        target_device = "cuda" if torch.cuda.is_available() else "cpu"
        loaded_model = loaded_model.to(target_device)
        loaded_model.eval()
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Failed to load model: {e}")
        raise

    # Publish the fully configured objects in one step.
    processor, model, device = loaded_processor, loaded_model, target_device

    logger.info(f"β Model loaded successfully on {device}")
    return f"Model loaded successfully on {device}"
def transcribe_audio(audio_input):
    """Transcribe audio from a file upload or microphone recording.

    Args:
        audio_input: ``None``, a ``(sample_rate, samples)`` tuple (the form
            this app's audio component is configured to deliver), or any
            other value, which is handed to ``librosa.load`` (e.g. a file
            path).

    Returns:
        A Markdown string containing the transcription and the clip
        duration in seconds, or a message explaining why nothing was
        transcribed.  Errors are reported in the returned string rather
        than raised, so the UI always gets something to display.
    """
    if model is None or processor is None:
        return "β Error: Model not loaded. Please wait for model to load."

    # Handle different input formats
    if audio_input is None:
        return "β No audio provided. Please upload an audio file or record using the microphone."

    try:
        if isinstance(audio_input, tuple):
            # audio_input is a tuple (sample_rate, audio_data) from gradio
            sr, audio = audio_input
            audio = np.asarray(audio)

            if np.issubdtype(audio.dtype, np.signedinteger):
                # Scale signed PCM to [-1, 1): int16 -> /32768, int32 -> /2147483648.
                full_scale = float(abs(int(np.iinfo(audio.dtype).min)))
                audio = audio.astype(np.float32) / full_scale
            elif audio.dtype != np.float32:
                audio = audio.astype(np.float32)
        else:
            # File path
            audio, sr = librosa.load(audio_input, sr=16000, mono=True)

        # Ensure mono BEFORE resampling: multi-channel input is laid out as
        # (samples, channels) and librosa.resample works on the last axis,
        # so resampling first would resample across channels, not time.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        if audio.size == 0:
            return "β No audio provided. Please upload an audio file or record using the microphone."

        # Resample if needed
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        duration = len(audio) / 16000

        # Process audio
        input_features = processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt",
        ).input_features.to(device)

        # Generate transcription
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                max_length=448,
                num_beams=5,
                early_stopping=True,
            )

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        logger.info(f"Transcribed {duration:.2f}s audio: {transcription[:50]}...")
        return f"π€ **Transcription:**\n\n{transcription}\n\nπ **Duration:** {duration:.2f} seconds"
    except Exception as e:
        # Keep the traceback in the log; the user only sees the message.
        logger.exception(f"Transcription error: {e}")
        return f"β Error: {str(e)}"
# Startup: load the checkpoint once at import time so the UI is ready to use.
# IMPORTANT: 'openai/whisper-small' is a placeholder; point MODEL_ID at your
# fine-tuned model (e.g., 'saadmannan/whisper-small-german') once it has been
# uploaded to the HF Hub.
MODEL_ID = "openai/whisper-small"  # Change this to your model ID

try:
    load_model(MODEL_ID)
except Exception as startup_error:
    # A failed load must not stop the app from starting; transcribe_audio()
    # reports "Model not loaded" while `model` is still None.
    logger.error(f"Failed to load model: {startup_error}")
    logger.info("Model will need to be loaded manually")
# Build the Gradio UI: intro text, an input/output row, and an "about" section.
with gr.Blocks(theme=gr.themes.Soft(), title="Whisper German ASR") as demo:
    # Page header with usage instructions.
    gr.Markdown(
        value="""
# ποΈ Whisper German ASR
Fine-tuned Whisper model for German speech recognition.
**How to use:**
1. Upload an audio file (WAV, MP3, FLAC, etc.) or record using your microphone
2. Click the "Transcribe" button
3. Wait for the transcription to appear
**Features:**
- Supports multiple audio formats
- Microphone recording
- Optimized for German language
**Model:** Whisper-small fine-tuned on German MINDS14 dataset
"""
    )

    with gr.Row():
        # First column: audio source plus the button that starts transcription.
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Audio or Record",
                type="numpy",
                sources=["upload", "microphone"],
            )
            transcribe_btn = gr.Button(
                value="π― Transcribe",
                size="lg",
                variant="primary",
            )

        # Second column: the transcription rendered as Markdown.
        with gr.Column():
            output_text = gr.Markdown(label="Transcription Result")

    # Clicking the button sends the audio to transcribe_audio() and shows
    # the returned Markdown in the result panel.
    transcribe_btn.click(
        transcribe_audio,
        inputs=audio_input,
        outputs=output_text,
    )

    # Footer with model details, usage tips and links.
    gr.Markdown(
        value="""
---
## π About This Model
This is a fine-tuned version of OpenAI's Whisper-small model,
specifically optimized for German speech recognition.
### Performance
- **Word Error Rate (WER):** ~13%
- **Sample Rate:** 16kHz
- **Max Duration:** 30 seconds
- **Language:** German (de)
### Tips for Best Results
- Speak clearly and at a moderate pace
- Minimize background noise
- Audio should be in German language
- Best results with 1-30 second clips
### Links
- [GitHub Repository](https://github.com/YOUR_USERNAME/whisper-german-asr)
- [Model Card](https://huggingface.co/YOUR_USERNAME/whisper-small-german)
"""
    )
# Launch the app
# Only start the Gradio server when this file is run as a script;
# importing the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()
|