File size: 6,067 Bytes
b79357c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
"""
Gradio Demo for Whisper German ASR - HuggingFace Space
Interactive web interface for audio transcription
"""

import gradio as gr
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa
import numpy as np
import logging

# Configure root logging once at import time; per-module logger follows
# the stdlib convention of logging.getLogger(__name__).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global variables
# Populated exactly once by load_model() at startup; read (never written)
# by transcribe_audio() on every request.
model = None       # WhisperForConditionalGeneration, or None until loaded
processor = None   # WhisperProcessor (feature extractor + tokenizer)
device = None      # "cuda" when a GPU is available, otherwise "cpu"


def load_model(model_name="openai/whisper-small", language="german", task="transcribe"):
    """Load a Whisper model and processor from the HuggingFace Hub.

    Populates the module-level ``model``, ``processor`` and ``device``
    globals consumed by ``transcribe_audio``.

    Args:
        model_name: HuggingFace model ID (e.g., 'openai/whisper-small' or
            'YOUR_USERNAME/whisper-small-german').
        language: Language the decoder is conditioned on (default keeps the
            original German behavior).
        task: Whisper decoding task, 'transcribe' or 'translate'.

    Returns:
        A human-readable status string naming the device the model was
        placed on.

    Raises:
        Exception: re-raises whatever the Hub download / model load raised,
            after logging it.
    """
    global model, processor, device

    logger.info(f"Loading model from HuggingFace Hub: {model_name}")

    try:
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)

        # Condition the decoder on a fixed language/task so short or noisy
        # clips are not auto-detected as some other language.
        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
            language=language,
            task=task
        )

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        model.eval()  # inference mode: disables dropout etc.

        logger.info(f"βœ“ Model loaded successfully on {device}")
        return f"Model loaded successfully on {device}"
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise


def transcribe_audio(audio_input):
    """Transcribe German speech from a file upload or microphone recording.

    Args:
        audio_input: Either a ``(sample_rate, numpy_array)`` tuple as
            produced by a gradio Audio component with ``type="numpy"``,
            a path to an audio file, or None.

    Returns:
        A markdown-formatted string with the transcription and clip
        duration, or an error message string (never raises to the UI).
    """
    if model is None:
        return "❌ Error: Model not loaded. Please wait for model to load."

    try:
        # Handle different input formats
        if audio_input is None:
            return "❌ No audio provided. Please upload an audio file or record using the microphone."

        # audio_input is a tuple (sample_rate, audio_data) from gradio
        if isinstance(audio_input, tuple):
            sr, audio = audio_input
            # Gradio delivers raw PCM integers; scale to float32 in [-1, 1).
            if audio.dtype == np.int16:
                audio = audio.astype(np.float32) / 32768.0
            elif audio.dtype == np.int32:
                audio = audio.astype(np.float32) / 2147483648.0
        else:
            # File path: librosa loads, resamples to 16 kHz and downmixes.
            audio, sr = librosa.load(audio_input, sr=16000, mono=True)

        # Downmix to mono BEFORE resampling. Gradio stereo arrays are shaped
        # (samples, channels), and librosa.resample operates along the last
        # axis by default — resampling first would "resample" across the
        # 2-element channel axis and mangle the audio.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Resample if needed (Whisper expects 16 kHz input)
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        duration = len(audio) / 16000

        # Convert the waveform to log-mel input features on the model device
        input_features = processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(device)

        # Generate transcription (beam search for quality over speed)
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                max_length=448,
                num_beams=5,
                early_stopping=True
            )

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        logger.info(f"Transcribed {duration:.2f}s audio: {transcription[:50]}...")

        return f"🎀 **Transcription:**\n\n{transcription}\n\nπŸ“Š **Duration:** {duration:.2f} seconds"

    except Exception as e:
        # Surface the failure in the UI instead of crashing the request.
        logger.error(f"Transcription error: {e}")
        return f"❌ Error: {str(e)}"


# Load model on startup
# IMPORTANT: Replace 'openai/whisper-small' with your fine-tuned model ID
# e.g., 'saadmannan/whisper-small-german' after you upload your model to HF Hub
MODEL_ID = "openai/whisper-small"  # Change this to your model ID

# Startup failures are logged rather than raised so the Space UI still
# comes up; transcribe_audio() reports "Model not loaded" until fixed.
try:
    load_model(MODEL_ID)
except Exception as e:
    logger.error(f"Failed to load model: {e}")
    logger.info("Model will need to be loaded manually")


# Create Gradio interface
with gr.Blocks(title="Whisper German ASR", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # πŸŽ™οΈ Whisper German ASR
        
        Fine-tuned Whisper model for German speech recognition.
        
        **How to use:**
        1. Upload an audio file (WAV, MP3, FLAC, etc.) or record using your microphone
        2. Click the "Transcribe" button
        3. Wait for the transcription to appear
        
        **Features:**
        - Supports multiple audio formats
        - Microphone recording
        - Optimized for German language
        
        **Model:** Whisper-small fine-tuned on German MINDS14 dataset
        """
    )
    
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="numpy",
                label="Upload Audio or Record"
            )
            transcribe_btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")
        
        with gr.Column():
            output_text = gr.Markdown(label="Transcription Result")
    
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=output_text
    )
    
    gr.Markdown(
        """
        ---
        ## πŸ“‹ About This Model
        
        This is a fine-tuned version of OpenAI's Whisper-small model, 
        specifically optimized for German speech recognition.
        
        ### Performance
        - **Word Error Rate (WER):** ~13%
        - **Sample Rate:** 16kHz
        - **Max Duration:** 30 seconds
        - **Language:** German (de)
        
        ### Tips for Best Results
        - Speak clearly and at a moderate pace
        - Minimize background noise
        - Audio should be in German language
        - Best results with 1-30 second clips
        
        ### Links
        - [GitHub Repository](https://github.com/YOUR_USERNAME/whisper-german-asr)
        - [Model Card](https://huggingface.co/YOUR_USERNAME/whisper-small-german)
        """
    )


# Launch the app
# Only when run directly; on HuggingFace Spaces the platform imports `demo`.
if __name__ == "__main__":
    demo.launch()