File size: 6,067 Bytes
b79357c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
"""
Gradio Demo for Whisper German ASR - HuggingFace Space
Interactive web interface for audio transcription
"""

import gradio as gr
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa
import numpy as np
import logging

# Configure root logging once at import time; per-module logger follows
# the stdlib convention of logging.getLogger(__name__).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global variables
# Populated exactly once by load_model() at startup; read (never written)
# by transcribe_audio() on every request.
model = None       # WhisperForConditionalGeneration, or None until loaded
processor = None   # WhisperProcessor (feature extractor + tokenizer)
device = None      # "cuda" when a GPU is available, otherwise "cpu"


def load_model(model_name="openai/whisper-small", language="german", task="transcribe"):
    """Load a Whisper model and processor from the HuggingFace Hub.

    Populates the module-level ``model``, ``processor`` and ``device``
    globals consumed by ``transcribe_audio``.

    Args:
        model_name: HuggingFace model ID (e.g., 'openai/whisper-small' or
            'YOUR_USERNAME/whisper-small-german').
        language: Language the decoder is conditioned on (default keeps the
            original German behavior).
        task: Whisper decoding task, 'transcribe' or 'translate'.

    Returns:
        A human-readable status string naming the device the model was
        placed on.

    Raises:
        Exception: re-raises whatever the Hub download / model load raised,
            after logging it.
    """
    global model, processor, device

    logger.info(f"Loading model from HuggingFace Hub: {model_name}")

    try:
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)

        # Condition the decoder on a fixed language/task so short or noisy
        # clips are not auto-detected as some other language.
        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
            language=language,
            task=task
        )

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        model.eval()  # inference mode: disables dropout etc.

        logger.info(f"βœ“ Model loaded successfully on {device}")
        return f"Model loaded successfully on {device}"
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise


def transcribe_audio(audio_input):
    """Transcribe German speech from a file upload or microphone recording.

    Args:
        audio_input: Either a ``(sample_rate, numpy_array)`` tuple as
            produced by a gradio Audio component with ``type="numpy"``,
            a path to an audio file, or None.

    Returns:
        A markdown-formatted string with the transcription and clip
        duration, or an error message string (never raises to the UI).
    """
    if model is None:
        return "❌ Error: Model not loaded. Please wait for model to load."

    try:
        # Handle different input formats
        if audio_input is None:
            return "❌ No audio provided. Please upload an audio file or record using the microphone."

        # audio_input is a tuple (sample_rate, audio_data) from gradio
        if isinstance(audio_input, tuple):
            sr, audio = audio_input
            # Gradio delivers raw PCM integers; scale to float32 in [-1, 1).
            if audio.dtype == np.int16:
                audio = audio.astype(np.float32) / 32768.0
            elif audio.dtype == np.int32:
                audio = audio.astype(np.float32) / 2147483648.0
        else:
            # File path: librosa loads, resamples to 16 kHz and downmixes.
            audio, sr = librosa.load(audio_input, sr=16000, mono=True)

        # Downmix to mono BEFORE resampling. Gradio stereo arrays are shaped
        # (samples, channels), and librosa.resample operates along the last
        # axis by default — resampling first would "resample" across the
        # 2-element channel axis and mangle the audio.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Resample if needed (Whisper expects 16 kHz input)
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        duration = len(audio) / 16000

        # Convert the waveform to log-mel input features on the model device
        input_features = processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(device)

        # Generate transcription (beam search for quality over speed)
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                max_length=448,
                num_beams=5,
                early_stopping=True
            )

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        logger.info(f"Transcribed {duration:.2f}s audio: {transcription[:50]}...")

        return f"🎀 **Transcription:**\n\n{transcription}\n\nπŸ“Š **Duration:** {duration:.2f} seconds"

    except Exception as e:
        # Surface the failure in the UI instead of crashing the request.
        logger.error(f"Transcription error: {e}")
        return f"❌ Error: {str(e)}"


# Load model on startup
# IMPORTANT: Replace 'openai/whisper-small' with your fine-tuned model ID
# e.g., 'saadmannan/whisper-small-german' after you upload your model to HF Hub
MODEL_ID = "openai/whisper-small"  # Change this to your model ID

# Startup failures are logged rather than raised so the Space UI still
# comes up; transcribe_audio() reports "Model not loaded" until fixed.
try:
    load_model(MODEL_ID)
except Exception as e:
    logger.error(f"Failed to load model: {e}")
    logger.info("Model will need to be loaded manually")


# Create Gradio interface
with gr.Blocks(title="Whisper German ASR", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # πŸŽ™οΈ Whisper German ASR
        
        Fine-tuned Whisper model for German speech recognition.
        
        **How to use:**
        1. Upload an audio file (WAV, MP3, FLAC, etc.) or record using your microphone
        2. Click the "Transcribe" button
        3. Wait for the transcription to appear
        
        **Features:**
        - Supports multiple audio formats
        - Microphone recording
        - Optimized for German language
        
        **Model:** Whisper-small fine-tuned on German MINDS14 dataset
        """
    )
    
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="numpy",
                label="Upload Audio or Record"
            )
            transcribe_btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")
        
        with gr.Column():
            output_text = gr.Markdown(label="Transcription Result")
    
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=output_text
    )
    
    gr.Markdown(
        """
        ---
        ## πŸ“‹ About This Model
        
        This is a fine-tuned version of OpenAI's Whisper-small model, 
        specifically optimized for German speech recognition.
        
        ### Performance
        - **Word Error Rate (WER):** ~13%
        - **Sample Rate:** 16kHz
        - **Max Duration:** 30 seconds
        - **Language:** German (de)
        
        ### Tips for Best Results
        - Speak clearly and at a moderate pace
        - Minimize background noise
        - Audio should be in German language
        - Best results with 1-30 second clips
        
        ### Links
        - [GitHub Repository](https://github.com/YOUR_USERNAME/whisper-german-asr)
        - [Model Card](https://huggingface.co/YOUR_USERNAME/whisper-small-german)
        """
    )


# Launch the app
# Only when run directly; on HuggingFace Spaces the platform imports `demo`.
if __name__ == "__main__":
    demo.launch()