Spaces:

saadmannan
/

ASR-finetuning

Sleeping

App Files Files Community

saadmannan committed on Nov 8, 2025

Commit

b79357c

1 Parent(s): 5554ef1

app file reviewed

Browse files

Files changed (3) hide show

README.md +66 -6
app.py +193 -0
requirements.txt +3 -22

README.md CHANGED Viewed

@@ -1,12 +1,72 @@
 ---
-title: ASR Finetuning
-emoji: 🌖
-colorFrom: red
-colorTo: green
 sdk: gradio
-sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Whisper German ASR
+emoji: 🎙️
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.0.0
 app_file: app.py
 pinned: false
+license: mit
 ---
+# 🎙️ Whisper German ASR
+Fine-tuned Whisper model for German Automatic Speech Recognition (ASR).
+## Description
+This Space provides an interactive interface for transcribing German audio using a fine-tuned version of OpenAI's Whisper-small model. The model has been specifically optimized for German speech recognition.
+## How to Use
+1. **Upload Audio**: Click on the audio input area to upload an audio file (WAV, MP3, FLAC, etc.)
+   - OR -
+2. **Record Audio**: Use the microphone button to record audio directly
+3. **Transcribe**: Click the "Transcribe" button to generate the transcription
+4. **View Results**: The transcription will appear on the right side
+## Model Details
+- **Base Model**: OpenAI Whisper-small (242M parameters)
+- **Fine-tuned on**: German MINDS14 dataset
+- **Language**: German (de)
+- **Task**: Transcription
+- **Performance**: ~13% Word Error Rate (WER)
+## Features
+- ✅ Upload audio files in various formats
+- ✅ Record audio directly from microphone
+- ✅ Real-time transcription
+- ✅ Optimized for German language
+- ✅ Support for audio up to 30 seconds
+## Technical Specifications
+- **Sample Rate**: 16kHz
+- **Max Duration**: 30 seconds
+- **Beam Search**: 5 beams
+- **Device**: CPU/GPU auto-detection
+## Tips for Best Results
+- Speak clearly and at a moderate pace
+- Minimize background noise
+- Ensure the audio is in German
+- Keep audio clips between 1-30 seconds for optimal results
+## Links
+- [GitHub Repository](https://github.com/YOUR_USERNAME/whisper-german-asr)
+- [Model Card](https://huggingface.co/YOUR_USERNAME/whisper-small-german)
+## License
+MIT License
+## Acknowledgments
+- [OpenAI Whisper](https://github.com/openai/whisper) for the base model
+- [Hugging Face](https://huggingface.co/) for Transformers library
+- [PolyAI](https://huggingface.co/datasets/PolyAI/minds14) for the MINDS14 dataset

app.py ADDED Viewed

	@@ -0,0 +1,193 @@

+"""
+Gradio Demo for Whisper German ASR - HuggingFace Space
+Interactive web interface for audio transcription
+"""
+import gradio as gr
+import torch
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+import librosa
+import numpy as np
+import logging
+# Configure logging at INFO level and create this module's logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Global variables
+# Shared inference state: all three start as None, are assigned by
+# load_model(), and are read by transcribe_audio().
+model = None
+processor = None
+device = None
+def load_model(model_name="openai/whisper-small"):
+    """Load the Whisper model from HuggingFace Hub
+    Args:
+        model_name: HuggingFace model ID (e.g., 'openai/whisper-small' or 'YOUR_USERNAME/whisper-small-german')
+    """
+    global model, processor, device
+    logger.info(f"Loading model from HuggingFace Hub: {model_name}")
+    try:
+        processor = WhisperProcessor.from_pretrained(model_name)
+        model = WhisperForConditionalGeneration.from_pretrained(model_name)
+        # Set German language conditioning
+        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
+            language="german",
+            task="transcribe"
+        )
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = model.to(device)
+        model.eval()
+        logger.info(f"✓ Model loaded successfully on {device}")
+        return f"Model loaded successfully on {device}"
+    except Exception as e:
+        logger.error(f"Failed to load model: {e}")
+        raise
+def transcribe_audio(audio_input):
+    """Transcribe audio from file upload or microphone"""
+    if model is None:
+        return "❌ Error: Model not loaded. Please wait for model to load."
+    try:
+        # Handle different input formats
+        if audio_input is None:
+            return "❌ No audio provided. Please upload an audio file or record using the microphone."
+        # audio_input is a tuple (sample_rate, audio_data) from gradio
+        if isinstance(audio_input, tuple):
+            sr, audio = audio_input
+            # Convert to float32 and normalize
+            if audio.dtype == np.int16:
+                audio = audio.astype(np.float32) / 32768.0
+            elif audio.dtype == np.int32:
+                audio = audio.astype(np.float32) / 2147483648.0
+        else:
+            # File path
+            audio, sr = librosa.load(audio_input, sr=16000, mono=True)
+        # Resample if needed
+        if sr != 16000:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+        # Ensure mono
+        if len(audio.shape) > 1:
+            audio = audio.mean(axis=1)
+        duration = len(audio) / 16000
+        # Process audio
+        input_features = processor(
+            audio,
+            sampling_rate=16000,
+            return_tensors="pt"
+        ).input_features.to(device)
+        # Generate transcription
+        with torch.no_grad():
+            predicted_ids = model.generate(
+                input_features,
+                max_length=448,
+                num_beams=5,
+                early_stopping=True
+            )
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        logger.info(f"Transcribed {duration:.2f}s audio: {transcription[:50]}...")
+        return f"🎤 **Transcription:**\n\n{transcription}\n\n📊 **Duration:** {duration:.2f} seconds"
+    except Exception as e:
+        logger.error(f"Transcription error: {e}")
+        return f"❌ Error: {str(e)}"
+# Load model on startup
+# IMPORTANT: Replace 'openai/whisper-small' with your fine-tuned model ID
+# e.g., 'saadmannan/whisper-small-german' after you upload your model to HF Hub
+MODEL_ID = "openai/whisper-small"  # Change this to your model ID
+try:
+    load_model(MODEL_ID)
+except Exception as e:
+    logger.error(f"Failed to load model: {e}")
+    logger.info("Model will need to be loaded manually")
+# Create Gradio interface
+with gr.Blocks(title="Whisper German ASR", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🎙️ Whisper German ASR
+        Fine-tuned Whisper model for German speech recognition.
+        **How to use:**
+        1. Upload an audio file (WAV, MP3, FLAC, etc.) or record using your microphone
+        2. Click the "Transcribe" button
+        3. Wait for the transcription to appear
+        **Features:**
+        - Supports multiple audio formats
+        - Microphone recording
+        - Optimized for German language
+        **Model:** Whisper-small fine-tuned on German MINDS14 dataset
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                sources=["upload", "microphone"],
+                type="numpy",
+                label="Upload Audio or Record"
+            )
+            transcribe_btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")
+        with gr.Column():
+            output_text = gr.Markdown(label="Transcription Result")
+    transcribe_btn.click(
+        fn=transcribe_audio,
+        inputs=audio_input,
+        outputs=output_text
+    )
+    gr.Markdown(
+        """
+        ---
+        ## 📋 About This Model
+        This is a fine-tuned version of OpenAI's Whisper-small model,
+        specifically optimized for German speech recognition.
+        ### Performance
+        - **Word Error Rate (WER):** ~13%
+        - **Sample Rate:** 16kHz
+        - **Max Duration:** 30 seconds
+        - **Language:** German (de)
+        ### Tips for Best Results
+        - Speak clearly and at a moderate pace
+        - Minimize background noise
+        - Audio should be in German language
+        - Best results with 1-30 second clips
+        ### Links
+        - [GitHub Repository](https://github.com/YOUR_USERNAME/whisper-german-asr)
+        - [Model Card](https://huggingface.co/YOUR_USERNAME/whisper-small-german)
+        """
+    )
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,25 +1,6 @@
-# Core ML/DL frameworks
-torch>=2.2.0
 transformers>=4.42.0
-datasets>=2.19.0
-accelerate>=0.30.0
-# Audio processing
 librosa>=0.10.1
-soundfile>=0.12.1
-# Metrics and evaluation
-jiwer>=3.0.4
-evaluate>=0.4.1
-# Utilities
 numpy>=1.24.0
-sentencepiece>=0.2.0
-einops>=0.7.0
-# Logging and visualization
-tensorboard>=2.16.0
-tensorboardX>=2.6.2
-# Optional: Flash Attention 2 (requires CUDA)
-# flash-attn>=2.5.0  # Uncomment if you have CUDA toolkit installed

 transformers>=4.42.0
+torch>=2.2.0
+gradio>=4.0.0
 librosa>=0.10.1
 numpy>=1.24.0
+soundfile>=0.12.1