# mai_tarjamaa / app.py
# Author: Bello Abdullahi
# Fix audio recording - handle both recorded and uploaded audio properly
# Commit: 035adb1
import gradio as gr
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import librosa
import numpy as np
# Load model and processor
def load_model():
    """Load the fine-tuned Hausa Whisper checkpoint and its processor.

    Returns:
        A ``(model, processor)`` pair on success, or ``(None, None)`` if
        anything goes wrong while downloading/instantiating them.
    """
    checkpoint = "therealbee/whisper-small-ha-bible-tts"
    try:
        whisper_model = WhisperForConditionalGeneration.from_pretrained(
            checkpoint,
            # Checkpoint may have head sizes differing from the base config.
            ignore_mismatched_sizes=True,
        )
        whisper_processor = WhisperProcessor.from_pretrained(checkpoint)
    except Exception as e:
        # Boundary handler: log and degrade gracefully so the UI can report
        # a friendly error instead of crashing at import time.
        print(f"Error loading model: {e}")
        return None, None
    return whisper_model, whisper_processor
# Load model and processor once at import time so every request reuses the
# same instances; both are (None, None) if loading failed.
model, processor = load_model()
def transcribe_audio(audio_input):
    """Transcribe Hausa speech to text with the fine-tuned Whisper model.

    Args:
        audio_input: Either a ``(sample_rate, audio_data)`` tuple (microphone
            recording / Gradio ``type="numpy"``) or a filepath string for an
            uploaded file.

    Returns:
        A user-facing status string containing either the transcription or
        an error message (never raises).
    """
    if model is None or processor is None:
        return "❌ Model failed to load. Please try again later."
    if audio_input is None:
        return "❌ Please provide an audio file or record audio."
    try:
        # Handle different audio input types
        if isinstance(audio_input, tuple):
            # Recorded audio: (sample_rate, audio_data)
            sample_rate, audio_data = audio_input
            audio = audio_data.astype(np.float32)
            # Downmix stereo recordings to mono — Whisper expects 1-D audio.
            if audio.ndim > 1:
                audio = audio.mean(axis=1)
            # Normalize integer PCM (e.g. int16) into [-1, 1]. Use the
            # absolute peak so signals whose excursion is mostly negative
            # are normalized too (a bare `audio.max() > 1.0` misses those).
            peak = float(np.max(np.abs(audio))) if audio.size else 0.0
            if peak > 1.0:
                audio = audio / peak
            # Whisper models are trained on 16 kHz audio.
            if sample_rate != 16000:
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
        else:
            # Uploaded file: filepath — librosa resamples to 16 kHz for us.
            audio, sample_rate = librosa.load(audio_input, sr=16000)
        # Reject near-empty clips (< ~0.06 s at 16 kHz).
        if len(audio) < 1000:
            return "❌ Audio too short. Please record at least 1 second of speech."
        # Prepare inputs. NOTE: the feature extractor does not accept a
        # `language` kwarg — the target language is given to `generate` below.
        inputs = processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        )
        # Generate transcription, forcing Hausa and the transcribe task
        # (rather than Whisper's translate-to-English task).
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_features,
                language="ha",
                task="transcribe"
            )
        # Decode transcription
        transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        if not transcription.strip():
            return "❌ No speech detected. Please speak more clearly or check your audio."
        return f"πŸ“ Hausa Transcription:\n\n{transcription}"
    except Exception as e:
        # Surface any failure to the UI rather than crashing the request.
        return f"❌ Transcription failed: {str(e)}"
# Create Gradio interface
demo = gr.Interface(
fn=transcribe_audio,
inputs=gr.Audio(
sources=["microphone", "upload"],
type="numpy",
label="πŸŽ™οΈ Record or Upload Hausa Audio"
),
outputs=gr.Textbox(
label="πŸ“‹ Transcription Result",
lines=5,
placeholder="Your Hausa transcription will appear here..."
),
title="πŸŽ™οΈ Hausa Speech Transcription",
description="""
Upload an audio file or record directly to get Hausa transcription.
**Supported formats:** WAV, MP3, OGG, M4A, FLAC
**Tips:**
- Speak clearly in Hausa
- Keep recordings under 30 seconds for best results
- Use good quality audio
""",
examples=[],
theme=gr.themes.Soft(),
allow_flagging="never"
)
# Launch the app only when run as a script (Hugging Face Spaces imports app.py).
if __name__ == "__main__":
    demo.launch()