# Hugging Face Space app (page status banner read "Spaces: Runtime error" when captured).
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
# --- Model loading ----------------------------------------------------------
def load_model():
    """Load the fine-tuned Hausa Whisper checkpoint and its processor.

    Returns:
        tuple: ``(model, processor)`` on success, ``(None, None)`` on any
        failure, so the UI can show a friendly message instead of crashing
        at import time.
    """
    checkpoint = "therealbee/whisper-small-ha-bible-tts"
    try:
        # NOTE(review): ignore_mismatched_sizes silently re-initialises any
        # weights whose shapes differ from the config — confirm this is
        # intentional for this checkpoint.
        whisper_model = WhisperForConditionalGeneration.from_pretrained(
            checkpoint,
            ignore_mismatched_sizes=True,
        )
        whisper_processor = WhisperProcessor.from_pretrained(checkpoint)
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None
    return whisper_model, whisper_processor


# Load the model a single time at import so every request reuses it.
model, processor = load_model()
def transcribe_audio(audio_input):
    """Transcribe a Gradio audio input to Hausa text.

    Args:
        audio_input: Either a ``(sample_rate, np.ndarray)`` tuple from the
            microphone component (``type="numpy"``) or a filepath string for
            an uploaded file; ``None`` when nothing was provided.

    Returns:
        str: A user-facing message containing either the transcription or an
        error description (this function never raises to the UI).
    """
    if model is None or processor is None:
        return "β Model failed to load. Please try again later."
    if audio_input is None:
        return "β Please provide an audio file or record audio."
    try:
        # Handle different audio input types
        if isinstance(audio_input, tuple):
            # Recorded audio: (sample_rate, audio_data)
            sample_rate, audio_data = audio_input
            audio = audio_data.astype(np.float32)
            # FIX: microphone capture may be stereo (samples, channels);
            # Whisper expects a mono waveform, so average the channels.
            if audio.ndim > 1:
                audio = audio.mean(axis=1)
            # FIX: normalize integer-range PCM using the absolute peak, so
            # signals whose peak sample is negative are normalized too
            # (the old `audio.max() > 1.0` gate missed them).
            peak = np.max(np.abs(audio))
            if peak > 1.0:
                audio = audio / peak
            # Resample if needed — Whisper models expect 16 kHz input.
            if sample_rate != 16000:
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
        else:
            # Uploaded file: filepath (librosa resamples to 16 kHz for us).
            audio, sample_rate = librosa.load(audio_input, sr=16000)
        # Ensure audio is not empty
        if len(audio) < 1000:  # Less than ~0.06 seconds
            return "β Audio too short. Please record at least 1 second of speech."
        # Prepare inputs.
        # FIX: `language` is not a feature-extractor argument — passing it to
        # the processor on an audio-only call fails at runtime. Language and
        # task are decoding options and belong on generate() instead.
        inputs = processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt",
        )
        # Generate transcription, forcing Hausa output.
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_features,
                language="ha",
                task="transcribe",
            )
        # Decode transcription
        transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        if not transcription.strip():
            return "β No speech detected. Please speak more clearly or check your audio."
        return f"π Hausa Transcription:\n\n{transcription}"
    except Exception as e:
        return f"β Transcription failed: {str(e)}"
# --- Gradio UI ---------------------------------------------------------------
# Build the two components up front so the Interface call stays readable.
audio_input_component = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="ποΈ Record or Upload Hausa Audio",
)
transcription_output_component = gr.Textbox(
    label="π Transcription Result",
    lines=5,
    placeholder="Your Hausa transcription will appear here...",
)

demo = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input_component,
    outputs=transcription_output_component,
    title="ποΈ Hausa Speech Transcription",
    description="""
Upload an audio file or record directly to get Hausa transcription.
**Supported formats:** WAV, MP3, OGG, M4A, FLAC
**Tips:**
- Speak clearly in Hausa
- Keep recordings under 30 seconds for best results
- Use good quality audio
""",
    examples=[],
    theme=gr.themes.Soft(),
    # NOTE(review): `allow_flagging` was deprecated in Gradio 4 and removed
    # in Gradio 5 (replaced by `flagging_mode`) — confirm the Space's Gradio
    # version before changing this.
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()