Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| import tempfile | |
| import os | |
| from pathlib import Path | |
# Try to import wakanda_whisper, fallback to transformers if not available.
# USE_WAKANDA_WHISPER is a tri-state backend flag read by load_model() and
# transcribe_audio():
#   True  -> wakanda_whisper package is importable
#   False -> fall back to transformers + librosa
#   None  -> neither stack is available; transcription cannot run
try:
    import wakanda_whisper
    USE_WAKANDA_WHISPER = True
    print("β Using wakanda_whisper package")
except ImportError:
    print("β οΈ wakanda_whisper not found, falling back to transformers...")
    try:
        from transformers import WhisperProcessor, WhisperForConditionalGeneration
        import librosa
        USE_WAKANDA_WHISPER = False
        print("β Using transformers as fallback")
    except ImportError:
        print("β Neither wakanda_whisper nor transformers available")
        USE_WAKANDA_WHISPER = None
# Initialize the model
def load_model():
    """Load the Wakanda Whisper model from Hugging Face.

    Picks whichever backend was detected at import time via the
    USE_WAKANDA_WHISPER flag.

    Returns:
        tuple: (model, processor). processor is None for the
        wakanda_whisper backend; both are None when no backend is
        available or loading fails.
    """
    repo_id = "WakandaAI/wakanda-whisper-small-rw-v1"
    try:
        if USE_WAKANDA_WHISPER:
            # Preferred path: the dedicated wakanda_whisper package.
            print("π₯ Loading model with wakanda_whisper...")
            return wakanda_whisper.from_pretrained(repo_id), None
        if USE_WAKANDA_WHISPER is False:
            # Fallback path: vanilla transformers Whisper classes.
            print("π₯ Loading model with transformers...")
            proc = WhisperProcessor.from_pretrained(repo_id)
            mdl = WhisperForConditionalGeneration.from_pretrained(repo_id)
            return mdl, proc
        print("β No compatible libraries available")
    except Exception as e:
        print(f"β Error loading model: {e}")
    return None, None
# Global model variables — populated lazily by initialize_model() so the
# (slow) checkpoint download/load only happens on the first request.
MODEL = None
PROCESSOR = None
def initialize_model():
    """Return the (model, processor) pair, loading them on first use.

    Lazy singleton over the module-level MODEL/PROCESSOR globals: the
    expensive load_model() call happens at most once per process.
    """
    global MODEL, PROCESSOR
    if MODEL is not None:
        return MODEL, PROCESSOR
    print("π Initializing model...")
    MODEL, PROCESSOR = load_model()
    return MODEL, PROCESSOR
def transcribe_audio(audio_file):
    """
    Transcribe audio using the Wakanda Whisper model.

    Args:
        audio_file: Filesystem path to an audio file, or None.

    Returns:
        str: The Kinyarwanda transcription, or a user-facing
        status/error message (errors are reported as strings, never
        raised, so the Gradio UI always gets text to display).
    """
    if audio_file is None:
        return "Please upload an audio file."
    try:
        # Initialize model if needed (lazy, cached in module globals)
        model, processor = initialize_model()
        if model is None:
            return "β Error: Could not load the model. Please try again later."
        print(f"π΅ Processing audio file: {Path(audio_file).name}")
        # Check if using mock model (canned answers keyed off sample names;
        # only active when MODEL was set to the sentinel string "mock_model")
        if model == "mock_model":
            filename = Path(audio_file).name
            if "sample_1" in filename:
                return "Muraho, witwa gute?"
            elif "sample_2" in filename:
                return "Ndashaka kwiga Ikinyarwanda."
            elif "sample_3" in filename:
                return "Urakoze cyane kubafasha."
            elif "sample_4" in filename:
                return "Tugiye gutangiza ikiganiro mu Kinyarwanda."
            else:
                # BUG FIX: this f-string had no placeholder, so the literal
                # text "(unknown)" was shown instead of the computed filename.
                return f"Mock transcription for {filename}: [This would be the actual Kinyarwanda transcription]"
        # Real model processing
        elif USE_WAKANDA_WHISPER:
            # Use wakanda_whisper's one-call transcription API
            result = model.transcribe(audio_file)
            transcribed_text = result['text'].strip()
        elif USE_WAKANDA_WHISPER is False:
            # Use transformers; Whisper expects 16 kHz mono input
            import librosa
            audio, sr = librosa.load(audio_file, sr=16000)
            input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
            with torch.no_grad():  # inference only — no autograd graph
                predicted_ids = model.generate(input_features)
            transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
        else:
            return "β Error: No compatible transcription library available."
        if not transcribed_text:
            return "π No speech detected in the audio file. Please try with a clearer audio recording."
        print(f"β Transcription completed: {len(transcribed_text)} characters")
        return transcribed_text
    except Exception as e:
        print(f"β Transcription error: {e}")
        return f"β Error during transcription: {str(e)}"
def transcribe_microphone(audio_data):
    """
    Transcribe audio from microphone input.
    Args:
        audio_data: Tuple of (sample_rate, audio_array) as produced by a
            Gradio Audio component with type="numpy", or None.
    Returns:
        str: Transcribed text or a user-facing error message.
    """
    if audio_data is None:
        return "Please record some audio first."
    tmp_path = None
    try:
        # audio_data is a tuple (sample_rate, audio_array)
        sample_rate, audio_array = audio_data
        print(f"ποΈ Processing microphone input: {len(audio_array)} samples at {sample_rate}Hz")
        # Convert to float32 and normalize to [-1.0, 1.0] if needed
        if audio_array.dtype != np.float32:
            audio_array = audio_array.astype(np.float32)
            if audio_array.max() > 1.0:
                # Normalize based on the original dtype (int16-range vs other)
                if audio_array.max() > 32767:
                    audio_array = audio_array / 32768.0
                else:
                    audio_array = audio_array / audio_array.max()
        # BUG FIX: close the NamedTemporaryFile handle before writing via
        # soundfile — writing to tmp_file.name while the handle is still
        # open fails on Windows. We only use the context manager to obtain
        # a unique path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name
        # Save using soundfile, then transcribe the temporary file
        import soundfile as sf
        sf.write(tmp_path, audio_array, sample_rate)
        return transcribe_audio(tmp_path)
    except Exception as e:
        print(f"β Microphone processing error: {e}")
        return f"β Error processing microphone input: {str(e)}"
    finally:
        # BUG FIX: cleanup moved to finally so the temp file no longer
        # leaks when writing or transcription raises.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
# Create a simple Gradio interface
def create_interface():
    """Create a clean, simple Gradio interface.

    Builds two tabs — file upload and microphone recording — each wired
    to its transcription function, with the result shown in a textbox.

    Returns:
        The assembled (not yet launched) gr.Blocks app.
    """
    with gr.Blocks(title="Wakanda Whisper - Kinyarwanda ASR") as interface:
        gr.Markdown("# Wakanda ASR For Kinyarwanda")
        gr.Markdown("### Kinyarwanda Automatic Speech Recognition")
        gr.Markdown("Upload an audio file or record your voice to get Kinyarwanda transcription")
        with gr.Tabs():
            # File Upload Tab
            with gr.TabItem("π Upload Audio File"):
                with gr.Row():
                    with gr.Column():
                        # type="filepath" hands transcribe_audio a path string
                        audio_input = gr.Audio(
                            label="Choose Audio File",
                            type="filepath"
                        )
                        # Sample audio files
                        gr.Markdown("**Try these sample Kinyarwanda audio files:**")
                        with gr.Row():
                            sample_1 = gr.Button("Sample 1", size="sm")
                            sample_2 = gr.Button("Sample 2", size="sm")
                            sample_3 = gr.Button("Sample 3", size="sm")
                            sample_4 = gr.Button("Sample 4", size="sm")
                        upload_btn = gr.Button("π― Transcribe Audio", variant="primary")
                    with gr.Column():
                        upload_output = gr.Textbox(
                            label="Transcription Result",
                            placeholder="Your Kinyarwanda transcription will appear here...",
                            lines=6,
                            show_copy_button=True
                        )
            # Microphone Tab
            with gr.TabItem("ποΈ Record Audio"):
                with gr.Row():
                    with gr.Column():
                        # type="numpy" yields (sample_rate, ndarray) tuples,
                        # matching what transcribe_microphone unpacks
                        mic_input = gr.Audio(
                            label="Record Your Voice",
                            type="numpy"
                        )
                        mic_btn = gr.Button(" Transcribe Recording", variant="primary")
                    with gr.Column():
                        mic_output = gr.Textbox(
                            label="Transcription Result",
                            placeholder="Your Kinyarwanda transcription will appear here...",
                            lines=6,
                            show_copy_button=True
                        )
        # Set up event handlers
        upload_btn.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=upload_output,
            show_progress=True
        )
        # Sample audio button handlers — each drops a bundled sample path
        # into the upload widget. NOTE(review): assumes sample_*.wav files
        # ship alongside the app; confirm they exist in the Space repo.
        sample_1.click(
            fn=lambda: "sample_1.wav",
            outputs=audio_input
        )
        sample_2.click(
            fn=lambda: "sample_2.wav",
            outputs=audio_input
        )
        sample_3.click(
            fn=lambda: "sample_3.wav",
            outputs=audio_input
        )
        sample_4.click(
            fn=lambda: "sample_4.wav",
            outputs=audio_input
        )
        mic_btn.click(
            fn=transcribe_microphone,
            inputs=mic_input,
            outputs=mic_output,
            show_progress=True
        )
        gr.Markdown("---")
        gr.Markdown("**Powered by WakandaAI** | Model: [wakanda-whisper-small-rw-v1](https://huggingface.co/WakandaAI/wakanda-whisper-small-rw-v1)")
    return interface
# Launch the app
if __name__ == "__main__":
    print("π Starting Wakanda Whisper ASR Demo...")
    # Create and launch the interface
    demo = create_interface()
    # Launch configuration for Hugging Face Spaces
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces so the Spaces proxy can reach the server
        share=False,  # Set to False for Hugging Face Spaces
        show_error=True
    )