"""Gradio web interface for Lozi speech-to-text.

Loads the ``simzacademy/whisper-small-lozi1`` Whisper checkpoint once at
import time and exposes a small Blocks UI (``demo``) that transcribes audio
recorded from the microphone or uploaded as a file.
"""

from typing import Optional

import gradio as gr
import torch
from transformers import pipeline

# Initialize the speech recognition pipeline once, at startup, so every
# request reuses the same loaded model.
print("Loading Whisper Lozi model...")
try:
    # Use the specific Lozi model
    transcriber = pipeline(
        "automatic-speech-recognition",
        model="simzacademy/whisper-small-lozi1",
        device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
    )
    print("Model loaded successfully!")
except Exception as e:
    # Deliberately broad: whatever goes wrong while loading, the UI should
    # still start and report the problem to the user instead of crashing.
    print(f"Error loading model: {e}")
    transcriber = None


def transcribe_audio(audio: Optional[str]) -> str:
    """Transcribe audio to text using the Whisper Lozi model.

    Args:
        audio: Path to the audio file, as delivered by the ``gr.Audio``
            component configured with ``type="filepath"``, or ``None`` when
            nothing has been recorded or uploaded. The value is passed to the
            pipeline unchanged.

    Returns:
        The transcribed text on success; otherwise a human-readable message
        explaining why no transcription was produced (model not loaded, no
        audio supplied, or an error raised during transcription).
    """
    if transcriber is None:
        return "Error: Model failed to load. Please check your installation."

    if audio is None:
        return "Please provide an audio file or recording."

    try:
        # Transcribe the audio
        result = transcriber(audio)
        return result["text"]
    except Exception as e:
        # Show the failure in the output textbox rather than raising.
        return f"Error during transcription: {e}"


# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎤 Lozi Speech-to-Text Interface
        ### Powered by Whisper Small Lozi Model

        This interface uses the `simzacademy/whisper-small-lozi1` model to transcribe Lozi language speech to text.
        """
    )

    with gr.Row():
        with gr.Column():
            # Audio input - supports both recording and file upload
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or Upload Audio",
            )
            transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")

        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription",
                placeholder="Your transcription will appear here...",
                lines=10,
            )

    gr.Markdown(
        """
        ### 📋 Instructions:
        1. **Record**: Click the microphone icon to record audio directly
        2. **Upload**: Or click to upload an audio file (MP3, WAV, etc.)
        3. **Transcribe**: Click the "Transcribe" button to convert speech to text
        4. **View**: The transcribed text will appear on the right

        ### ℹ️ Notes:
        - Speak clearly in Lozi for best results
        - The model works best with clear audio and minimal background noise
        - First transcription may take longer as the model loads
        """
    )

    # Set up the transcription action
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=output_text,
    )

    # Whenever the audio value changes, (re)mark the Transcribe button as
    # interactive. This does not bind any keyboard shortcut; transcription
    # only runs from the button click registered above.
    audio_input.change(
        fn=lambda: gr.update(interactive=True),
        outputs=transcribe_btn,
    )

# Launch the interface
if __name__ == "__main__":
    demo.launch(
        share=False,  # Set to True to create a public link
        server_name="0.0.0.0",  # Allow access from network
        server_port=7860,
    )