# BembaASRtest / app.py — Gradio speech-to-text demo using the Whisper Small Lozi model.
import gradio as gr
from transformers import pipeline
import torch
# Load the Whisper Lozi ASR pipeline once at import time. On any failure we
# leave `transcriber` as None so the UI can report a friendly error instead
# of crashing at startup.
print("Loading Whisper Lozi model...")
transcriber = None
try:
    transcriber = pipeline(
        "automatic-speech-recognition",
        model="simzacademy/whisper-small-lozi1",
        # transformers pipeline device convention: 0 = first CUDA GPU, -1 = CPU
        device=0 if torch.cuda.is_available() else -1,
    )
    print("Model loaded successfully!")
except Exception as e:
    # Best-effort startup: report the failure and fall back to transcriber=None.
    print(f"Error loading model: {e}")
def transcribe_audio(audio):
    """Convert speech audio to text with the Whisper Lozi model.

    Args:
        audio: Audio file path or tuple (sample_rate, audio_data).

    Returns:
        The transcribed text on success, otherwise a human-readable
        error message (the UI displays whatever string comes back).
    """
    # Guard clauses: model failed to load, or no audio was supplied.
    if transcriber is None:
        return "Error: Model failed to load. Please check your installation."
    if audio is None:
        return "Please provide an audio file or recording."

    try:
        # The pipeline returns a dict; "text" holds the transcription.
        return transcriber(audio)["text"]
    except Exception as e:
        # Surface the failure as text rather than raising into Gradio.
        return f"Error during transcription: {str(e)}"
# Create the Gradio interface.
# Layout: two-column Blocks page — audio input + button on the left,
# transcription textbox on the right, usage notes below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎀 Lozi Speech-to-Text Interface
        ### Powered by Whisper Small Lozi Model
        This interface uses the `simzacademy/whisper-small-lozi1` model to transcribe
        Lozi language speech to text.
        """
    )
    with gr.Row():
        with gr.Column():
            # Audio input - supports both microphone recording and file upload.
            # type="filepath" means transcribe_audio receives a path string.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or Upload Audio"
            )
            transcribe_btn = gr.Button("πŸ”„ Transcribe", variant="primary", size="lg")
        with gr.Column():
            # Read-only-by-convention output box for the transcription result
            # (also used for error messages returned by transcribe_audio).
            output_text = gr.Textbox(
                label="Transcription",
                placeholder="Your transcription will appear here...",
                lines=10
            )
    gr.Markdown(
        """
        ### πŸ“‹ Instructions:
        1. **Record**: Click the microphone icon to record audio directly
        2. **Upload**: Or click to upload an audio file (MP3, WAV, etc.)
        3. **Transcribe**: Click the "Transcribe" button to convert speech to text
        4. **View**: The transcribed text will appear on the right
        ### ℹ️ Notes:
        - Speak clearly in Lozi for best results
        - The model works best with clear audio and minimal background noise
        - First transcription may take longer as the model loads
        """
    )
    # Wire the button: run transcribe_audio on the selected/recorded audio
    # and write the returned string into the output textbox.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=output_text
    )
    # NOTE(review): despite the original comment, this does NOT bind the Enter
    # key — it only re-enables the Transcribe button whenever the audio input
    # changes (the button is never disabled elsewhere, so this is effectively
    # a no-op; confirm intent).
    audio_input.change(
        fn=lambda: gr.update(interactive=True),
        outputs=transcribe_btn
    )
# Launch the interface only when run as a script (not when imported).
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable on the local network;
    # flip share=True to have Gradio create a temporary public link.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )