|
|
| import gradio as gr |
| from transformers import WhisperForConditionalGeneration, WhisperProcessor |
| import torch |
| import librosa |
| import warnings |
| import numpy as np |
|
|
| |
| |
| |
| warnings.filterwarnings("ignore", category=ResourceWarning) |
| warnings.filterwarnings("ignore", category=FutureWarning) |
|
|
| |
| |
| |
# Hugging Face Hub id of the fine-tuned Whisper checkpoint used for Kikuyu ASR.
MODEL_PATH = "MaryWambo/whisper-base-kikuyu4"

# Run on GPU when one is visible to torch; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Loading model on {device}...")

# The processor bundles the feature extractor and tokenizer for this checkpoint;
# the model weights are moved to the selected device immediately after loading.
processor = WhisperProcessor.from_pretrained(MODEL_PATH)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
|
|
| |
# Pin the decoder prompt so generation always runs in transcription mode.
# NOTE(review): language is set to "swahili" although the checkpoint is for
# Kikuyu — presumably because Whisper's tokenizer has no Kikuyu language token
# and Swahili is the nearest supported proxy; confirm this matches how the
# model was fine-tuned.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="swahili",
    task="transcribe"
)
|
|
| |
| |
| |
# Custom CSS for the Gradio UI: white page background, dark-red (#8b0000)
# accents on the title, record/upload icons, prediction box border, and the
# run button.
# NOTE(review): this stylesheet only takes effect if handed to Gradio, e.g.
# gr.Blocks(css=theme_styles) — verify the Blocks constructor receives it.
theme_styles = """
body, .gradio-container { background-color: white !important; }

#title-text h1 {
    color: #8b0000 !important;
    font-weight: 900 !important;
    text-align: center;
}

.upload-button svg, .mic-button svg {
    transform: scale(1.5) !important;
    color: #8b0000 !important;
}

#predict-box textarea {
    font-size: 1.6rem !important;
    font-weight: 800 !important;
    color: #000000 !important;
    border: 3px solid #8b0000 !important;
}

#run-btn {
    background: #8b0000 !important;
    color: white !important;
    font-weight: bold !important;
    font-size: 1.4rem !important;
}
"""
|
|
| |
| |
| |
def transcribe_kikuyu(audio):
    """Transcribe a Kikuyu speech recording to text with the loaded Whisper model.

    Args:
        audio: Filesystem path to the recorded/uploaded clip, as supplied by
            the gr.Audio component (None when nothing was provided).

    Returns:
        The decoded transcription string; a prompt string when *audio* is
        None; or an error-message string if transcription fails.
    """
    if audio is None:
        return "Please record or upload audio."

    try:
        # Load and resample to the 16 kHz rate the Whisper processor expects.
        waveform, sample_rate = librosa.load(audio, sr=16000)

        # Feature extraction requires float32 samples.
        waveform = waveform if waveform.dtype == np.float32 else waveform.astype(np.float32)

        # Convert the waveform to model input features on the right device.
        encoded = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
        features = encoded.input_features.to(device)

        # Inference only — no gradients; beam search for better decodes.
        with torch.no_grad():
            token_ids = model.generate(
                features,
                num_beams=5,
                max_new_tokens=255
            )

        return processor.batch_decode(token_ids, skip_special_tokens=True)[0]

    except Exception as e:
        # Report the failure in the output textbox instead of crashing the UI.
        return f"Error during transcription: {str(e)}"
|
|
| |
| |
| |
# --- Gradio UI --------------------------------------------------------------
# Fix: `theme_styles` was defined above but never passed to Gradio, so the
# custom CSS was dead code; supplying css=theme_styles applies it.
# NOTE(review): the emoji in the labels below appear mojibake-garbled
# ("ποΈ", "π€", "π"); they are preserved byte-for-byte here — confirm the
# intended characters and the file's encoding.
with gr.Blocks(css=theme_styles) as demo:
    gr.Markdown("# ποΈ Kikuyu ASR", elem_id="title-text")

    with gr.Row():
        with gr.Column():
            # Accepts a live microphone recording or an uploaded file;
            # type="filepath" hands transcribe_kikuyu a path on disk.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="π€ Record or Upload Kikuyu Speech"
            )

            submit_btn = gr.Button(
                "π RUN TRANSCRIPTION",
                elem_id="run-btn"
            )

        with gr.Column():
            # Read-only box that shows the transcription (or an error message).
            text_out = gr.Textbox(
                label="π€ AI Prediction",
                elem_id="predict-box",
                lines=8
            )

    # Wire the button to the transcription function.
    submit_btn.click(
        fn=transcribe_kikuyu,
        inputs=audio_input,
        outputs=text_out
    )
|
|
| |
| |
| |
| import asyncio |
| import sys |
|
|
| def _suppress_event_loop_closed(loop, context): |
| if "Invalid file descriptor" in str(context.get("exception", "")): |
| return |
| loop.default_exception_handler(context) |
|
|
# Install the noise-suppressing handler on the current event loop so shutdown
# chatter does not pollute the console.
# NOTE(review): asyncio.get_event_loop() is deprecated when no loop is running
# (Python 3.10+) and may itself warn; consider installing the handler from
# within the running loop instead.
try:
    loop = asyncio.get_event_loop()
    loop.set_exception_handler(_suppress_event_loop_closed)
except RuntimeError:
    # No usable event loop in this thread — skip installing the handler.
    pass

# Start the Gradio app; server-side rendering is explicitly disabled.
demo.launch(ssr_mode=False)