import os
import torch
import gradio as gr
import librosa
import numpy as np
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
# ----------------------------
# Config
# ----------------------------
ASR_MODEL_ID = "openai/whisper-large-v3"
HF_TOKEN = os.getenv("HF_TOKEN")
# Run in half precision only when a GPU is available.
_HAS_CUDA = torch.cuda.is_available()
DEVICE = "cuda" if _HAS_CUDA else "cpu"
DTYPE = torch.float16 if _HAS_CUDA else torch.float32
# ----------------------------
# Load Whisper
# ----------------------------
# NOTE: `use_auth_token` is deprecated in transformers; `token` is the
# supported keyword for passing a Hugging Face access token.
processor = AutoProcessor.from_pretrained(
    ASR_MODEL_ID,
    token=HF_TOKEN,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    ASR_MODEL_ID,
    torch_dtype=DTYPE,       # fp16 on CUDA, fp32 on CPU (see Config)
    low_cpu_mem_usage=True,
    use_safetensors=True,
    token=HF_TOKEN,
).to(DEVICE)
model.eval()  # inference only — disables dropout etc.
# ----------------------------
# Audio preprocessing
# ----------------------------
def preprocess_audio(audio):
    """Convert a Gradio (sample_rate, samples) tuple to 16 kHz mono float32.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        Output of ``gr.Audio(type="numpy")`` — the sample rate and the raw
        samples (integer PCM from the microphone, possibly stereo).

    Returns
    -------
    np.ndarray | None
        Mono float32 waveform at 16 kHz with peak amplitude <= 1, silence
        trimmed; ``None`` when no audio was provided.
    """
    if audio is None:
        return None
    sr, speech = audio
    # Stereo -> mono (assumes channels-last layout from Gradio — TODO confirm)
    if speech.ndim > 1:
        speech = np.mean(speech, axis=1)
    # Convert to float32
    speech = speech.astype(np.float32)
    # Peak-normalize into [-1, 1]. The previous RMS normalization left raw
    # integer-PCM samples (e.g. int16 at +/-32768) with peaks far outside the
    # [-1, 1] waveform range Whisper's feature extractor expects. Peak
    # normalization also handles the int->float scaling in one step.
    peak = np.max(np.abs(speech)) if speech.size else 0.0
    if peak > 0:
        speech = speech / peak
    # Trim leading/trailing silence
    speech, _ = librosa.effects.trim(speech, top_db=25)
    # Force 16 kHz, which Whisper requires
    if sr != 16000:
        speech = librosa.resample(speech, orig_sr=sr, target_sr=16000).astype(np.float32)
    return speech
# ----------------------------
# Transcription
# ----------------------------
def transcribe_audio(audio):
    """Transcribe a Gradio audio input to Yoruba text with Whisper.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        (sample_rate, samples) as produced by ``gr.Audio(type="numpy")``.

    Returns
    -------
    str
        The transcription, or a user-facing hint when the audio is too
        short or the decoded text looks like noise.
    """
    speech = preprocess_audio(audio)
    # Require at least one second of audio at 16 kHz.
    if speech is None or len(speech) < 16000:
        return "Audio too short or unclear. Please speak clearly and try again."
    inputs = processor(
        speech,
        sampling_rate=16000,
        return_tensors="pt"
    )
    # Move tensors to the model's device and cast floating-point features to
    # the model dtype: fp32 features fed to an fp16 model raise a dtype
    # mismatch on CUDA.
    inputs = {
        k: v.to(DEVICE, dtype=DTYPE) if torch.is_floating_point(v) else v.to(DEVICE)
        for k, v in inputs.items()
    }
    # Whisper's decoder is capped at 448 positions; reserve a few for the
    # forced start/task/language tokens.
    MAX_DECODER_TOKENS = 448
    START_TOKENS = 4
    max_new_tokens = MAX_DECODER_TOKENS - START_TOKENS  # 444
    with torch.no_grad():
        # Greedy decoding is the default (do_sample=False); the previous
        # temperature=0.0 was ignored and only produced warnings.
        generated_ids = model.generate(
            **inputs,
            task="transcribe",
            language="yo",
            max_new_tokens=max_new_tokens,
            no_repeat_ngram_size=3
        )
    text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True
    )[0].strip()
    # Heuristic: fewer than two words usually means silence or noise.
    if len(text.split()) < 2:
        return "Speech unclear. Please repeat slowly in Yoruba."
    return text
# ----------------------------
# Gradio UI
# ----------------------------
# Microphone or file upload, delivered to the callback as a numpy tuple.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="Speak clearly or upload audio in Yoruba",
)
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=_audio_input,
    outputs=gr.Textbox(label="Transcription"),
    title="Yoruba ASR (Whisper)",
    description="Speech-to-text system that transcribes only Yoruba",
)
if __name__ == "__main__":
    # share=True exposes a temporary public URL via Gradio's tunnel.
    demo.launch(share=True)