| import gradio as gr |
| import torchaudio |
| import torch |
| from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor |
|
|
| |
# Single source of truth for the checkpoint id so the processor and the
# acoustic model can never drift apart.
_CHECKPOINT = "Mustafaa4a/ASR-Somali"

# Loaded once at import time and shared by every transcription request.
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
|
|
def transcribe(audio):
    """Transcribe a speech recording to text with the module-level model.

    Args:
        audio: Path to an audio file readable by ``torchaudio.load``, or
            ``None`` when no file was supplied.

    Returns:
        The decoded transcription string, or an empty string when there
        is no audio to transcribe.
    """
    # A cleared/missing upload arrives as None; there is nothing to load.
    if audio is None:
        return ""

    # Sampling rate handed to the processor; input is resampled to match.
    target_rate = 16000

    waveform, sample_rate = torchaudio.load(audio)
    if waveform.numel() == 0:
        return ""

    # torchaudio.load yields (channels, frames). Average the channels so
    # stereo recordings become a single mono signal; for mono input this
    # leaves the samples unchanged.
    waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)

    # Drop only the channel axis and pass a 1-D NumPy array, which is the
    # input form the feature extractor documents for a single utterance.
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
    )

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])
|
|
| |
# Build the two UI components first, then wire them to the transcriber.
# The audio component is configured with type="filepath"; transcribe()
# forwards whatever it receives straight to torchaudio.load.
audio_input = gr.Audio(type="filepath", label="Upload Somali Audio (.wav)")
transcript_output = gr.Textbox(label="Transcription")

interface = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=transcript_output,
    title="Somali ASR using Mustafaa4a/ASR-Somali",
    description="Upload a Somali speech audio file (mono WAV, 16kHz) and get the text transcription.",
)

# Start the web app as soon as the module is executed.
interface.launch()