|
|
|
|
|
import torch |
|
|
import gradio as gr |
|
|
from nemo.collections.asr.models import EncDecRNNTBPEModel |
|
|
import soundfile as sf |
|
|
import numpy as np |
|
|
import torchaudio |
|
|
# Hugging Face Hub id of the pretrained NeMo FastConformer RNNT checkpoint.
MODEL_NAME = "ARTPARK-IISc/Vaani-FastConformer-Multilingual"


print("Loading model, this may take a few minutes...")


# Download (or load from local cache) the pretrained RNNT model.
model = EncDecRNNTBPEModel.from_pretrained(MODEL_NAME)


# Inference-only mode: disables dropout and uses eval batch-norm statistics.
model.eval()


# NOTE(review): presumably from_pretrained places the model on GPU when CUDA
# is available -- confirm; here we only force CPU when no GPU exists.
if not torch.cuda.is_available():


    model = model.cpu()


print("Model loaded successfully.")


# Sample rate (Hz) the model expects; inputs are resampled to this rate.
TARGET_SR = 16000
|
|
|
|
|
def resample_if_needed(audio, sr, target_sr=16000):
    """Resample a 1-D waveform to ``target_sr`` Hz when necessary.

    Parameters
    ----------
    audio : np.ndarray
        1-D float32 waveform.  (2-D input would be resampled along the wrong
        axis -- callers downmix to mono first.)
    sr : int
        Sample rate of ``audio`` in Hz.
    target_sr : int, optional
        Desired output rate.  Defaults to 16000, the rate the model expects
        (same value as the module-level ``TARGET_SR``); parameterized so the
        helper no longer hard-depends on that global.

    Returns
    -------
    np.ndarray
        The waveform at ``target_sr``.  The input array is returned unchanged
        (same object) when no resampling is needed.
    """
    if sr == target_sr:
        return audio

    # torchaudio transforms operate on (channels, time) tensors,
    # so add a leading channel dimension and strip it afterwards.
    waveform = torch.from_numpy(audio).unsqueeze(0)

    resampler = torchaudio.transforms.Resample(
        orig_freq=sr,
        new_freq=target_sr,
    )

    return resampler(waveform).squeeze(0).numpy()
|
|
|
|
|
|
|
|
def transcribe(audio_input):
    """Run ASR on a Gradio audio payload and return the transcript.

    Parameters
    ----------
    audio_input : tuple[int, np.ndarray] | None
        ``(sample_rate, samples)`` as produced by ``gr.Audio(type="numpy")``,
        or ``None`` when the user supplied no audio.

    Returns
    -------
    str
        Decoded text, or ``""`` when there is no audio / no hypothesis.
    """
    if audio_input is None:
        return ""

    sample_rate, samples = audio_input

    # Downmix stereo (samples, channels) recordings to a mono track.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    # Peak-normalize into [-1, 1]; the epsilon guards against all-zero input.
    samples = samples.astype(np.float32)
    peak = np.abs(samples).max() + 1e-9
    samples = samples / peak

    samples = resample_if_needed(samples, sample_rate)

    hypotheses = model.transcribe(
        audio=[samples],
        return_hypotheses=True,
    )

    if not hypotheses:
        return ""
    return hypotheses[0].text
|
|
|
|
|
|
|
|
# Gradio UI: audio in (microphone or file upload), transcription text out.
# Fix: the previous description claimed "Upload a WAV file" even though the
# interface also accepts microphone recordings (sources below).
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",  # delivers (sample_rate, np.ndarray) to `transcribe`
        label="Record or upload WAV audio"
    ),
    outputs=gr.Textbox(label="Transcription"),
    title="Vaani Multilingual ASR (NeMo RNNT)",
    description="Record or upload audio and get the multilingual ASR transcription."
)


demo.launch()
|
|
|