"""Gradio demo: multilingual speech recognition with the Vaani FastConformer
RNNT model (NVIDIA NeMo)."""
import torch
import gradio as gr
from nemo.collections.asr.models import EncDecRNNTBPEModel
import soundfile as sf
import numpy as np
import torchaudio
# Hugging Face model id of the pretrained multilingual FastConformer RNNT model.
MODEL_NAME = "ARTPARK-IISc/Vaani-FastConformer-Multilingual"
print("Loading model, this may take a few minutes...")
# Downloads (on first run) and instantiates the NeMo RNNT ASR model.
model = EncDecRNNTBPEModel.from_pretrained(MODEL_NAME)
model.eval()  # inference mode: disables dropout / training-only behavior
# Use CPU if GPU is not available
if not torch.cuda.is_available():
    model = model.cpu()
print("Model loaded successfully.")
# Sample rate (Hz) the model expects; all input audio is resampled to this.
TARGET_SR = 16000
def resample_if_needed(audio, sr, target_sr=None):
    """Resample a mono waveform to ``target_sr`` if its rate differs.

    Parameters
    ----------
    audio : np.ndarray
        1-D float waveform of shape (T,).
    sr : int
        Sample rate of ``audio`` in Hz.
    target_sr : int, optional
        Desired sample rate; defaults to the module-level ``TARGET_SR``
        (16 kHz), preserving the original behavior.

    Returns
    -------
    np.ndarray
        The input array unchanged when ``sr`` already matches, otherwise a
        newly allocated resampled array.
    """
    if target_sr is None:
        target_sr = TARGET_SR
    if sr == target_sr:
        return audio
    # torchaudio's Resample expects a (channels, time) tensor.
    audio_tensor = torch.from_numpy(audio).unsqueeze(0)  # (1, T)
    resampler = torchaudio.transforms.Resample(
        orig_freq=sr,
        new_freq=target_sr
    )
    audio_resampled = resampler(audio_tensor)
    return audio_resampled.squeeze(0).numpy()
def transcribe(audio_input):
    """Transcribe one audio clip with the global NeMo RNNT model.

    Parameters
    ----------
    audio_input : tuple[int, np.ndarray] | None
        (sample_rate, audio_array) as produced by ``gr.Audio(type="numpy")``,
        or ``None`` when the user submitted no audio.

    Returns
    -------
    str
        Best-hypothesis transcription, or "" for missing/empty audio.
    """
    if audio_input is None:
        return ""
    sr, audio = audio_input
    # Convert stereo → mono by averaging channels.
    if audio.ndim == 2:
        audio = np.mean(audio, axis=1)
    # Convert to float32 (gradio typically delivers int16 PCM).
    audio = audio.astype(np.float32)
    # Guard: np.max on an empty array raises ValueError, and the model
    # cannot transcribe zero-length input anyway.
    if audio.size == 0:
        return ""
    # Peak-normalize to [-1, 1]; epsilon avoids division by zero on silence.
    audio = audio / (np.max(np.abs(audio)) + 1e-9)
    # Resample to 16kHz if needed
    audio = resample_if_needed(audio, sr)
    hypotheses = model.transcribe(
        audio=[audio],
        return_hypotheses=True
    )
    return hypotheses[0].text if hypotheses else ""
# Gradio UI: one audio input (mic or file) wired to the transcription function.
audio_in = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="Record or upload WAV audio",
)
text_out = gr.Textbox(label="Transcription")

demo = gr.Interface(
    fn=transcribe,
    inputs=audio_in,
    outputs=text_out,
    title="Vaani Multilingual ASR (NeMo RNNT)",
    description="Upload a WAV file and get the multilingual ASR transcription.",
)
demo.launch()