# Source: Hugging Face Space upload by SujithPulikodan ("Upload 3 files", commit 7b7d2ab, verified)
import torch
import gradio as gr
from nemo.collections.asr.models import EncDecRNNTBPEModel
import soundfile as sf
import numpy as np
import torchaudio
# Hugging Face model ID for the Vaani multilingual FastConformer RNNT ASR model.
MODEL_NAME = "ARTPARK-IISc/Vaani-FastConformer-Multilingual"
print("Loading model, this may take a few minutes...")
# Downloads (on first run) and instantiates the pretrained NeMo RNNT model.
model = EncDecRNNTBPEModel.from_pretrained(MODEL_NAME)
# Inference mode: disables dropout/batch-norm training behavior.
model.eval()
# Use CPU if GPU is not available
if not torch.cuda.is_available():
    model = model.cpu()
print("Model loaded successfully.")
# Sample rate (Hz) the model expects; inputs are resampled to this before transcription.
TARGET_SR = 16000
def resample_if_needed(audio, sr):
    """Return *audio* at TARGET_SR Hz.

    Passes the array through untouched when it is already at the target
    rate; otherwise resamples it with torchaudio.

    Args:
        audio: 1-D numpy array of float samples.
        sr: sample rate (Hz) of *audio*.

    Returns:
        1-D numpy array sampled at TARGET_SR.
    """
    if sr != TARGET_SR:
        # torchaudio expects a (channels, time) tensor, so add a channel axis.
        waveform = torch.from_numpy(audio).unsqueeze(0)
        to_target = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)
        audio = to_target(waveform).squeeze(0).numpy()
    return audio
def transcribe(audio_input):
    """Gradio callback: transcribe a recorded/uploaded clip.

    Args:
        audio_input: ``(sample_rate, audio_array)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when no audio was given.

    Returns:
        The transcription string, or ``""`` for missing input / no hypothesis.
    """
    if audio_input is None:
        return ""
    sr, audio = audio_input
    # Downmix stereo to mono by averaging the channel axis.
    if audio.ndim == 2:
        audio = np.mean(audio, axis=1)
    # Work in float32 and peak-normalize; the epsilon guards all-silent clips.
    audio = audio.astype(np.float32)
    peak = np.max(np.abs(audio)) + 1e-9
    audio = audio / peak
    # The model expects 16 kHz input.
    audio = resample_if_needed(audio, sr)
    hypotheses = model.transcribe(audio=[audio], return_hypotheses=True)
    # NOTE(review): assumes transcribe() returns a list of hypothesis objects
    # exposing .text — confirm against the installed NeMo version.
    if not hypotheses:
        return ""
    return hypotheses[0].text
# Gradio UI: one audio input (microphone or file upload) -> one transcription textbox.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        # "numpy" delivers audio to the callback as a (sample_rate, ndarray) tuple.
        type="numpy",
        label="Record or upload WAV audio"
    ),
    outputs=gr.Textbox(label="Transcription"),
    title="Vaani Multilingual ASR (NeMo RNNT)",
    description="Upload a WAV file and get the multilingual ASR transcription."
)
# Starts the Gradio web server (blocks until shut down).
demo.launch()