"""Gradio demo for the Vaani multilingual ASR model (NeMo RNNT).

Accepts recorded or uploaded audio, converts it to mono float32,
peak-normalizes it, resamples to 16 kHz, and returns the model's
transcription.
"""

import gradio as gr
import numpy as np
import soundfile as sf  # NOTE(review): unused in this file — confirm before removing
import torch
import torchaudio
from nemo.collections.asr.models import EncDecRNNTBPEModel

MODEL_NAME = "ARTPARK-IISc/Vaani-FastConformer-Multilingual"
TARGET_SR = 16000  # FastConformer models expect 16 kHz input

print("Loading model, this may take a few minutes...")
model = EncDecRNNTBPEModel.from_pretrained(MODEL_NAME)
model.eval()

# Use CPU if GPU is not available
if not torch.cuda.is_available():
    model = model.cpu()
print("Model loaded successfully.")


def resample_if_needed(audio: np.ndarray, sr: int) -> np.ndarray:
    """Resample a 1-D float32 waveform to TARGET_SR; no-op when already there.

    Args:
        audio: mono waveform, shape (T,), dtype float32.
        sr: source sample rate in Hz.

    Returns:
        The waveform at TARGET_SR, shape (T',), dtype float32.
    """
    if sr == TARGET_SR:
        return audio
    audio_tensor = torch.from_numpy(audio).unsqueeze(0)  # (1, T)
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)
    return resampler(audio_tensor).squeeze(0).numpy()


def _hypothesis_text(hyp) -> str:
    """Extract text from one NeMo hypothesis (Hypothesis object or plain str)."""
    return hyp if isinstance(hyp, str) else getattr(hyp, "text", "")


def transcribe(audio_input):
    """Transcribe a Gradio audio tuple ``(sample_rate, np.ndarray)``.

    Returns:
        The transcription string, or "" for missing/empty audio.
    """
    if audio_input is None:
        return ""
    sr, audio = audio_input

    # Stereo (T, 2) → mono (T,)
    if audio.ndim == 2:
        audio = np.mean(audio, axis=1)

    audio = audio.astype(np.float32)

    # BUG FIX: an empty recording made np.max raise ValueError on an
    # empty array — bail out early instead of crashing the UI.
    if audio.size == 0:
        return ""

    # Peak-normalize; epsilon guards against all-zero (silent) input.
    audio = audio / (np.max(np.abs(audio)) + 1e-9)

    audio = resample_if_needed(audio, sr)

    # inference_mode avoids autograd bookkeeping during decoding.
    with torch.inference_mode():
        hypotheses = model.transcribe(audio=[audio], return_hypotheses=True)

    # Some NeMo versions return (best_hyps, all_hyps) for RNNT models,
    # and elements may be Hypothesis objects or plain strings — handle both.
    if isinstance(hypotheses, tuple):
        hypotheses = hypotheses[0]
    return _hypothesis_text(hypotheses[0]) if hypotheses else ""


demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Record or upload WAV audio",
    ),
    outputs=gr.Textbox(label="Transcription"),
    title="Vaani Multilingual ASR (NeMo RNNT)",
    description="Upload a WAV file and get the multilingual ASR transcription.",
)

demo.launch()