Spaces:
Runtime error
Runtime error
| import torch | |
| import gradio as gr | |
| from nemo.collections.asr.models import EncDecRNNTBPEModel | |
| import soundfile as sf | |
| import numpy as np | |
| import torchaudio | |
# Name of the pretrained checkpoint handed to ``from_pretrained`` below.
MODEL_NAME = "ARTPARK-IISc/Vaani-FastConformer-Multilingual"

print("Loading model, this may take a few minutes...")

# Load the RNNT BPE model once at import time and switch it to inference mode.
model = EncDecRNNTBPEModel.from_pretrained(MODEL_NAME)
model.eval()

# With no CUDA device present, make sure the weights live on the CPU.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    model = model.cpu()

print("Model loaded successfully.")
# Sample rate, in Hz, that audio is converted to before transcription.
TARGET_SR = 16000


def resample_if_needed(audio, sr):
    """Return ``audio`` at ``TARGET_SR``, resampling only when ``sr`` differs.

    Args:
        audio: NumPy array of samples (the caller in this file passes
            float32 mono audio).
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        The very same array object when ``sr`` already equals ``TARGET_SR``;
        otherwise a new NumPy array produced by torchaudio's ``Resample``.
    """
    if sr != TARGET_SR:
        # The transform is applied to a (1, T) tensor; the leading dimension
        # is added here and removed again before converting back to NumPy.
        batched = torch.from_numpy(audio).unsqueeze(0)
        to_target = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)
        return to_target(batched).squeeze(0).numpy()
    return audio
def _first_hypothesis_text(result):
    """Return the transcript of the first utterance in a ``model.transcribe`` result."""
    # Some NeMo releases return RNNT results as a
    # (best_hypotheses, all_hypotheses) tuple instead of a flat list.
    if isinstance(result, tuple):
        result = result[0] if result else None
    if not result:
        return ""
    first = result[0]
    # Entries may be plain strings rather than Hypothesis objects.
    if isinstance(first, str):
        return first
    return getattr(first, "text", "") or ""


def transcribe(audio_input):
    """Transcribe one Gradio audio clip with the loaded ASR model.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded
            or uploaded. ``samples`` is a NumPy array, either 1-D mono or
            2-D with the channels on axis 1.

    Returns:
        The recognised text, or ``""`` when there is no input, the clip
        contains no samples, or the model yields no hypothesis.
    """
    if audio_input is None:
        return ""

    sr, audio = audio_input

    # A zero-length clip would make np.max() raise in the peak normalisation
    # below, and there is nothing to transcribe anyway.
    if audio.size == 0:
        return ""

    # Convert stereo -> mono by averaging the channels.
    if audio.ndim == 2:
        audio = np.mean(audio, axis=1)

    # Convert to float32 and peak-normalise; the epsilon keeps an all-silent
    # clip from dividing by zero.
    audio = audio.astype(np.float32)
    audio = audio / (np.max(np.abs(audio)) + 1e-9)

    # Resample to 16kHz if needed.
    audio = resample_if_needed(audio, sr)

    hypotheses = model.transcribe(
        audio=[audio],
        return_hypotheses=True,
    )
    return _first_hypothesis_text(hypotheses)
# Input widget: microphone recording or file upload, delivered to the callback
# in Gradio's "numpy" format.
audio_in = gr.Audio(
    label="Record or upload WAV audio",
    type="numpy",
    sources=["microphone", "upload"],
)

# Output widget showing the recognised text.
text_out = gr.Textbox(label="Transcription")

# Wire the transcription callback to the two widgets.
demo = gr.Interface(
    fn=transcribe,
    inputs=audio_in,
    outputs=text_out,
    title="Vaani Multilingual ASR (NeMo RNNT)",
    description="Upload a WAV file and get the multilingual ASR transcription.",
)

# Start the web app as soon as the script runs.
demo.launch()