SujithPulikodan commited on
Commit
7b7d2ab
·
verified ·
1 Parent(s): 8e1bba7

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +4 -5
  2. app.py +77 -0
  3. requirements.txt +40 -0
README.md CHANGED
@@ -1,14 +1,13 @@
1
  ---
2
  title: Vaani FastConformer Multilingual ASR
3
- emoji:
4
- colorFrom: blue
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 6.5.1
 
8
  app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: 'Speech-to-text across multiple Indian languages '
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Vaani FastConformer Multilingual ASR
3
+ emoji: 🚀
4
+ colorFrom: red
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 6.5.1
8
+ python_version: 3.10.0
9
  app_file: app.py
10
+ pinned: false
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Gradio demo app for the Vaani FastConformer multilingual ASR model (NeMo RNNT)."""

import torch
import gradio as gr
from nemo.collections.asr.models import EncDecRNNTBPEModel
import soundfile as sf  # NOTE(review): appears unused in this file — confirm before removing
import numpy as np
import torchaudio

# Hugging Face Hub id of the pretrained multilingual FastConformer RNNT checkpoint.
MODEL_NAME = "ARTPARK-IISc/Vaani-FastConformer-Multilingual"

print("Loading model, this may take a few minutes...")
# Downloads (on first run) and instantiates the NeMo RNNT model at import time.
model = EncDecRNNTBPEModel.from_pretrained(MODEL_NAME)
model.eval()  # inference mode: disables dropout / training-only behavior

# Use CPU if GPU is not available
if not torch.cuda.is_available():
    model = model.cpu()
print("Model loaded successfully.")

# Sample rate (Hz) the model expects; all incoming audio is resampled to this.
TARGET_SR = 16000
20
def resample_if_needed(audio, sr):
    """Return ``audio`` at TARGET_SR, resampling only when ``sr`` differs.

    Args:
        audio: 1-D numpy array of audio samples.
        sr: sample rate of ``audio`` in Hz.

    Returns:
        1-D numpy array at TARGET_SR.
    """
    if sr != TARGET_SR:
        # torchaudio works on (channels, time) tensors, so add a channel axis,
        # resample, then drop the axis and convert back to numpy.
        waveform = torch.from_numpy(audio).unsqueeze(0)
        transform = torchaudio.transforms.Resample(
            orig_freq=sr,
            new_freq=TARGET_SR,
        )
        audio = transform(waveform).squeeze(0).numpy()
    return audio
34
+
35
def transcribe(audio_input):
    """Transcribe recorded/uploaded audio with the NeMo RNNT model.

    Args:
        audio_input: Gradio ``type="numpy"`` audio value — a
            ``(sample_rate, audio_array)`` tuple, or ``None`` when no audio
            was provided.

    Returns:
        The transcription text, or ``""`` when there is nothing to transcribe.
    """
    if audio_input is None:
        return ""

    sr, audio = audio_input

    # FIX: guard zero-length recordings — np.max on an empty array raises
    # ValueError, which previously crashed the request.
    if audio is None or np.size(audio) == 0:
        return ""

    # Convert stereo → mono by averaging channels
    if audio.ndim == 2:
        audio = np.mean(audio, axis=1)

    # Convert to float32 (Gradio commonly delivers int16 PCM)
    audio = audio.astype(np.float32)

    # Peak-normalize; the epsilon avoids division by zero on pure silence
    audio = audio / (np.max(np.abs(audio)) + 1e-9)

    # Resample to 16kHz if needed
    audio = resample_if_needed(audio, sr)

    hypotheses = model.transcribe(
        audio=[audio],
        return_hypotheses=True
    )

    return hypotheses[0].text if hypotheses else ""
64
+
65
# Assemble the web UI: one audio input wired to the transcription function.
audio_input_component = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="Record or upload WAV audio",
)
transcript_component = gr.Textbox(label="Transcription")

demo = gr.Interface(
    fn=transcribe,
    inputs=audio_input_component,
    outputs=transcript_component,
    title="Vaani Multilingual ASR (NeMo RNNT)",
    description="Upload a WAV file and get the multilingual ASR transcription.",
)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.8.0
2
+ lightning==2.4.0
3
+ cloudpickle==3.1.2
4
+ fiddle==0.3.0
5
+ numpy==2.1.0
6
+ nemo-toolkit==2.4.0
7
+ lhotse==1.32.0
8
+ ml-dtypes==0.5.3
9
+ onnx==1.19.0
10
+ librosa
11
+ einops==0.8.1
12
+ soundfile
13
+ gradio
14
+ # omegaconf (duplicate entry — pinned below as omegaconf==2.3.0)
15
+ hydra-core
16
+ sentencepiece
17
+ texterrors
18
+ transformers
19
+ jiwer
20
+ webdataset==1.0.2
21
+ pyannote.core==5.0.0
22
+
23
+ omegaconf==2.3.0
24
+ editdistance
25
+ # pyannote.core (duplicate entry — pinned above as pyannote.core==5.0.0)
26
+
27
+ # Audio
28
+ soxr
29
+
30
+ # Utilities
31
+ tqdm
32
+ packaging
33
+ PyYAML
34
+ requests
35
+ ipython==8.37.0
36
+ pyannote.audio==3.3.2
37
+
38
+ pyannote.database==5.1.3
39
+ pyannote.metrics==3.2.1
40
+ pyannote.pipeline==3.0.1