trysem committed on
Commit
4e887cc
·
verified ·
1 Parent(s): 20f178a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -16
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import shutil
4
  import tarfile
5
  import torch
6
- import torchaudio
7
  import torchaudio.functional as F
8
  from huggingface_hub import snapshot_download
9
  from omegaconf import OmegaConf
@@ -48,7 +48,6 @@ os.makedirs(patched_dir, exist_ok=True)
48
  for item in os.listdir(model_dir):
49
  s = os.path.join(model_dir, item)
50
  d = os.path.join(patched_dir, item)
51
- # Only copy files (ignores hidden cache directories)
52
  if os.path.isfile(s):
53
  shutil.copy2(s, d)
54
 
@@ -71,23 +70,32 @@ def transcribe(audio_path):
71
  return "Please upload or record audio."
72
 
73
  try:
74
- # 1. Load the audio file that Gradio provides
75
- waveform, sample_rate = torchaudio.load(audio_path)
 
76
 
77
- # 2. Convert to Mono (average the channels if it's stereo)
 
 
 
 
 
 
78
  if waveform.shape[0] > 1:
79
  waveform = torch.mean(waveform, dim=0, keepdim=True)
80
 
81
- # 3. Convert to 16000 Hz (Standard for NeMo models)
82
  if sample_rate != 16000:
83
  waveform = F.resample(waveform, sample_rate, 16000)
84
  sample_rate = 16000
85
 
86
- # 4. Save the cleaned audio to a temporary file
87
  processed_path = audio_path + "_mono_16k.wav"
88
- torchaudio.save(processed_path, waveform, sample_rate)
 
 
89
 
90
- # 5. Pass the strictly formatted audio to the model
91
  transcription = model.transcribe(paths2audio_files=[processed_path])[0]
92
 
93
  if isinstance(transcription, list):
@@ -111,10 +119,4 @@ with gr.Blocks(title="Malayalam FastConformer ASR") as demo:
111
  text_output = gr.Textbox(label="Transcription", lines=5)
112
 
113
  transcribe_btn.click(
114
- fn=transcribe,
115
- inputs=audio_input,
116
- outputs=text_output
117
- )
118
-
119
- if __name__ == "__main__":
120
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import shutil
4
  import tarfile
5
  import torch
6
+ import soundfile as sf
7
  import torchaudio.functional as F
8
  from huggingface_hub import snapshot_download
9
  from omegaconf import OmegaConf
 
48
  for item in os.listdir(model_dir):
49
  s = os.path.join(model_dir, item)
50
  d = os.path.join(patched_dir, item)
 
51
  if os.path.isfile(s):
52
  shutil.copy2(s, d)
53
 
 
70
  return "Please upload or record audio."
71
 
72
  try:
73
+ # 1. Load file using soundfile to completely bypass torchcodec bugs
74
+ data, sample_rate = sf.read(audio_path)
75
+ waveform = torch.from_numpy(data).float()
76
 
77
+ # 2. Reshape soundfile format [time, channels] to torchaudio format [channels, time]
78
+ if waveform.ndim == 1:
79
+ waveform = waveform.unsqueeze(0) # Mono: [time] -> [1, time]
80
+ else:
81
+ waveform = waveform.transpose(0, 1) # Stereo: [time, channels] -> [channels, time]
82
+
83
+ # 3. Convert to Mono if stereo
84
  if waveform.shape[0] > 1:
85
  waveform = torch.mean(waveform, dim=0, keepdim=True)
86
 
87
+ # 4. Resample to 16000 Hz if necessary
88
  if sample_rate != 16000:
89
  waveform = F.resample(waveform, sample_rate, 16000)
90
  sample_rate = 16000
91
 
92
+ # 5. Write the file back out using soundfile
93
  processed_path = audio_path + "_mono_16k.wav"
94
+ # soundfile expects mono arrays to be flat 1D: [time]
95
+ flat_numpy_waveform = waveform.squeeze(0).numpy()
96
+ sf.write(processed_path, flat_numpy_waveform, 16000)
97
 
98
+ # 6. Pass to NeMo model
99
  transcription = model.transcribe(paths2audio_files=[processed_path])[0]
100
 
101
  if isinstance(transcription, list):
 
119
  text_output = gr.Textbox(label="Transcription", lines=5)
120
 
121
  transcribe_btn.click(
122
+ fn=transcribe,