trysem committed on
Commit
cd72718
·
verified ·
1 Parent(s): 8390824

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -1
app.py CHANGED
@@ -2,6 +2,9 @@ import gradio as gr
2
  import os
3
  import shutil
4
  import tarfile
 
 
 
5
  from huggingface_hub import snapshot_download
6
  from omegaconf import OmegaConf
7
  from nemo.collections.asr.models import ASRModel
@@ -68,7 +71,24 @@ def transcribe(audio_path):
68
  return "Please upload or record audio."
69
 
70
  try:
71
- transcription = model.transcribe(paths2audio_files=[audio_path])[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  if isinstance(transcription, list):
74
  return transcription[0]
 
2
  import os
3
  import shutil
4
  import tarfile
5
+ import torch
6
+ import torchaudio
7
+ import torchaudio.functional as F
8
  from huggingface_hub import snapshot_download
9
  from omegaconf import OmegaConf
10
  from nemo.collections.asr.models import ASRModel
 
71
  return "Please upload or record audio."
72
 
73
  try:
74
+ # 1. Load the audio file that Gradio provides
75
+ waveform, sample_rate = torchaudio.load(audio_path)
76
+
77
+ # 2. Convert to Mono (average the channels if it's stereo)
78
+ if waveform.shape[0] > 1:
79
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
80
+
81
+ # 3. Convert to 16000 Hz (Standard for NeMo models)
82
+ if sample_rate != 16000:
83
+ waveform = F.resample(waveform, sample_rate, 16000)
84
+ sample_rate = 16000
85
+
86
+ # 4. Save the cleaned audio to a temporary file
87
+ processed_path = audio_path + "_mono_16k.wav"
88
+ torchaudio.save(processed_path, waveform, sample_rate)
89
+
90
+ # 5. Pass the strictly formatted audio to the model
91
+ transcription = model.transcribe(paths2audio_files=[processed_path])[0]
92
 
93
  if isinstance(transcription, list):
94
  return transcription[0]