Update app.py
Browse files
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import os
|
|
| 3 |
import shutil
|
| 4 |
import tarfile
|
| 5 |
import torch
|
| 6 |
-
import
|
| 7 |
import torchaudio.functional as F
|
| 8 |
from huggingface_hub import snapshot_download
|
| 9 |
from omegaconf import OmegaConf
|
|
@@ -48,7 +48,6 @@ os.makedirs(patched_dir, exist_ok=True)
|
|
| 48 |
for item in os.listdir(model_dir):
|
| 49 |
s = os.path.join(model_dir, item)
|
| 50 |
d = os.path.join(patched_dir, item)
|
| 51 |
-
# Only copy files (ignores hidden cache directories)
|
| 52 |
if os.path.isfile(s):
|
| 53 |
shutil.copy2(s, d)
|
| 54 |
|
|
@@ -71,23 +70,32 @@ def transcribe(audio_path):
|
|
| 71 |
return "Please upload or record audio."
|
| 72 |
|
| 73 |
try:
|
| 74 |
-
# 1. Load
|
| 75 |
-
|
|
|
|
| 76 |
|
| 77 |
-
# 2.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
if waveform.shape[0] > 1:
|
| 79 |
waveform = torch.mean(waveform, dim=0, keepdim=True)
|
| 80 |
|
| 81 |
-
#
|
| 82 |
if sample_rate != 16000:
|
| 83 |
waveform = F.resample(waveform, sample_rate, 16000)
|
| 84 |
sample_rate = 16000
|
| 85 |
|
| 86 |
-
#
|
| 87 |
processed_path = audio_path + "_mono_16k.wav"
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
#
|
| 91 |
transcription = model.transcribe(paths2audio_files=[processed_path])[0]
|
| 92 |
|
| 93 |
if isinstance(transcription, list):
|
|
@@ -111,10 +119,4 @@ with gr.Blocks(title="Malayalam FastConformer ASR") as demo:
|
|
| 111 |
text_output = gr.Textbox(label="Transcription", lines=5)
|
| 112 |
|
| 113 |
transcribe_btn.click(
|
| 114 |
-
fn=transcribe,
|
| 115 |
-
inputs=audio_input,
|
| 116 |
-
outputs=text_output
|
| 117 |
-
)
|
| 118 |
-
|
| 119 |
-
if __name__ == "__main__":
|
| 120 |
-
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 3 |
import shutil
|
| 4 |
import tarfile
|
| 5 |
import torch
|
| 6 |
+
import soundfile as sf
|
| 7 |
import torchaudio.functional as F
|
| 8 |
from huggingface_hub import snapshot_download
|
| 9 |
from omegaconf import OmegaConf
|
|
|
|
| 48 |
for item in os.listdir(model_dir):
|
| 49 |
s = os.path.join(model_dir, item)
|
| 50 |
d = os.path.join(patched_dir, item)
|
|
|
|
| 51 |
if os.path.isfile(s):
|
| 52 |
shutil.copy2(s, d)
|
| 53 |
|
|
|
|
| 70 |
return "Please upload or record audio."
|
| 71 |
|
| 72 |
try:
|
| 73 |
+
# 1. Load file using soundfile to completely bypass torchcodec bugs
|
| 74 |
+
data, sample_rate = sf.read(audio_path)
|
| 75 |
+
waveform = torch.from_numpy(data).float()
|
| 76 |
|
| 77 |
+
# 2. Reshape soundfile format [time, channels] to torchaudio format [channels, time]
|
| 78 |
+
if waveform.ndim == 1:
|
| 79 |
+
waveform = waveform.unsqueeze(0) # Mono: [time] -> [1, time]
|
| 80 |
+
else:
|
| 81 |
+
waveform = waveform.transpose(0, 1) # Stereo: [time, channels] -> [channels, time]
|
| 82 |
+
|
| 83 |
+
# 3. Convert to Mono if stereo
|
| 84 |
if waveform.shape[0] > 1:
|
| 85 |
waveform = torch.mean(waveform, dim=0, keepdim=True)
|
| 86 |
|
| 87 |
+
# 4. Resample to 16000 Hz if necessary
|
| 88 |
if sample_rate != 16000:
|
| 89 |
waveform = F.resample(waveform, sample_rate, 16000)
|
| 90 |
sample_rate = 16000
|
| 91 |
|
| 92 |
+
# 5. Write the file back out using soundfile
|
| 93 |
processed_path = audio_path + "_mono_16k.wav"
|
| 94 |
+
# soundfile expects mono arrays to be flat 1D: [time]
|
| 95 |
+
flat_numpy_waveform = waveform.squeeze(0).numpy()
|
| 96 |
+
sf.write(processed_path, flat_numpy_waveform, 16000)
|
| 97 |
|
| 98 |
+
# 6. Pass to NeMo model
|
| 99 |
transcription = model.transcribe(paths2audio_files=[processed_path])[0]
|
| 100 |
|
| 101 |
if isinstance(transcription, list):
|
|
|
|
| 119 |
text_output = gr.Textbox(label="Transcription", lines=5)
|
| 120 |
|
| 121 |
transcribe_btn.click(
|
| 122 |
+
fn=transcribe,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|