Spaces:

trysem
/

KASR

Runtime error

App Files Community

trysem committed 22 days ago

Commit

4e887cc

verified ·

1 Parent(s): 20f178a

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -16

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import shutil
 import tarfile
 import torch
-import torchaudio
 import torchaudio.functional as F
 from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
@@ -48,7 +48,6 @@ os.makedirs(patched_dir, exist_ok=True)
 for item in os.listdir(model_dir):
     s = os.path.join(model_dir, item)
     d = os.path.join(patched_dir, item)
-    # Only copy files (ignores hidden cache directories)
     if os.path.isfile(s):
         shutil.copy2(s, d)
@@ -71,23 +70,32 @@ def transcribe(audio_path):
         return "Please upload or record audio."
     try:
-        # 1. Load the audio file that Gradio provides
-        waveform, sample_rate = torchaudio.load(audio_path)
-        # 2. Convert to Mono (average the channels if it's stereo)
         if waveform.shape[0] > 1:
             waveform = torch.mean(waveform, dim=0, keepdim=True)
-        # 3. Convert to 16000 Hz (Standard for NeMo models)
         if sample_rate != 16000:
             waveform = F.resample(waveform, sample_rate, 16000)
             sample_rate = 16000
-        # 4. Save the cleaned audio to a temporary file
         processed_path = audio_path + "_mono_16k.wav"
-        torchaudio.save(processed_path, waveform, sample_rate)
-        # 5. Pass the strictly formatted audio to the model
         transcription = model.transcribe(paths2audio_files=[processed_path])[0]
         if isinstance(transcription, list):
@@ -111,10 +119,4 @@ with gr.Blocks(title="Malayalam FastConformer ASR") as demo:
             text_output = gr.Textbox(label="Transcription", lines=5)
     transcribe_btn.click(
-        fn=transcribe,
-        inputs=audio_input,
-        outputs=text_output
-    )
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import shutil
 import tarfile
 import torch
+import soundfile as sf
 import torchaudio.functional as F
 from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 for item in os.listdir(model_dir):
     s = os.path.join(model_dir, item)
     d = os.path.join(patched_dir, item)
     if os.path.isfile(s):
         shutil.copy2(s, d)
         return "Please upload or record audio."
     try:
+        # 1. Load file using soundfile to completely bypass torchcodec bugs
+        data, sample_rate = sf.read(audio_path)
+        waveform = torch.from_numpy(data).float()
+        # 2. Reshape soundfile format [time, channels] to torchaudio format [channels, time]
+        if waveform.ndim == 1:
+            waveform = waveform.unsqueeze(0)  # Mono: [time] -> [1, time]
+        else:
+            waveform = waveform.transpose(0, 1)  # Stereo: [time, channels] -> [channels, time]
+        # 3. Convert to Mono if stereo
         if waveform.shape[0] > 1:
             waveform = torch.mean(waveform, dim=0, keepdim=True)
+        # 4. Resample to 16000 Hz if necessary
         if sample_rate != 16000:
             waveform = F.resample(waveform, sample_rate, 16000)
             sample_rate = 16000
+        # 5. Write the file back out using soundfile
         processed_path = audio_path + "_mono_16k.wav"
+        # soundfile expects mono arrays to be flat 1D: [time]
+        flat_numpy_waveform = waveform.squeeze(0).numpy()
+        sf.write(processed_path, flat_numpy_waveform, 16000)
+        # 6. Pass to NeMo model
         transcription = model.transcribe(paths2audio_files=[processed_path])[0]
         if isinstance(transcription, list):
             text_output = gr.Textbox(label="Transcription", lines=5)
     transcribe_btn.click(
+        fn=transcribe,