Thanh-Lam committed on
Commit
2c4c515
·
1 Parent(s): 3799c1c

Use librosa instead of torchaudio for audio loading

Browse files
Files changed (1) hide show
  1. app.py +5 -18
app.py CHANGED
@@ -5,7 +5,8 @@ Supports: Vietnamese Wav2Vec2 and PhoWhisper encoders
5
 
6
  import os
7
  import torch
8
- import torchaudio
 
9
  import gradio as gr
10
  from pathlib import Path
11
  from safetensors.torch import load_file as load_safetensors
@@ -128,29 +129,15 @@ class MultiModelProfiler:
128
  processor = self.processors[model_name]
129
  is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
130
 
131
- # Load audio
132
- waveform, sr = torchaudio.load(audio_path)
133
-
134
- # Convert to mono
135
- if waveform.shape[0] > 1:
136
- waveform = waveform.mean(dim=0, keepdim=True)
137
-
138
- # Resample if needed
139
- if sr != self.sampling_rate:
140
- resampler = torchaudio.transforms.Resample(sr, self.sampling_rate)
141
- waveform = resampler(waveform)
142
-
143
- waveform = waveform.squeeze(0).numpy()
144
 
145
  # Process based on model type
146
  if is_whisper:
147
  # Whisper requires exactly 30 seconds of audio
148
  whisper_length = self.sampling_rate * 30 # 480000 samples
149
  if len(waveform) < whisper_length:
150
- waveform_padded = torch.nn.functional.pad(
151
- torch.tensor(waveform),
152
- (0, whisper_length - len(waveform))
153
- ).numpy()
154
  else:
155
  waveform_padded = waveform[:whisper_length]
156
 
 
5
 
6
  import os
7
  import torch
8
+ import librosa
9
+ import numpy as np
10
  import gradio as gr
11
  from pathlib import Path
12
  from safetensors.torch import load_file as load_safetensors
 
129
  processor = self.processors[model_name]
130
  is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
131
 
132
+ # Load audio using librosa (more compatible)
133
+ waveform, sr = librosa.load(audio_path, sr=self.sampling_rate, mono=True)
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  # Process based on model type
136
  if is_whisper:
137
  # Whisper requires exactly 30 seconds of audio
138
  whisper_length = self.sampling_rate * 30 # 480000 samples
139
  if len(waveform) < whisper_length:
140
+ waveform_padded = np.pad(waveform, (0, whisper_length - len(waveform)))
 
 
 
141
  else:
142
  waveform_padded = waveform[:whisper_length]
143