Luis J Camargo committed on
Commit
124a2d5
·
1 Parent(s): a22ea4f

refactor: delegate audio normalization and resampling to the processor.

Browse files
Files changed (1) hide show
  1. app.py +4 -18
app.py CHANGED
@@ -103,28 +103,14 @@ def predict_language(audio):
103
  print(f"[LOG] Start Memory: {start_mem:.2f} MB")
104
  print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: {sample_rate}")
105
 
106
- # Normalization
107
- print("[LOG] Step 1: Normalizing audio...")
108
- if audio_array.dtype == np.int16:
109
- audio_array = audio_array.astype(np.float32) / 32768.0
110
- elif audio_array.dtype == np.int32:
111
- audio_array = audio_array.astype(np.float32) / 2147483648.0
112
- print(f"[LOG] Memory after normalization: {get_mem_usage():.2f} MB")
113
-
114
- # Resampling
115
- if sample_rate != 16000:
116
- print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
117
- import librosa
118
- # Use res_type="kaiser_fast" to save memory/cpu if needed, but default is usually fine
119
- audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
120
- print(f"[LOG] Memory after resampling: {get_mem_usage():.2f} MB")
121
-
122
  # Preprocessing
123
  print("[LOG] Step 3: Extracting features...")
124
  inputs = processor(
125
  audio_array,
126
- sampling_rate=16000,
127
- return_tensors="pt"
 
 
128
  )
129
  # Delete raw audio array immediately as it's now in 'inputs'
130
  del audio_array
 
103
  print(f"[LOG] Start Memory: {start_mem:.2f} MB")
104
  print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: {sample_rate}")
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # Preprocessing
107
  print("[LOG] Step 3: Extracting features...")
108
  inputs = processor(
109
  audio_array,
110
+ sampling_rate=sample_rate,
111
+ do_normalize=True,
112
+ device="cpu",
113
+ return_tensors="pt",
114
  )
115
  # Delete raw audio array immediately as it's now in 'inputs'
116
  del audio_array