Spaces:
Sleeping
Sleeping
Luis J Camargo committed on
Commit ·
124a2d5
1
Parent(s): a22ea4f
refactor: delegate audio normalization and resampling to the processor.
Browse files
app.py
CHANGED
|
@@ -103,28 +103,14 @@ def predict_language(audio):
|
|
| 103 |
print(f"[LOG] Start Memory: {start_mem:.2f} MB")
|
| 104 |
print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: {sample_rate}")
|
| 105 |
|
| 106 |
-
# Normalization
|
| 107 |
-
print("[LOG] Step 1: Normalizing audio...")
|
| 108 |
-
if audio_array.dtype == np.int16:
|
| 109 |
-
audio_array = audio_array.astype(np.float32) / 32768.0
|
| 110 |
-
elif audio_array.dtype == np.int32:
|
| 111 |
-
audio_array = audio_array.astype(np.float32) / 2147483648.0
|
| 112 |
-
print(f"[LOG] Memory after normalization: {get_mem_usage():.2f} MB")
|
| 113 |
-
|
| 114 |
-
# Resampling
|
| 115 |
-
if sample_rate != 16000:
|
| 116 |
-
print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
|
| 117 |
-
import librosa
|
| 118 |
-
# Use res_type="kaiser_fast" to save memory/cpu if needed, but default is usually fine
|
| 119 |
-
audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
|
| 120 |
-
print(f"[LOG] Memory after resampling: {get_mem_usage():.2f} MB")
|
| 121 |
-
|
| 122 |
# Preprocessing
|
| 123 |
print("[LOG] Step 3: Extracting features...")
|
| 124 |
inputs = processor(
|
| 125 |
audio_array,
|
| 126 |
-
sampling_rate=16000,
|
| 127 |
-
|
|
|
|
|
|
|
| 128 |
)
|
| 129 |
# Delete raw audio array immediately as it's now in 'inputs'
|
| 130 |
del audio_array
|
|
|
|
| 103 |
print(f"[LOG] Start Memory: {start_mem:.2f} MB")
|
| 104 |
print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: {sample_rate}")
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
# Preprocessing
|
| 107 |
print("[LOG] Step 3: Extracting features...")
|
| 108 |
inputs = processor(
|
| 109 |
audio_array,
|
| 110 |
+
sampling_rate=sample_rate,
|
| 111 |
+
do_normalize=True,
|
| 112 |
+
device="cpu",
|
| 113 |
+
return_tensors="pt",
|
| 114 |
)
|
| 115 |
# Delete raw audio array immediately as it's now in 'inputs'
|
| 116 |
del audio_array
|