Spaces:

tachiwin
/

classifier

Sleeping

Luis J Camargo committed 8 days ago

Commit

124a2d5

1 Parent(s): a22ea4f

refactor: delegate audio normalization and resampling to the processor.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -103,28 +103,14 @@ def predict_language(audio):
     print(f"[LOG] Start Memory: {start_mem:.2f} MB")
     print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: {sample_rate}")
-    # Normalization
-    print("[LOG] Step 1: Normalizing audio...")
-    if audio_array.dtype == np.int16:
-        audio_array = audio_array.astype(np.float32) / 32768.0
-    elif audio_array.dtype == np.int32:
-        audio_array = audio_array.astype(np.float32) / 2147483648.0
-    print(f"[LOG] Memory after normalization: {get_mem_usage():.2f} MB")
-    # Resampling
-    if sample_rate != 16000:
-        print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
-        import librosa
-        # Use res_type="kaiser_fast" to save memory/cpu if needed, but default is usually fine
-        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
-        print(f"[LOG] Memory after resampling: {get_mem_usage():.2f} MB")
     # Preprocessing
     print("[LOG] Step 3: Extracting features...")
     inputs = processor(
         audio_array,
-        sampling_rate=16000,
-        return_tensors="pt"
     )
     # Delete raw audio array immediately as it's now in 'inputs'
     del audio_array

     print(f"[LOG] Start Memory: {start_mem:.2f} MB")
     print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: {sample_rate}")
     # Preprocessing
     print("[LOG] Step 3: Extracting features...")
     inputs = processor(
         audio_array,
+        sampling_rate=sample_rate,
+        do_normalize=True,
+        device="cpu",
+        return_tensors="pt",
     )
     # Delete raw audio array immediately as it's now in 'inputs'
     del audio_array