Spaces:
Running
Running
Luis J Camargo committed on
Commit ·
a22ea4f
1
Parent(s): 90f1441
feat: Add detailed logging for audio processing and inference, remove unused imports, and adjust Gradio launch configuration.
Browse files- app.py +29 -3
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
|
@@ -80,15 +81,27 @@ model.eval()
|
|
| 80 |
|
| 81 |
print("Model loaded successfully!")
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
# === INFERENCE FUNCTION ===
|
| 84 |
def predict_language(audio):
|
| 85 |
if audio is None:
|
| 86 |
return "⚠️ No audio provided", "", ""
|
| 87 |
|
|
|
|
|
|
|
|
|
|
| 88 |
sample_rate, audio_array = audio
|
| 89 |
audio_len_sec = len(audio_array) / sample_rate
|
|
|
|
| 90 |
print(f"\n--- [LOG] New Request ---")
|
| 91 |
-
print(f"[LOG]
|
|
|
|
| 92 |
|
| 93 |
# Normalization
|
| 94 |
print("[LOG] Step 1: Normalizing audio...")
|
|
@@ -96,12 +109,15 @@ def predict_language(audio):
|
|
| 96 |
audio_array = audio_array.astype(np.float32) / 32768.0
|
| 97 |
elif audio_array.dtype == np.int32:
|
| 98 |
audio_array = audio_array.astype(np.float32) / 2147483648.0
|
|
|
|
| 99 |
|
| 100 |
# Resampling
|
| 101 |
if sample_rate != 16000:
|
| 102 |
print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
|
| 103 |
import librosa
|
|
|
|
| 104 |
audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
|
|
|
|
| 105 |
|
| 106 |
# Preprocessing
|
| 107 |
print("[LOG] Step 3: Extracting features...")
|
|
@@ -110,12 +126,21 @@ def predict_language(audio):
|
|
| 110 |
sampling_rate=16000,
|
| 111 |
return_tensors="pt"
|
| 112 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
# Inference
|
| 115 |
-
print("[LOG] Step 4: Running model inference
|
| 116 |
with torch.no_grad():
|
| 117 |
outputs = model(input_features=inputs.input_features)
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
# Post-processing
|
| 120 |
print("[LOG] Step 5: Post-processing results...")
|
| 121 |
fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
|
|
@@ -130,7 +155,7 @@ def predict_language(audio):
|
|
| 130 |
super_conf = super_probs[0, super_idx].item()
|
| 131 |
code_conf = code_probs[0, code_idx].item()
|
| 132 |
|
| 133 |
-
print(f"[LOG]
|
| 134 |
print(f"--- [LOG] Request Finished ---\n")
|
| 135 |
|
| 136 |
# Formatting results
|
|
@@ -140,6 +165,7 @@ def predict_language(audio):
|
|
| 140 |
{f"{code_idx}": code_conf}
|
| 141 |
)
|
| 142 |
|
|
|
|
| 143 |
# === UI COMPONENTS ===
|
| 144 |
with gr.Blocks() as demo:
|
| 145 |
gr.HTML(
|
|
|
|
| 1 |
+
import os
|
| 2 |
import gradio as gr
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
|
|
|
| 81 |
|
| 82 |
print("Model loaded successfully!")
|
| 83 |
|
| 84 |
+
import psutil
|
| 85 |
+
import gc
|
| 86 |
+
|
| 87 |
+
def get_mem_usage():
    """Return the current process's resident set size (RSS) in megabytes."""
    # NOTE(review): relies on the module-level `psutil` and `os` imports
    # added in this commit; RSS is reported in bytes, hence the MB divisor.
    current = psutil.Process(os.getpid())
    return current.memory_info().rss / (1024 * 1024)
|
| 90 |
+
|
| 91 |
# === INFERENCE FUNCTION ===
|
| 92 |
def predict_language(audio):
|
| 93 |
if audio is None:
|
| 94 |
return "⚠️ No audio provided", "", ""
|
| 95 |
|
| 96 |
+
gc.collect() # Start clean
|
| 97 |
+
start_mem = get_mem_usage()
|
| 98 |
+
|
| 99 |
sample_rate, audio_array = audio
|
| 100 |
audio_len_sec = len(audio_array) / sample_rate
|
| 101 |
+
|
| 102 |
print(f"\n--- [LOG] New Request ---")
|
| 103 |
+
print(f"[LOG] Start Memory: {start_mem:.2f} MB")
|
| 104 |
+
print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: {sample_rate}")
|
| 105 |
|
| 106 |
# Normalization
|
| 107 |
print("[LOG] Step 1: Normalizing audio...")
|
|
|
|
| 109 |
audio_array = audio_array.astype(np.float32) / 32768.0
|
| 110 |
elif audio_array.dtype == np.int32:
|
| 111 |
audio_array = audio_array.astype(np.float32) / 2147483648.0
|
| 112 |
+
print(f"[LOG] Memory after normalization: {get_mem_usage():.2f} MB")
|
| 113 |
|
| 114 |
# Resampling
|
| 115 |
if sample_rate != 16000:
|
| 116 |
print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
|
| 117 |
import librosa
|
| 118 |
+
# Use res_type="kaiser_fast" to save memory/cpu if needed, but default is usually fine
|
| 119 |
audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
|
| 120 |
+
print(f"[LOG] Memory after resampling: {get_mem_usage():.2f} MB")
|
| 121 |
|
| 122 |
# Preprocessing
|
| 123 |
print("[LOG] Step 3: Extracting features...")
|
|
|
|
| 126 |
sampling_rate=16000,
|
| 127 |
return_tensors="pt"
|
| 128 |
)
|
| 129 |
+
# Delete raw audio array immediately as it's now in 'inputs'
|
| 130 |
+
del audio_array
|
| 131 |
+
gc.collect()
|
| 132 |
+
print(f"[LOG] Memory after preprocessing: {get_mem_usage():.2f} MB")
|
| 133 |
|
| 134 |
# Inference
|
| 135 |
+
print("[LOG] Step 4: Running model inference...")
|
| 136 |
with torch.no_grad():
|
| 137 |
outputs = model(input_features=inputs.input_features)
|
| 138 |
|
| 139 |
+
# Cleanup inputs
|
| 140 |
+
del inputs
|
| 141 |
+
gc.collect()
|
| 142 |
+
print(f"[LOG] Memory after inference: {get_mem_usage():.2f} MB")
|
| 143 |
+
|
| 144 |
# Post-processing
|
| 145 |
print("[LOG] Step 5: Post-processing results...")
|
| 146 |
fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
|
|
|
|
| 155 |
super_conf = super_probs[0, super_idx].item()
|
| 156 |
code_conf = code_probs[0, code_idx].item()
|
| 157 |
|
| 158 |
+
print(f"[LOG] Final Memory: {get_mem_usage():.2f} MB")
|
| 159 |
print(f"--- [LOG] Request Finished ---\n")
|
| 160 |
|
| 161 |
# Formatting results
|
|
|
|
| 165 |
{f"{code_idx}": code_conf}
|
| 166 |
)
|
| 167 |
|
| 168 |
+
|
| 169 |
# === UI COMPONENTS ===
|
| 170 |
with gr.Blocks() as demo:
|
| 171 |
gr.HTML(
|
requirements.txt
CHANGED
|
@@ -4,4 +4,5 @@ transformers
|
|
| 4 |
numpy
|
| 5 |
librosa
|
| 6 |
huggingface_hub
|
| 7 |
-
safetensors
|
|
|
|
|
|
| 4 |
numpy
|
| 5 |
librosa
|
| 6 |
huggingface_hub
|
| 7 |
+
safetensors
|
| 8 |
+
psutil
|