Luis J Camargo committed
Commit 69358b9 · Parent(s): 68fabc6

refactor

app.py CHANGED
@@ -1,12 +1,16 @@
 import os
+import gc
 import gradio as gr
 import torch
 import numpy as np
+import librosa
 from transformers import WhisperProcessor, AutoConfig, AutoModel, WhisperConfig, WhisperPreTrainedModel
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
 import torch.nn as nn
 import psutil
-
+
+# --- CONFIGURATION ---
+MAX_AUDIO_SECONDS = 30

 torch.set_num_threads(1)

@@ -25,22 +29,17 @@ class WhisperEncoderOnlyForClassification(WhisperPreTrainedModel):

     def __init__(self, config):
         super().__init__(config)
-
         self.encoder = WhisperEncoder(config)
-
         hidden = config.d_model
         self.fam_head = nn.Linear(hidden, config.n_fam)
         self.super_head = nn.Linear(hidden, config.n_super)
         self.code_head = nn.Linear(hidden, config.n_code)
-
         self.post_init()

     def get_input_embeddings(self):
-        """Whisper doesn't have token embeddings"""
         return None

     def set_input_embeddings(self, value):
-        """Ignore"""
         pass

     def enable_input_require_grads(self):
@@ -80,101 +79,93 @@ MODEL_REPO = "tachiwin/language_classification_enconly_model_2"

 print("Loading model on CPU...")
 processor = WhisperProcessor.from_pretrained(MODEL_REPO)
-model = WhisperEncoderOnlyForClassification.from_pretrained(MODEL_REPO)
+model = WhisperEncoderOnlyForClassification.from_pretrained(
+    MODEL_REPO,
+    low_cpu_mem_usage=True
+)
 model.eval()

 print("Model loaded successfully!")

 def get_mem_usage():
     process = psutil.Process(os.getpid())
-    return process.memory_info().rss / (1024 ** 2)
+    return process.memory_info().rss / (1024 ** 2)

 # === INFERENCE FUNCTION ===
-def predict_language(audio):
-    if audio is None:
-
+def predict_language(audio_path):
+    if not audio_path:
+        raise gr.Error("No audio provided! Please upload or record an audio file.")

-    gc.collect()
+    gc.collect()
     start_mem = get_mem_usage()
-
-    sample_rate, audio_array = audio
-    audio_len_sec = len(audio_array) / sample_rate
-
     print(f"\n--- [LOG] New Request ---")
     print(f"[LOG] Start Memory: {start_mem:.2f} MB")
-    print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: {sample_rate}")
-
-    # Normalization
-    print("[LOG] Step 1: Normalizing audio...")
-    if audio_array.dtype == np.int16:
-        print("was npint16")
-        audio_array = audio_array.astype(np.float32) / 32768.0
-    elif audio_array.dtype == np.int32:
-        print("was npint32")
-        audio_array = audio_array.astype(np.float32) / 2147483648.0
-    print(f"[LOG] Memory after normalization: {get_mem_usage():.2f} MB")
-
-    # Resampling
-    if sample_rate != 16000:
-        print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
-        import librosa
-        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16_000)
-        print(f"[LOG] Memory after resampling: {get_mem_usage():.2f} MB")
-
-    print("[LOG] DID RESAMPLE")
-
-    # Preprocessing
-    print("[LOG] Step 3: Extracting features...")
-    inputs = processor(
-        audio_array,
-        sampling_rate=16_000,
-        do_normalize=True,
-        device="cpu",
-        return_tensors="pt",
-    )
-    print("[LOG] DID EXTRACT")
-
-    # Delete raw audio array immediately as it's now in 'inputs'
-    del audio_array
-    gc.collect()
-    print(f"[LOG] Memory after preprocessing: {get_mem_usage():.2f} MB")
-
-    # Inference
-    print("[LOG] Step 4: Running model inference...")
-    with torch.no_grad():
-        outputs = model(input_features=inputs.input_features)
-
-    # Cleanup inputs
-    del inputs
-    gc.collect()
-    print(f"[LOG] Memory after inference: {get_mem_usage():.2f} MB")
-
-    # Post-processing
-    print("[LOG] Step 5: Post-processing results...")
-    fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
-    super_probs = torch.softmax(outputs["super_logits"], dim=-1)
-    code_probs = torch.softmax(outputs["code_logits"], dim=-1)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        # Load audio directly from filepath. Librosa automatically resamples to sr=16000 and normalizes to float32
+        print("[LOG] Step 1: Loading and resampling audio from file...")
+        audio_array, sample_rate = librosa.load(audio_path, sr=16000)
+
+        audio_len_sec = len(audio_array) / 16000
+        print(f"[LOG] Audio duration: {audio_len_sec:.2f}s, SR: 16000")
+        print(f"[LOG] Memory after load: {get_mem_usage():.2f} MB")
+
+        # Enforce length limit to prevent OOM
+        if audio_len_sec > MAX_AUDIO_SECONDS:
+            del audio_array
+            gc.collect()
+            raise gr.Error(f"Audio too long ({audio_len_sec:.1f}s). Please upload or record up to {MAX_AUDIO_SECONDS} seconds.")
+
+        # Preprocessing
+        print("[LOG] Step 3: Extracting features...")
+        inputs = processor(
+            audio_array,
+            sampling_rate=16000,
+            return_tensors="pt"
+        )
+
+        # Free up the raw audio array
+        del audio_array
+        gc.collect()
+        print(f"[LOG] Memory after preprocessing: {get_mem_usage():.2f} MB")
+
+        # Inference
+        print("[LOG] Step 4: Running model inference...")
+        with torch.no_grad():
+            outputs = model(input_features=inputs.input_features)
+
+        # Free up inputs
+        del inputs
+        gc.collect()
+
+        # Post-processing
+        print("[LOG] Step 5: Post-processing results...")
+        fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
+        super_probs = torch.softmax(outputs["super_logits"], dim=-1)
+        code_probs = torch.softmax(outputs["code_logits"], dim=-1)
+
+        fam_idx = outputs["fam_logits"].argmax(-1).item()
+        super_idx = outputs["super_logits"].argmax(-1).item()
+        code_idx = outputs["code_logits"].argmax(-1).item()
+
+        fam_conf = fam_probs[0, fam_idx].item()
+        super_conf = super_probs[0, super_idx].item()
+        code_conf = code_probs[0, code_idx].item()
+
+        print(f"[LOG] Final Memory: {get_mem_usage():.2f} MB")
+        print(f"--- [LOG] Request Finished ---\n")
+
+        return (
+            {f"{fam_idx}": fam_conf},
+            {f"{super_idx}": super_conf},
+            {f"{code_idx}": code_conf}
+        )
+    except Exception as e:
+        print(f"Error during inference: {e}")
+        raise gr.Error(f"Processing failed: {str(e)}")

 # === UI COMPONENTS ===
-with gr.Blocks() as demo:
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")) as demo:
     gr.HTML(
         """
         <div style="text-align: center; padding: 30px; background: linear-gradient(135deg, #4f46e5 0%, #3b82f6 100%); color: white; border-radius: 15px; margin-bottom: 25px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);">
@@ -189,7 +180,7 @@ with gr.Blocks() as demo:
         gr.Markdown("### 🎙️ 1. Input Audio")
         audio_input = gr.Audio(
             sources=["upload", "microphone"],
-            type="numpy",
+            type="filepath",  # Changed from numpy to filepath
            label="Upload or Record"
         )
     with gr.Row():
@@ -230,8 +221,4 @@ with gr.Blocks() as demo:
     )

 if __name__ == "__main__":
-    demo.launch(
-        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"),
-        ssr_mode=False,
-        show_error=True
-    )
+    demo.launch(ssr_mode=False)
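A note on the loading change: the new code leans on the fact that librosa.load with an explicit sr decodes the file, downmixes to mono, resamples, and normalizes to float32 in a single call, which is what makes the old dtype checks and the conditional librosa.resample branch unnecessary. A minimal sketch of that behavior ("sample.wav" is a placeholder path, not a file in this repo):

import librosa

# librosa.load decodes the file, downmixes to mono, resamples to the
# requested rate, and returns float32 samples in [-1.0, 1.0] -- the same
# result the old code built by hand with int16/int32 scaling plus a
# separate librosa.resample call.
audio_array, sample_rate = librosa.load("sample.wav", sr=16000)

assert audio_array.dtype == "float32"
assert sample_rate == 16000  # holds regardless of the file's native rate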
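Likewise, switching gr.Audio from type="numpy" to type="filepath" changes what Gradio hands to the callback: the path of a temporary audio file (or None when nothing was uploaded or recorded) instead of a (sample_rate, numpy_array) tuple, which is exactly what the new predict_language(audio_path) signature expects. A minimal sketch with a hypothetical stub callback (describe) standing in for the real model:

import gradio as gr

# With type="filepath", the callback receives the path of a temporary copy
# of the audio (a str), or None if no audio was provided -- not the
# (sample_rate, array) tuple that type="numpy" delivers.
def describe(audio_path):
    if not audio_path:
        raise gr.Error("No audio provided!")
    return f"Received file: {audio_path}"

with gr.Blocks() as demo:
    audio = gr.Audio(sources=["upload", "microphone"], type="filepath")
    out = gr.Textbox(label="Result")
    audio.change(describe, inputs=audio, outputs=out)

if __name__ == "__main__":
    demo.launch()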