Spaces:

tachiwin
/

classifier

Running

App Files Community

Luis J Camargo committed 4 days ago

Commit

90f1441

1 Parent(s): 4e7deef

refactor: Remove unused imports and commented-out code, add detailed logging for audio processing, and update Gradio launch parameters.

Browse files

Files changed (1) hide show

app.py +17 -12

app.py CHANGED Viewed

@@ -1,13 +1,9 @@
-# app.py
-import os
 import gradio as gr
 import torch
 import numpy as np
 from transformers import WhisperProcessor, AutoConfig, AutoModel, WhisperConfig, WhisperPreTrainedModel
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
 import torch.nn as nn
-from safetensors.torch import load_file
-from huggingface_hub import hf_hub_download
 # === CUSTOM MODEL CLASSES ===
 class WhisperEncoderOnlyConfig(WhisperConfig):
@@ -79,13 +75,7 @@ MODEL_REPO = "tachiwin/language_classification_enconly_model_2"
 print("Loading model on CPU...")
 processor = WhisperProcessor.from_pretrained(MODEL_REPO)
-#config = WhisperEncoderOnlyConfig.from_pretrained(MODEL_REPO)
 model = WhisperEncoderOnlyForClassification.from_pretrained(MODEL_REPO)
-# Load weights from safetensors
-#weights_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors")
-#state_dict = load_file(weights_path)
-#model.load_state_dict(state_dict)
 model.eval()
 print("Model loaded successfully!")
@@ -96,8 +86,12 @@ def predict_language(audio):
         return "⚠️ No audio provided", "", ""
     sample_rate, audio_array = audio
     # Normalization
     if audio_array.dtype == np.int16:
         audio_array = audio_array.astype(np.float32) / 32768.0
     elif audio_array.dtype == np.int32:
@@ -105,10 +99,12 @@ def predict_language(audio):
     # Resampling
     if sample_rate != 16000:
         import librosa
         audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
     # Preprocessing
     inputs = processor(
         audio_array,
         sampling_rate=16000,
@@ -116,10 +112,12 @@ def predict_language(audio):
     )
     # Inference
     with torch.no_grad():
         outputs = model(input_features=inputs.input_features)
     # Post-processing
     fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
     super_probs = torch.softmax(outputs["super_logits"], dim=-1)
     code_probs = torch.softmax(outputs["code_logits"], dim=-1)
@@ -132,6 +130,9 @@ def predict_language(audio):
     super_conf = super_probs[0, super_idx].item()
     code_conf = code_probs[0, code_idx].item()
     # Formatting results
     return (
         {f"{fam_idx}": fam_conf},
@@ -196,5 +197,9 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"))

 import gradio as gr
 import torch
 import numpy as np
 from transformers import WhisperProcessor, AutoConfig, AutoModel, WhisperConfig, WhisperPreTrainedModel
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
 import torch.nn as nn
 # === CUSTOM MODEL CLASSES ===
 class WhisperEncoderOnlyConfig(WhisperConfig):
 print("Loading model on CPU...")
 processor = WhisperProcessor.from_pretrained(MODEL_REPO)
 model = WhisperEncoderOnlyForClassification.from_pretrained(MODEL_REPO)
 model.eval()
 print("Model loaded successfully!")
         return "⚠️ No audio provided", "", ""
     sample_rate, audio_array = audio
+    audio_len_sec = len(audio_array) / sample_rate
+    print(f"\n--- [LOG] New Request ---")
+    print(f"[LOG] Audio length: {audio_len_sec:.2f}s, SR: {sample_rate}")
     # Normalization
+    print("[LOG] Step 1: Normalizing audio...")
     if audio_array.dtype == np.int16:
         audio_array = audio_array.astype(np.float32) / 32768.0
     elif audio_array.dtype == np.int32:
     # Resampling
     if sample_rate != 16000:
+        print(f"[LOG] Step 2: Resampling {sample_rate}Hz -> 16000Hz...")
         import librosa
         audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
     # Preprocessing
+    print("[LOG] Step 3: Extracting features...")
     inputs = processor(
         audio_array,
         sampling_rate=16000,
     )
     # Inference
+    print("[LOG] Step 4: Running model inference (CPU intensive)...")
     with torch.no_grad():
         outputs = model(input_features=inputs.input_features)
     # Post-processing
+    print("[LOG] Step 5: Post-processing results...")
     fam_probs = torch.softmax(outputs["fam_logits"], dim=-1)
     super_probs = torch.softmax(outputs["super_logits"], dim=-1)
     code_probs = torch.softmax(outputs["code_logits"], dim=-1)
     super_conf = super_probs[0, super_idx].item()
     code_conf = code_probs[0, code_idx].item()
+    print(f"[LOG] Prediction successful: Family {fam_idx}")
+    print(f"--- [LOG] Request Finished ---\n")
     # Formatting results
     return (
         {f"{fam_idx}": fam_conf},
     )
 if __name__ == "__main__":
+    # Increased concurrency for CPU stability
+    demo.launch(
+        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"),
+        ssr_mode=False,
+        show_error=True
+    )