Spaces:

notuser77
/

ravdess

Sleeping

App Files Files Community

notuser77 committed on Dec 20, 2025

Commit

a2fc1ad

verified ·

1 Parent(s): 97ba441

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -32

app.py CHANGED Viewed

@@ -1,77 +1,83 @@
-import gradio as gr
-import torch
-import torchaudio
 import joblib
 import pandas as pd
 import numpy as np
-import os
 import warnings
 from speechbrain.inference.speaker import EncoderClassifier
-# Ignore the scikit-learn version warning (1.5.2 vs 1.7.x)
-warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
-# 1. Load your SVM model
-# We try both names you provided to be safe
 MODEL_PATH = 'svm_model.joblib'
 if not os.path.exists(MODEL_PATH):
     MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
 # NOTE(review): the fallback name is not checked for existence, so a missing
 # file only surfaces when joblib.load() below tries to open it.
-print(f"Loading model from: {MODEL_PATH}")
 # NOTE(review): joblib.load() unpickles the file; only point it at trusted
 # model artifacts.
 model = joblib.load(MODEL_PATH)
-# 2. Load the SpeechBrain ECAPA-TDNN feature extractor
-# NOTE: The pinned huggingface-hub==0.24.0 in requirements.txt fixes the TypeError
 # Executed at module import, before the UI is built. Presumably downloads the
 # pretrained encoder from the Hugging Face Hub on first run -- TODO confirm.
 feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
 def predict_emotion(audio_path):
     # Classify the audio file at `audio_path` with the module-level
     # `feature_extractor` and `model`.
     #
     # Returns one of:
     #   * the prompt string below when `audio_path` is None,
     #   * a {class label: probability} dict when model.predict_proba works,
     #   * the predicted label as a string when predict_proba raises
     #     AttributeError.
     if audio_path is None:
         return "Please upload an audio file."
-    # 3. Load and Preprocess Audio
     signal, fs = torchaudio.load(audio_path)
-    # Resample to 16kHz (ECAPA requirement)
     if fs != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
         signal = resampler(signal)
-    # Convert to mono
     # Assumes dim 0 of `signal` is the channel axis -- TODO confirm against
     # the torchaudio.load() return layout.
     if signal.shape[0] > 1:
         signal = torch.mean(signal, dim=0, keepdim=True)
-    # 4. Feature Extraction (192-D Embeddings)
     with torch.no_grad():
         embeddings = feature_extractor.encode_batch(signal)
         # squeeze() drops the singleton dims; reshape(1, -1) then yields a
         # single-row 2-D array suitable for a one-sample DataFrame.
         embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
-    # 5. Prediction
-    # Create DataFrame with exact feature names the SVM expects
     # NOTE(review): the column count is hard-coded to 192; the DataFrame
     # constructor will reject an embedding row of any other width.
     feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
     X = pd.DataFrame(embeddings, columns=feature_names)
     try:
-        # Get probability scores for each class
         probs = model.predict_proba(X)[0]
-        # model.classes_ contains the emotion names
         return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
-    except AttributeError:
-        # Fallback if probability=False was used during training
         prediction = model.predict(X)[0]
         return str(prediction)
-# 6. Gradio Interface
-description = (
-    "Extracts ECAPA-TDNN embeddings via SpeechBrain and classifies them using an SVM. "
-    "Best results with 3-5 second speech clips."
-)
 demo = gr.Interface(
     fn=predict_emotion,
     # type="filepath" matches predict_emotion, which hands its argument
     # straight to torchaudio.load(). The Label output receives either the
     # {label: probability} dict or the plain string that predict_emotion returns.
-    inputs=gr.Audio(type="filepath", label="Input Audio"),
-    outputs=gr.Label(label="Emotion Confidence"),
-    title="Speech Emotion Recognition",
-    description=description
 )
 if __name__ == "__main__":

+import os
 import joblib
 import pandas as pd
 import numpy as np
+import torch
+import torchaudio
 import warnings
+import gradio as gr
+# --- STEP 1: THE MONKEY PATCH (Fixes the TypeError) ---
+import huggingface_hub
+orig_download = huggingface_hub.hf_hub_download
+def patched_download(*args, **kwargs):
     # Wrapper around the original hf_hub_download: a `use_auth_token`
     # keyword is renamed to `token`; everything else is forwarded unchanged.
     # NOTE(review): if a caller supplies both keywords, the `use_auth_token`
     # value overwrites the explicit `token`.
+    if 'use_auth_token' in kwargs:
+        kwargs['token'] = kwargs.pop('use_auth_token')
+    return orig_download(*args, **kwargs)
 # Rebinding the module attribute only affects lookups made after this line;
 # a module that already did `from huggingface_hub import hf_hub_download`
 # keeps the unpatched function, so this must run before such imports.
+huggingface_hub.hf_hub_download = patched_download
+# Import SpeechBrain after the patch
 from speechbrain.inference.speaker import EncoderClassifier
+# Suppress scikit-learn version warnings
 # NOTE(review): with no `module=` argument this filter hides every
 # UserWarning in the process, not only those raised by scikit-learn.
+warnings.filterwarnings("ignore", category=UserWarning)
+# --- STEP 2: LOAD MODELS ---
+# We check both filenames you uploaded
 MODEL_PATH = 'svm_model.joblib'
 if not os.path.exists(MODEL_PATH):
     MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
 # NOTE(review): the fallback name is not checked for existence, so a missing
 # file only surfaces when joblib.load() below tries to open it.
+print(f"Loading classifier from: {MODEL_PATH}")
 # NOTE(review): joblib.load() unpickles the file; only point it at trusted
 # model artifacts.
 model = joblib.load(MODEL_PATH)
+# Load the ECAPA-TDNN feature extractor
+print("Loading SpeechBrain feature extractor...")
 feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
+# --- STEP 3: PREDICTION LOGIC ---
 def predict_emotion(audio_path):
     # Classify the audio file at `audio_path` with the module-level
     # `feature_extractor` and `model`.
     #
     # Returns one of:
     #   * the prompt string below when `audio_path` is None,
     #   * a {class label: probability} dict when model.predict_proba succeeds,
     #   * the predicted label as a string when predict_proba raises.
     if audio_path is None:
         return "Please upload an audio file."
+    # Load audio
     signal, fs = torchaudio.load(audio_path)
+    # Preprocess (16kHz mono)
     if fs != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
         signal = resampler(signal)
     # Assumes dim 0 of `signal` is the channel axis -- TODO confirm against
     # the torchaudio.load() return layout.
     if signal.shape[0] > 1:
         signal = torch.mean(signal, dim=0, keepdim=True)
+    # Extract 192-D Embeddings
     with torch.no_grad():
         embeddings = feature_extractor.encode_batch(signal)
         # squeeze() drops the singleton dims; reshape(1, -1) then yields a
         # single-row 2-D array suitable for a one-sample DataFrame.
         embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
+    # Create DataFrame with exact column names the SVM expects
+    # (0_speechbrain_embedding, 1_speechbrain_embedding, etc.)
     # NOTE(review): the column count is hard-coded to 192; the DataFrame
     # constructor will reject an embedding row of any other width.
     feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
     X = pd.DataFrame(embeddings, columns=feature_names)
+    # Predict
     try:
+        # Get probabilities for the Label output
         probs = model.predict_proba(X)[0]
         return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
     # NOTE(review): `except Exception` catches every failure of the lines
     # above, not only a model without probability support; unrelated errors
     # are silently retried through model.predict() instead of being reported.
+    except Exception:
+        # Fallback to direct prediction
         prediction = model.predict(X)[0]
         return str(prediction)
+# --- STEP 4: GRADIO UI ---
 demo = gr.Interface(
     fn=predict_emotion,
     # type="filepath" matches predict_emotion, which hands its argument
     # straight to torchaudio.load(). The Label output receives either the
     # {label: probability} dict or the plain string that predict_emotion returns.
+    inputs=gr.Audio(type="filepath", label="Upload Voice Clip"),
+    outputs=gr.Label(label="Detected Emotion"),
+    title="RAVDESS Emotion Classifier",
+    description="This app uses ECAPA-TDNN embeddings from SpeechBrain and an SVM classifier to detect emotions in speech.",
     # NOTE(review): newer Gradio releases appear to rename `allow_flagging`
     # to `flagging_mode` -- confirm against the Gradio version this Space pins.
+    allow_flagging="never"
 )
 if __name__ == "__main__":