Spaces:

notuser77
/

ravdess

Sleeping

App Files Files Community

notuser77 committed on Dec 20, 2025

Commit

5d55225

verified ·

1 Parent(s): 3c7ab19

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -46

app.py CHANGED Viewed

@@ -3,85 +3,78 @@ import joblib
 import pandas as pd
 import numpy as np
 import torch
-import torchaudio
-import warnings  # <--- This fixes the NameError
 import gradio as gr
 import huggingface_hub
 from speechbrain.inference.classifiers import EncoderClassifier
-# 1. ROBUST MONKEY PATCH
-# This fixes the 'use_auth_token' vs 'token' error and the 'NoneType' crash
 orig_download = huggingface_hub.hf_hub_download
 def patched_download(*args, **kwargs):
-    if 'use_auth_token' in kwargs:
-        kwargs['token'] = kwargs.pop('use_auth_token')
     fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
-    try:
-        return orig_download(*args, **kwargs)
     except Exception as e:
-        # If SpeechBrain looks for 'custom.py' and it's missing (404),
-        # return a dummy file path instead of None to prevent a crash.
-        if fname == "custom.py" and ("404" in str(e) or "Not Found" in str(e)):
             dummy_path = os.path.abspath("dummy_custom.py")
             if not os.path.exists(dummy_path):
-                with open(dummy_path, "w") as f:
-                    f.write("# Dummy file for compatibility\n")
             return dummy_path
         raise e
 huggingface_hub.hf_hub_download = patched_download
 warnings.filterwarnings("ignore")
-# 2. LOAD MODELS
-# Load your SVM Classifier (trying both possible filenames)
-MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
-if not os.path.exists(MODEL_PATH):
-    MODEL_PATH = 'svm_model.joblib'
-print(f"Loading SVM classifier: {MODEL_PATH}")
-svm_model = joblib.load(MODEL_PATH)
-# Load SpeechBrain Feature Extractor
-print("Loading SpeechBrain ECAPA feature extractor...")
 feature_extractor = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-ecapa-voxceleb",
     savedir="pretrained_models/spkrec-ecapa-voxceleb"
 )
-# 3. DEFINE INFERENCE
 EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
 def predict_emotion(audio_path):
-    if audio_path is None:
-        return "Please upload an audio file."
-    # Load and Preprocess Audio
-    signal, fs = torchaudio.load(audio_path)
-    # Extract ECAPA-TDNN Embeddings
     with torch.no_grad():
-        embeddings = feature_extractor.encode_batch(signal)
-        # Convert to numpy and flatten (SVM expects 1D array of 192 features)
         embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
-    # Predict with SVM
-    prediction = svm_model.predict(embeddings)[0]
-    # Return mapped label if numeric, otherwise return string
-    if isinstance(prediction, (int, np.integer)):
-        return EMOTIONS[prediction]
-    return prediction
-# 4. GRADIO INTERFACE
 demo = gr.Interface(
     fn=predict_emotion,
-    inputs=gr.Audio(type="filepath", label="Upload Speech (WAV)"),
-    outputs=gr.Label(label="Detected Emotion"),
-    title="Speech Emotion Recognition (RAVDESS)",
-    description="This app uses SpeechBrain ECAPA-TDNN embeddings and a pre-trained SVM to classify emotions."
 )
 if __name__ == "__main__":

 import pandas as pd
 import numpy as np
 import torch
+import warnings
 import gradio as gr
 import huggingface_hub
 from speechbrain.inference.classifiers import EncoderClassifier
+# --- 1. PRE-LOAD SETUP (Monkey Patch as before) ---
 orig_download = huggingface_hub.hf_hub_download
 def patched_download(*args, **kwargs):
+    if 'use_auth_token' in kwargs: kwargs['token'] = kwargs.pop('use_auth_token')
     fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
+    try: return orig_download(*args, **kwargs)
     except Exception as e:
+        if fname == "custom.py":
             dummy_path = os.path.abspath("dummy_custom.py")
             if not os.path.exists(dummy_path):
+                with open(dummy_path, "w") as f: f.write("# Dummy\n")
             return dummy_path
         raise e
 huggingface_hub.hf_hub_download = patched_download
 warnings.filterwarnings("ignore")
+# --- 2. LOAD MODELS ---
+# Using your specific SVM file
+SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
+print(f"Loading SVM: {SVM_PATH}")
+svm_model = joblib.load(SVM_PATH)
+print("Loading SpeechBrain Feature Extractor...")
 feature_extractor = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-ecapa-voxceleb",
     savedir="pretrained_models/spkrec-ecapa-voxceleb"
 )
+# --- 3. DEFINE INFERENCE ---
+# RAVDESS standard emotion order (the dataset's emotion codes are 1-based, 01-08; this list is 0-based)
 EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
 def predict_emotion(audio_path):
+    if audio_path is None: return "Please upload audio."
+    # CRITICAL: Use SpeechBrain's loader.
+    # It automatically handles resampling to 16kHz and mono conversion.
+    signal = feature_extractor.load_audio(audio_path)
+    # Extract Embeddings
     with torch.no_grad():
+        # unsqueeze(0) adds the batch dimension [1, time]
+        embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
         embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
+    # MATCH FEATURE NAMES: Your SVM was trained with named features
+    # '0_speechbrain_embedding' through '191_speechbrain_embedding'
+    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
+    df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
+    # Predict
+    if hasattr(svm_model, "predict_proba"):
+        probas = svm_model.predict_proba(df_embeddings)[0]
+        # Map probabilities to emotion names for Gradio Label
+        return {EMOTIONS[i]: float(probas[i]) for i in range(len(EMOTIONS))}
+    else:
+        pred_idx = int(svm_model.predict(df_embeddings)[0])
+        # If your SVM uses 1-8 labels, subtract 1; if 0-7, keep as is.
+        # Most RAVDESS SVMs use 0-7 for programming ease.
+        return EMOTIONS[pred_idx]
+# --- 4. INTERFACE ---
 demo = gr.Interface(
     fn=predict_emotion,
+    inputs=gr.Audio(type="filepath", label="Speech Input"),
+    outputs=gr.Label(num_top_classes=3),
+    title="Speech Emotion Classifier (Fixed Resampling)"
 )
 if __name__ == "__main__":