Spaces:

notuser77
/

ravdess

Sleeping

App Files Files Community

notuser77 committed on Dec 20, 2025

Commit

f32b656

verified ·

1 Parent(s): 5d55225

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -25

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import gradio as gr
 import huggingface_hub
 from speechbrain.inference.classifiers import EncoderClassifier
-# --- 1. PRE-LOAD SETUP (Monkey Patch as before) ---
 orig_download = huggingface_hub.hf_hub_download
 def patched_download(*args, **kwargs):
     if 'use_auth_token' in kwargs: kwargs['token'] = kwargs.pop('use_auth_token')
@@ -25,7 +25,6 @@ huggingface_hub.hf_hub_download = patched_download
 warnings.filterwarnings("ignore")
 # --- 2. LOAD MODELS ---
-# Using your specific SVM file
 SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
 print(f"Loading SVM: {SVM_PATH}")
 svm_model = joblib.load(SVM_PATH)
@@ -36,45 +35,54 @@ feature_extractor = EncoderClassifier.from_hparams(
     savedir="pretrained_models/spkrec-ecapa-voxceleb"
 )
-# --- 3. DEFINE INFERENCE ---
-# RAVDESS Standard Mapping (1-indexed in many datasets)
 EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
 def predict_emotion(audio_path):
-    if audio_path is None: return "Please upload audio."
-    # CRITICAL: Use SpeechBrain's loader.
-    # It automatically handles resampling to 16kHz and mono conversion.
     signal = feature_extractor.load_audio(audio_path)
-    # Extract Embeddings
     with torch.no_grad():
-        # unsqueeze(0) adds the batch dimension [1, time]
         embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
         embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
-    # MATCH FEATURE NAMES: Your SVM was trained with named features
-    # '0_speechbrain_embedding' through '191_speechbrain_embedding'
     feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
     df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
-    # Predict
-    if hasattr(svm_model, "predict_proba"):
-        probas = svm_model.predict_proba(df_embeddings)[0]
-        # Map probabilities to emotion names for Gradio Label
-        return {EMOTIONS[i]: float(probas[i]) for i in range(len(EMOTIONS))}
-    else:
-        pred_idx = int(svm_model.predict(df_embeddings)[0])
-        # If your SVM uses 1-8 labels, subtract 1; if 0-7, keep as is.
-        # Most RAVDESS SVMs use 0-7 for programming ease.
-        return EMOTIONS[pred_idx]
-# --- 4. INTERFACE ---
 demo = gr.Interface(
     fn=predict_emotion,
-    inputs=gr.Audio(type="filepath", label="Speech Input"),
-    outputs=gr.Label(num_top_classes=3),
-    title="Speech Emotion Classifier (Fixed Resampling)"
 )
 if __name__ == "__main__":

 import huggingface_hub
 from speechbrain.inference.classifiers import EncoderClassifier
+# --- 1. BOOTSTRAP (Monkey Patch for SpeechBrain 1.0.0 compatibility) ---
 orig_download = huggingface_hub.hf_hub_download
 def patched_download(*args, **kwargs):
     if 'use_auth_token' in kwargs: kwargs['token'] = kwargs.pop('use_auth_token')
 # Suppress every warning category for the remainder of the process.
 warnings.filterwarnings("ignore")
 # --- 2. LOAD MODELS ---
 # File name of the serialized classifier; a relative path, so it is resolved
 # against the current working directory.
 SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
 print(f"Loading SVM: {SVM_PATH}")
 # NOTE(review): joblib.load unpickles the file and can therefore execute
 # arbitrary code -- only ship a .pkl that comes from a trusted source.
 # The model is loaded once, at import time, and shared by every request.
 svm_model = joblib.load(SVM_PATH)
     savedir="pretrained_models/spkrec-ecapa-voxceleb"
 )
+# RAVDESS emotion names in dataset code order (01 = neutral ... 08 = surprised)
 EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
+# --- 3. INFERENCE LOGIC ---
 def predict_emotion(audio_path):
     """Predict the emotion expressed in a speech recording.

     The clip is loaded through ``feature_extractor.load_audio``, scaled to
     unit peak amplitude, embedded with ``feature_extractor.encode_batch`` and
     classified by ``svm_model``.

     Args:
         audio_path: Path of the audio file to classify, or ``None`` when no
             audio was provided.

     Returns:
         str: The capitalised emotion name; a prompt message when
         ``audio_path`` is ``None``; or ``str(prediction)`` when the model's
         label cannot be mapped onto ``EMOTIONS``.
     """
     if audio_path is None:
         return "Please upload an audio file."

     signal = feature_extractor.load_audio(audio_path)

     # Peak-normalise so recording loudness does not change the input scale.
     # An all-zero (silent) clip is left untouched to avoid dividing by zero.
     peak = signal.abs().max()
     if peak > 0:
         signal = signal / peak

     with torch.no_grad():
         # unsqueeze(0) adds the batch dimension expected by encode_batch.
         embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
         embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)

     # The frame is built with the column names "<i>_speechbrain_embedding";
     # presumably these are the names the SVM was fitted with -- verify against
     # the training pipeline.
     feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
     df_embeddings = pd.DataFrame(embeddings, columns=feature_names)

     prediction = svm_model.predict(df_embeddings)[0]

     # String labels (e.g. 'calm') need no index mapping.
     if isinstance(prediction, str):
         return prediction.capitalize()

     try:
         idx = int(prediction)
     except (TypeError, ValueError):
         return str(prediction)

     # Decide whether numeric labels are 0-based (0-7) or 1-based (1-8).
     # Guessing from the predicted value alone is ambiguous: 1..7 are valid in
     # both schemes, so a 0-based "1" (calm) would be reported as neutral.
     # The label set the model was fitted on removes the ambiguity.
     classes = getattr(svm_model, "classes_", None)
     if classes is not None:
         # NOTE(review): assumes a 0-based model saw label 0 during training.
         offset = 0 if 0 in classes else 1
     else:
         # No label set available: keep the previous value-based heuristic.
         offset = 1 if 1 <= idx <= 8 else 0

     position = idx - offset
     # Explicit bounds check so a negative index cannot wrap around the list.
     if 0 <= position < len(EMOTIONS):
         return EMOTIONS[position].capitalize()
     return str(prediction)
+# --- 4. GRADIO INTERFACE ---
 # Gradio UI definition: a single audio input that is handed to
 # predict_emotion as a file path (type="filepath"), and a text box that
 # displays the string predict_emotion returns.
 demo = gr.Interface(
     fn=predict_emotion,
+    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
+    outputs=gr.Textbox(label="Predicted Emotion"),
+    title="Speech Emotion Recognition",
+    description="Optimized for RAVDESS SVM. If accuracy is low, try to speak closer to the mic and minimize background noise."
 )
 if __name__ == "__main__":