Spaces:

notuser77
/

ravdess

Sleeping

App Files Files Community

notuser77 committed on Dec 20, 2025

Commit

8944133

verified ·

1 Parent(s): 4a71fb9

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -16

app.py CHANGED Viewed

@@ -5,10 +5,14 @@ import joblib
 import pandas as pd
 import numpy as np
 import os
 from speechbrain.inference.speaker import EncoderClassifier
-# 1. Load the SVM model
-# We check for both possible filenames you uploaded
 MODEL_PATH = 'svm_model.joblib'
 if not os.path.exists(MODEL_PATH):
     MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
@@ -17,53 +21,57 @@ print(f"Loading model from: {MODEL_PATH}")
 model = joblib.load(MODEL_PATH)
 # 2. Load the SpeechBrain ECAPA-TDNN feature extractor
-# This downloads the pre-trained weights from Hugging Face
 feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
 def predict_emotion(audio_path):
     if audio_path is None:
         return "Please upload an audio file."
-    # 3. Load Audio
     signal, fs = torchaudio.load(audio_path)
-    # 4. Preprocess: Match the 16kHz requirement of ECAPA-TDNN
     if fs != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
         signal = resampler(signal)
-    # Convert stereo to mono
     if signal.shape[0] > 1:
         signal = torch.mean(signal, dim=0, keepdim=True)
-    # 5. Extract 192-D Embeddings
     with torch.no_grad():
         embeddings = feature_extractor.encode_batch(signal)
-        # Squeeze and convert to numpy
         embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
-    # 6. Create Dataframe with exact feature names model expects
     feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
     X = pd.DataFrame(embeddings, columns=feature_names)
-    # 7. Predict
     try:
         # Get probability scores for each class
         probs = model.predict_proba(X)[0]
-        # Return a dictionary of {Emotion: Probability} for Gradio's Label component
         return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
     except AttributeError:
-        # If the model wasn't trained with probability=True, just return the top label
         prediction = model.predict(X)[0]
         return str(prediction)
-# 8. Build Interface
 demo = gr.Interface(
     fn=predict_emotion,
-    inputs=gr.Audio(type="filepath", label="Upload or Record Speech"),
     outputs=gr.Label(label="Emotion Confidence"),
-    title="RAVDESS Speech Emotion Classifier",
-    description="This app uses ECAPA-TDNN embeddings and a Support Vector Machine to classify emotions in speech."
 )
 if __name__ == "__main__":

 import pandas as pd
 import numpy as np
 import os
+import warnings
 from speechbrain.inference.speaker import EncoderClassifier
+# Ignore the scikit-learn version warning (1.5.2 vs 1.7.x)
+warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
+# 1. Load your SVM model
+# We try both names you provided to be safe
 MODEL_PATH = 'svm_model.joblib'
 if not os.path.exists(MODEL_PATH):
     MODEL_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
 model = joblib.load(MODEL_PATH)
 # 2. Load the SpeechBrain ECAPA-TDNN feature extractor
+# NOTE: The pinned huggingface-hub==0.24.0 in requirements.txt fixes the TypeError
 feature_extractor = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
 def predict_emotion(audio_path):
     if audio_path is None:
         return "Please upload an audio file."
+    # 3. Load and Preprocess Audio
     signal, fs = torchaudio.load(audio_path)
+    # Resample to 16kHz (ECAPA requirement)
     if fs != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
         signal = resampler(signal)
+    # Convert to mono
     if signal.shape[0] > 1:
         signal = torch.mean(signal, dim=0, keepdim=True)
+    # 4. Feature Extraction (192-D Embeddings)
     with torch.no_grad():
         embeddings = feature_extractor.encode_batch(signal)
         embeddings = embeddings.squeeze().cpu().numpy().reshape(1, -1)
+    # 5. Prediction
+    # Create DataFrame with exact feature names the SVM expects
     feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
     X = pd.DataFrame(embeddings, columns=feature_names)
     try:
         # Get probability scores for each class
         probs = model.predict_proba(X)[0]
+        # model.classes_ contains the emotion names
         return {model.classes_[i]: float(probs[i]) for i in range(len(model.classes_))}
     except AttributeError:
+        # Fallback if probability=False was used during training
         prediction = model.predict(X)[0]
         return str(prediction)
+# 6. Gradio Interface
+description = (
+    "Extracts ECAPA-TDNN embeddings via SpeechBrain and classifies them using an SVM. "
+    "Best results with 3-5 second speech clips."
+)
 demo = gr.Interface(
     fn=predict_emotion,
+    inputs=gr.Audio(type="filepath", label="Input Audio"),
     outputs=gr.Label(label="Emotion Confidence"),
+    title="Speech Emotion Recognition",
+    description=description
 )
 if __name__ == "__main__":