# app.py — RAVDESS speech-emotion-recognition Space (author: notuser77, commit 512e413)
import os
import warnings

import gradio as gr
import huggingface_hub
import joblib
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier
# --- 1. BOOTSTRAP (Monkey Patch for SpeechBrain 1.0.0 compatibility) ---
orig_download = huggingface_hub.hf_hub_download

def patched_download(*args, **kwargs):
    """Compatibility wrapper around ``hf_hub_download``.

    Renames the deprecated ``use_auth_token`` kwarg to ``token`` and, when
    the Hub has no ``custom.py`` for a model (SpeechBrain always requests
    one), supplies a local dummy file instead of failing.
    """
    if 'use_auth_token' in kwargs:
        kwargs['token'] = kwargs.pop('use_auth_token')
    # Filename may arrive positionally (2nd arg) or as a keyword.
    fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
    try:
        return orig_download(*args, **kwargs)
    except Exception:
        if fname == "custom.py":
            dummy_path = os.path.abspath("dummy_custom.py")
            if not os.path.exists(dummy_path):
                with open(dummy_path, "w") as f:
                    f.write("# Dummy\n")
            return dummy_path
        # Bare `raise` preserves the original traceback ("raise e" would
        # restart the chain from here).
        raise

huggingface_hub.hf_hub_download = patched_download
warnings.filterwarnings("ignore")
# --- 2. LOAD MODELS ---
# Pickled scikit-learn SVM trained on 192-dim ECAPA embeddings
# (8 RAVDESS classes — see EMOTIONS below). Must ship alongside this file.
SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
print(f"Loading SVM: {SVM_PATH}")
svm_model = joblib.load(SVM_PATH)
print("Loading SpeechBrain Feature Extractor...")
# ECAPA-TDNN speaker-embedding extractor; weights are downloaded from the
# Hub on first run and cached under pretrained_models/.
feature_extractor = EncoderClassifier.from_hparams(
source="speechbrain/spkrec-ecapa-voxceleb",
savedir="pretrained_models/spkrec-ecapa-voxceleb"
)
# Standard RAVDESS mapping
EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
# --- 3. INFERENCE LOGIC ---
def predict_emotion(audio_path):
    """Predict one of the 8 RAVDESS emotions from an audio file.

    Args:
        audio_path: Path to an audio file (any sample rate/channels), or
            None when Gradio submits an empty input.

    Returns:
        The predicted emotion label (str), or a short error message.
    """
    if audio_path is None:
        return "No audio provided"
    # 1. Load and resample to 16 kHz (the rate the ECAPA model expects).
    signal, fs = torchaudio.load(audio_path)
    # Downmix stereo/multi-channel to mono; a multi-channel tensor would
    # otherwise reach encode_batch with an extra dimension.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)
    # 2. Trim silence (VAD): dead air biases the model toward 'calm'.
    signal_np = signal.squeeze(0).numpy()
    trimmed_signal, _ = librosa.effects.trim(signal_np, top_db=20)
    if trimmed_signal.size == 0:
        # Entirely-silent clip: nothing left to classify.
        return "No speech detected in audio"
    signal = torch.from_numpy(trimmed_signal)
    # 3. Extract the 192-dim speaker embedding; input shaped [batch, time].
    with torch.no_grad():
        embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
    # 4. L2 normalization (ECAPA embeddings are cosine-based), making the
    # vector volume-invariant.
    embeddings = F.normalize(embeddings, p=2, dim=2)
    # Reshape for the SVM: (1, 192).
    embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
    # 5. Predict — column names must match those used at SVM training time.
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
    prediction = svm_model.predict(df_embeddings)[0]
    return prediction
# --- 4. GRADIO INTERFACE ---
demo = gr.Interface(
fn=predict_emotion,
# type="filepath" means predict_emotion receives a path string, not raw audio.
inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
outputs=gr.Textbox(label="Predicted Emotion"),
title="Speech Emotion Recognition",
description="Optimized for RAVDESS SVM. If accuracy is low, try to speak closer to the mic and minimize background noise."
)
# Launch only when run directly (HF Spaces executes this module as __main__).
if __name__ == "__main__":
demo.launch()