"""Speech Emotion Recognition demo: SpeechBrain ECAPA embeddings -> RAVDESS SVM -> Gradio UI."""
import os
import warnings

import gradio as gr
import huggingface_hub
import joblib
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier

# --- 1. BOOTSTRAP (Monkey Patch for SpeechBrain 1.0.0 compatibility) ---
# Newer huggingface_hub renamed `use_auth_token` -> `token`, and SpeechBrain may
# request a `custom.py` that does not exist on the model repo; serve a local
# stub in that case so loading can proceed.
orig_download = huggingface_hub.hf_hub_download


def patched_download(*args, **kwargs):
    """Wrap hf_hub_download: rename deprecated kwarg, stub a missing custom.py.

    Any download failure for a file other than ``custom.py`` is re-raised
    unchanged so real errors are not swallowed.
    """
    if 'use_auth_token' in kwargs:
        kwargs['token'] = kwargs.pop('use_auth_token')
    # Filename may arrive positionally (second arg) or as a keyword.
    fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
    try:
        return orig_download(*args, **kwargs)
    except Exception as e:
        if fname == "custom.py":
            dummy_path = os.path.abspath("dummy_custom.py")
            if not os.path.exists(dummy_path):
                with open(dummy_path, "w") as f:
                    f.write("# Dummy\n")
            return dummy_path
        raise e


huggingface_hub.hf_hub_download = patched_download
warnings.filterwarnings("ignore")

# --- 2. LOAD MODELS ---
SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
print(f"Loading SVM: {SVM_PATH}")
svm_model = joblib.load(SVM_PATH)

print("Loading SpeechBrain Feature Extractor...")
feature_extractor = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb"
)

# Standard RAVDESS mapping (order matters if the SVM emits integer labels).
EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']


# --- 3. INFERENCE LOGIC ---
def predict_emotion(audio_path):
    """Predict the emotion label for a recorded/uploaded audio file.

    Parameters
    ----------
    audio_path : str or None
        Filesystem path handed over by the Gradio ``Audio`` component.

    Returns
    -------
    The SVM's predicted label, or an error string when no audio was given.
    """
    if audio_path is None:
        return "No audio provided"

    # 1. Load and resample to 16 kHz (critical for SpeechBrain).
    signal, fs = torchaudio.load(audio_path)
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)

    # Downmix to mono: encode_batch expects [batch, time]; a stereo upload
    # would otherwise become a [1, 2, time] tensor after unsqueeze below.
    if signal.dim() > 1 and signal.size(0) > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # 2. Trim silence (VAD): dead air biases the model toward 'Calm'.
    signal_np = signal.squeeze().numpy()
    trimmed_signal, _ = librosa.effects.trim(signal_np, top_db=20)
    signal = torch.from_numpy(trimmed_signal)

    # 3. Extract embeddings (no gradients needed at inference time).
    with torch.no_grad():
        # Ensure signal is [batch, time].
        embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))

    # 4. L2 normalization (important for cosine-based ECAPA models):
    # unit-magnitude vectors make the features volume-invariant.
    embeddings = F.normalize(embeddings, p=2, dim=2)

    # Reshape for the SVM: (1, 192).
    embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)

    # 5. Predict — column names must match those used at training time.
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
    prediction = svm_model.predict(df_embeddings)[0]
    return prediction


# --- 4. GRADIO INTERFACE ---
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
    outputs=gr.Textbox(label="Predicted Emotion"),
    title="Speech Emotion Recognition",
    description="Optimized for RAVDESS SVM. If accuracy is low, try to speak closer to the mic and minimize background noise."
)

if __name__ == "__main__":
    demo.launch()