File size: 3,212 Bytes
a2fc1ad 3c7ab19 5d55225 3c7ab19 a2fc1ad 76d8f21 f32b656 a2fc1ad 5d55225 76d8f21 5d55225 2b4f79e 5d55225 76d8f21 5d55225 76d8f21 2b4f79e a2fc1ad 2b4f79e 8944133 5d55225 0861246 5d55225 2b4f79e 3c7ab19 2b4f79e 0861246 f32b656 3c7ab19 f32b656 0861246 512e413 f32b656 512e413 0861246 512e413 5d55225 512e413 3c7ab19 512e413 5d55225 f32b656 512e413 f32b656 4a71fb9 0861246 f32b656 0861246 4a71fb9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import os
import warnings

import gradio as gr
import huggingface_hub
import joblib
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier
# --- 1. BOOTSTRAP (Monkey Patch for SpeechBrain 1.0.0 compatibility) ---
orig_download = huggingface_hub.hf_hub_download


def patched_download(*args, **kwargs):
    """Compatibility wrapper around ``hf_hub_download``.

    Renames the deprecated ``use_auth_token`` kwarg to ``token`` and, when a
    download of ``custom.py`` fails, falls back to a local dummy file so that
    SpeechBrain model loading does not abort.
    """
    if 'use_auth_token' in kwargs:
        kwargs['token'] = kwargs.pop('use_auth_token')
    # Filename may arrive as a keyword or as the second positional argument.
    requested = kwargs.get('filename')
    if not requested and len(args) > 1:
        requested = args[1]
    try:
        return orig_download(*args, **kwargs)
    except Exception as e:
        if requested != "custom.py":
            raise e
        placeholder = os.path.abspath("dummy_custom.py")
        if not os.path.exists(placeholder):
            with open(placeholder, "w") as f:
                f.write("# Dummy\n")
        return placeholder


huggingface_hub.hf_hub_download = patched_download
warnings.filterwarnings("ignore")
# --- 2. LOAD MODELS ---
# Pickled scikit-learn SVM trained on ECAPA embeddings of the RAVDESS
# 8-class emotion dataset; must sit next to this script.
SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
print(f"Loading SVM: {SVM_PATH}")
svm_model = joblib.load(SVM_PATH)
print("Loading SpeechBrain Feature Extractor...")
# ECAPA-TDNN speaker-verification model reused here as a generic audio
# embedding extractor (downloads/caches weights into savedir on first run).
feature_extractor = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb"
)
# Standard RAVDESS mapping
# NOTE(review): EMOTIONS is not referenced anywhere in this file —
# svm_model.predict() appears to return label strings directly; confirm.
EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
# --- 3. INFERENCE LOGIC ---
def predict_emotion(audio_path):
    """Predict the emotion expressed in an audio recording.

    Pipeline: load -> resample to 16 kHz -> downmix to mono -> trim
    silence -> ECAPA embedding -> L2 normalize -> SVM prediction.

    Args:
        audio_path: Filesystem path to an audio file (supplied by Gradio),
            or None when no audio was recorded/uploaded.

    Returns:
        The SVM's predicted emotion label, or an error string when no
        audio was provided.
    """
    if audio_path is None:
        return "No audio provided"
    # 1. Load and resample to 16 kHz (the rate the SpeechBrain model expects).
    signal, fs = torchaudio.load(audio_path)
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)
    # Downmix multi-channel audio (e.g. stereo uploads) to mono.
    # Without this, squeeze() below leaves a 2-D array and the trim/encode
    # steps operate on the wrong shape.
    if signal.dim() > 1 and signal.size(0) > 1:
        signal = signal.mean(dim=0, keepdim=True)
    # 2. Trim leading/trailing silence (VAD) — dead air biases the model
    # toward 'calm'.
    signal_np = signal.squeeze().numpy()
    trimmed_signal, _ = librosa.effects.trim(signal_np, top_db=20)
    signal = torch.from_numpy(trimmed_signal)
    # 3. Extract embeddings; no gradients needed at inference time.
    with torch.no_grad():
        # encode_batch expects [batch, time]
        embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
    # 4. L2 normalization (important for cosine-based ECAPA models):
    # unit magnitude makes the embedding volume-invariant.
    embeddings = F.normalize(embeddings, p=2, dim=2)
    # Reshape to the (1, 192) layout the SVM was trained on.
    embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)
    # 5. Predict, using the column names from training so sklearn's
    # feature-name check is satisfied.
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
    return svm_model.predict(df_embeddings)[0]
# --- 4. GRADIO INTERFACE ---
# Single input (recorded or uploaded audio, passed to predict_emotion as a
# file path) mapped to a single text output showing the predicted label.
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
    outputs=gr.Textbox(label="Predicted Emotion"),
    title="Speech Emotion Recognition",
    description="Optimized for RAVDESS SVM. If accuracy is low, try to speak closer to the mic and minimize background noise."
)
# Launch the web UI only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()