|
|
import os
import warnings

import gradio as gr
import huggingface_hub
import joblib
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier
|
|
|
|
|
|
|
|
# Keep a reference to the real downloader so the patch can delegate to it.
orig_download = huggingface_hub.hf_hub_download


def patched_download(*args, **kwargs):
    """Wrapper around ``hf_hub_download`` that papers over two issues.

    1. Newer ``huggingface_hub`` releases renamed ``use_auth_token`` to
       ``token``; translate the old keyword so older callers still work.
    2. SpeechBrain optionally fetches a ``custom.py`` from the model repo;
       when that download fails, return a local dummy file instead of
       aborting model loading.

    Any other download failure is re-raised unchanged.
    """
    # Translate the deprecated keyword argument.
    if 'use_auth_token' in kwargs:
        kwargs['token'] = kwargs.pop('use_auth_token')

    # The requested filename may arrive positionally (repo_id, filename, ...)
    # or as a keyword.
    fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)

    try:
        return orig_download(*args, **kwargs)
    except Exception:
        # Only the optional custom.py is safe to stub out; anything else is
        # a genuine failure.
        if fname == "custom.py":
            dummy_path = os.path.abspath("dummy_custom.py")
            if not os.path.exists(dummy_path):
                with open(dummy_path, "w") as f:
                    f.write("# Dummy\n")
            return dummy_path
        # Bare `raise` (not `raise e`) keeps the original traceback intact.
        raise


# Install the patched downloader globally before SpeechBrain uses it.
huggingface_hub.hf_hub_download = patched_download
|
|
# Suppress all Python warnings so the console output stays readable.
warnings.filterwarnings("ignore")
|
|
|
|
|
|
|
|
# Path to the pickled SVM emotion classifier. The filename records its
# provenance: trained on RAVDESS with SpeechBrain ECAPA-VoxCeleb
# embeddings, cross-validated, 8 classes.
SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'


print(f"Loading SVM: {SVM_PATH}")


# NOTE: joblib.load unpickles arbitrary objects — only load trusted files.
svm_model = joblib.load(SVM_PATH)


print("Loading SpeechBrain Feature Extractor...")


# Speaker-verification model used here purely as a feature extractor.
# Weights are fetched from the HF Hub (through the patched downloader
# defined earlier in this file) and cached under `savedir`.
feature_extractor = EncoderClassifier.from_hparams(

    source="speechbrain/spkrec-ecapa-voxceleb",

    savedir="pretrained_models/spkrec-ecapa-voxceleb"

)


# RAVDESS emotion label set, in dataset order.
# NOTE(review): EMOTIONS is not referenced anywhere in this file — the SVM
# appears to return label strings directly; confirm before removing.
EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
|
|
|
|
|
|
|
|
def predict_emotion(audio_path):
    """Predict the emotion expressed in an audio recording.

    Pipeline: load -> resample to 16 kHz -> mix down to mono -> trim
    leading/trailing silence -> ECAPA embedding -> L2-normalize -> SVM.

    Args:
        audio_path: Filesystem path to an audio file (Gradio supplies a
            filepath), or None when no audio was provided.

    Returns:
        The SVM's predicted emotion label, or the string
        "No audio provided" when `audio_path` is None.
    """
    if audio_path is None:
        return "No audio provided"

    # Load and resample to the 16 kHz rate the ECAPA model expects.
    signal, fs = torchaudio.load(audio_path)
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)

    # Mix multi-channel recordings down to mono. Without this, a stereo
    # (2, T) tensor survives the squeeze() below and unsqueeze(0) would
    # feed a malformed (1, 2, T) batch to encode_batch.
    if signal.dim() > 1 and signal.size(0) > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Trim leading/trailing silence (librosa works on numpy arrays).
    signal_np = signal.squeeze().numpy()
    trimmed_signal, _ = librosa.effects.trim(signal_np, top_db=20)
    signal = torch.from_numpy(trimmed_signal)

    with torch.no_grad():
        # encode_batch expects (batch, time); add the batch dimension.
        embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))
        # L2-normalize along the embedding dimension — presumably matching
        # the normalization used when the SVM was trained (TODO confirm).
        embeddings = F.normalize(embeddings, p=2, dim=2)

    # Flatten to a (1, 192) row vector for scikit-learn.
    embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)

    # The SVM was fit on a DataFrame with these exact column names;
    # rebuild them so sklearn's feature-name check passes.
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
    prediction = svm_model.predict(df_embeddings)[0]

    return prediction
|
|
|
|
|
# Gradio UI: one audio input (mic recording or file upload) mapped to a
# text box showing the predicted emotion label.
demo = gr.Interface(

    fn=predict_emotion,

    # type="filepath" makes Gradio pass predict_emotion a path on disk.
    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),

    outputs=gr.Textbox(label="Predicted Emotion"),

    title="Speech Emotion Recognition",

    description="Optimized for RAVDESS SVM. If accuracy is low, try to speak closer to the mic and minimize background noise."

)
|
|
|
|
|
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":

    demo.launch()