# (removed: non-code scrape artifacts — a file-size header, git commit hashes,
#  and a column of gutter line numbers copied alongside the source; they are
#  not Python and prevented the file from parsing)
# Standard library
import os
import warnings

# Third-party
import gradio as gr
import huggingface_hub
import joblib
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier

# --- 1. BOOTSTRAP (Monkey Patch for SpeechBrain 1.0.0 compatibility) ---
# SpeechBrain 1.0.0 still passes the deprecated `use_auth_token` kwarg and
# requests a `custom.py` from the model repo; newer huggingface_hub versions
# reject the former, and some repos don't ship the latter.
orig_download = huggingface_hub.hf_hub_download

def patched_download(*args, **kwargs):
    """Wrap hf_hub_download for SpeechBrain compatibility.

    - Renames the deprecated `use_auth_token` kwarg to `token`.
    - If downloading `custom.py` fails (the repo doesn't ship one), returns
      a local dummy file so EncoderClassifier.from_hparams can proceed.

    Any other download failure is re-raised unchanged.
    """
    if 'use_auth_token' in kwargs:
        kwargs['token'] = kwargs.pop('use_auth_token')
    # Positional signature is hf_hub_download(repo_id, filename, ...).
    fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
    try:
        return orig_download(*args, **kwargs)
    except Exception:
        if fname == "custom.py":
            dummy_path = os.path.abspath("dummy_custom.py")
            if not os.path.exists(dummy_path):
                with open(dummy_path, "w") as f:
                    f.write("# Dummy\n")
            return dummy_path
        raise  # bare raise preserves the original traceback (was `raise e`)

huggingface_hub.hf_hub_download = patched_download
warnings.filterwarnings("ignore")

# --- 2. LOAD MODELS ---
# Pickled sklearn SVM trained on L2-normalized 192-dim ECAPA embeddings of
# RAVDESS (8 emotion classes). Must be present in the working directory.
SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
print(f"Loading SVM: {SVM_PATH}")
svm_model = joblib.load(SVM_PATH)

# Downloads (or reuses a cached copy of) the pretrained ECAPA-TDNN speaker
# encoder from the Hugging Face Hub; used here purely as a feature extractor.
print("Loading SpeechBrain Feature Extractor...")
feature_extractor = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb"
)

# Standard RAVDESS mapping
# NOTE(review): the SVM appears to return label strings directly (see
# predict_emotion); this list is not indexed anywhere in the visible code.
EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# --- 3. INFERENCE LOGIC ---
def predict_emotion(audio_path):
    """Predict a RAVDESS emotion label for a recorded/uploaded audio file.

    Pipeline: load -> resample to 16 kHz -> downmix to mono -> trim silence
    -> ECAPA embedding -> L2 normalization -> SVM prediction.

    Args:
        audio_path: Filesystem path to the audio clip (Gradio passes None
            when no audio was provided).

    Returns:
        The predicted emotion label string, or a human-readable error message.
    """
    if audio_path is None:
        return "No audio provided"

    # 1. Load and resample to 16 kHz (the rate the SpeechBrain model expects).
    signal, fs = torchaudio.load(audio_path)
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)

    # Downmix multi-channel recordings to mono: a stereo upload would
    # otherwise survive the squeeze() below as a [2, time] array and produce
    # a malformed batch for encode_batch.
    if signal.dim() > 1 and signal.size(0) > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # 2. Trim leading/trailing silence (VAD-ish): dead air biases the model
    # toward 'calm'.
    signal_np = signal.squeeze().numpy()
    trimmed_signal, _ = librosa.effects.trim(signal_np, top_db=20)
    if trimmed_signal.size == 0:
        # Pure silence trims to nothing; bail out instead of crashing below.
        return "Audio contains only silence"
    signal = torch.from_numpy(trimmed_signal)

    # 3. Extract the 192-dim ECAPA speaker embedding.
    with torch.no_grad():
        # encode_batch expects [batch, time].
        embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))

        # 4. L2 normalization (important for cosine-based ECAPA models):
        # unit-magnitude vectors make the features volume-invariant.
        embeddings = F.normalize(embeddings, p=2, dim=2)

        # Reshape to (1, 192) for the sklearn SVM.
        embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)

    # 5. Predict, using the same feature names as at training time (avoids
    # sklearn's feature-name mismatch warning with DataFrame-trained models).
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
    return svm_model.predict(df_embeddings)[0]
# --- 4. GRADIO INTERFACE ---
# Single input (audio handed to predict_emotion as a temp-file path) and a
# single text output carrying the predicted label or an error message.
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
    outputs=gr.Textbox(label="Predicted Emotion"),
    title="Speech Emotion Recognition",
    description="Optimized for RAVDESS SVM. If accuracy is low, try to speak closer to the mic and minimize background noise."
)

# Launch the web app only when executed as a script (not when imported).
if __name__ == "__main__":
    demo.launch()