# (removed: non-code scrape artifacts — a file-size header, git commit hashes,
#  and a column of gutter line numbers copied alongside the source; they are
#  not Python and prevented the file from parsing)
# Standard library
import os
import warnings

# Third-party
import gradio as gr
import huggingface_hub
import joblib
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier

# --- 1. BOOTSTRAP (Monkey Patch for SpeechBrain 1.0.0 compatibility) ---
# SpeechBrain 1.0.0 still passes the deprecated `use_auth_token` kwarg and
# requests a `custom.py` from the model repo; newer huggingface_hub versions
# reject the former, and some repos don't ship the latter.
orig_download = huggingface_hub.hf_hub_download

def patched_download(*args, **kwargs):
    """Wrap hf_hub_download for SpeechBrain compatibility.

    - Renames the deprecated `use_auth_token` kwarg to `token`.
    - If downloading `custom.py` fails (the repo doesn't ship one), returns
      a local dummy file so EncoderClassifier.from_hparams can proceed.

    Any other download failure is re-raised unchanged.
    """
    if 'use_auth_token' in kwargs:
        kwargs['token'] = kwargs.pop('use_auth_token')
    # Positional signature is hf_hub_download(repo_id, filename, ...).
    fname = kwargs.get('filename') or (args[1] if len(args) > 1 else None)
    try:
        return orig_download(*args, **kwargs)
    except Exception:
        if fname == "custom.py":
            dummy_path = os.path.abspath("dummy_custom.py")
            if not os.path.exists(dummy_path):
                with open(dummy_path, "w") as f:
                    f.write("# Dummy\n")
            return dummy_path
        raise  # bare raise preserves the original traceback (was `raise e`)

huggingface_hub.hf_hub_download = patched_download
warnings.filterwarnings("ignore")

# --- 2. LOAD MODELS ---
# Pickled sklearn SVM trained on L2-normalized 192-dim ECAPA embeddings of
# RAVDESS (8 emotion classes). Must be present in the working directory.
SVM_PATH = 'ravdess_svm_speechbrain_ecapa_voxceleb_no_processor_cv_8class.pkl'
print(f"Loading SVM: {SVM_PATH}")
svm_model = joblib.load(SVM_PATH)

# Downloads (or reuses a cached copy of) the pretrained ECAPA-TDNN speaker
# encoder from the Hugging Face Hub; used here purely as a feature extractor.
print("Loading SpeechBrain Feature Extractor...")
feature_extractor = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb"
)

# Standard RAVDESS mapping
# NOTE(review): the SVM appears to return label strings directly (see
# predict_emotion); this list is not indexed anywhere in the visible code.
EMOTIONS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# --- 3. INFERENCE LOGIC ---
def predict_emotion(audio_path):
    """Predict a RAVDESS emotion label for a recorded/uploaded audio file.

    Pipeline: load -> resample to 16 kHz -> downmix to mono -> trim silence
    -> ECAPA embedding -> L2 normalization -> SVM prediction.

    Args:
        audio_path: Filesystem path to the audio clip (Gradio passes None
            when no audio was provided).

    Returns:
        The predicted emotion label string, or a human-readable error message.
    """
    if audio_path is None:
        return "No audio provided"

    # 1. Load and resample to 16 kHz (the rate the SpeechBrain model expects).
    signal, fs = torchaudio.load(audio_path)
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
        signal = resampler(signal)

    # Downmix multi-channel recordings to mono: a stereo upload would
    # otherwise survive the squeeze() below as a [2, time] array and produce
    # a malformed batch for encode_batch.
    if signal.dim() > 1 and signal.size(0) > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # 2. Trim leading/trailing silence (VAD-ish): dead air biases the model
    # toward 'calm'.
    signal_np = signal.squeeze().numpy()
    trimmed_signal, _ = librosa.effects.trim(signal_np, top_db=20)
    if trimmed_signal.size == 0:
        # Pure silence trims to nothing; bail out instead of crashing below.
        return "Audio contains only silence"
    signal = torch.from_numpy(trimmed_signal)

    # 3. Extract the 192-dim ECAPA speaker embedding.
    with torch.no_grad():
        # encode_batch expects [batch, time].
        embeddings = feature_extractor.encode_batch(signal.unsqueeze(0))

        # 4. L2 normalization (important for cosine-based ECAPA models):
        # unit-magnitude vectors make the features volume-invariant.
        embeddings = F.normalize(embeddings, p=2, dim=2)

        # Reshape to (1, 192) for the sklearn SVM.
        embeddings = embeddings.cpu().numpy().squeeze().reshape(1, -1)

    # 5. Predict, using the same feature names as at training time (avoids
    # sklearn's feature-name mismatch warning with DataFrame-trained models).
    feature_names = [f"{i}_speechbrain_embedding" for i in range(192)]
    df_embeddings = pd.DataFrame(embeddings, columns=feature_names)
    return svm_model.predict(df_embeddings)[0]
# --- 4. GRADIO INTERFACE ---
# Single input (audio handed to predict_emotion as a temp-file path) and a
# single text output carrying the predicted label or an error message.
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
    outputs=gr.Textbox(label="Predicted Emotion"),
    title="Speech Emotion Recognition",
    description="Optimized for RAVDESS SVM. If accuracy is low, try to speak closer to the mic and minimize background noise."
)

# Launch the web app only when executed as a script (not when imported).
if __name__ == "__main__":
    demo.launch()