SamOp224's picture
Upload SER models, predict script, and config
6cd0752 verified
#!/usr/bin/env python3
"""
Speech Emotion Recognition - Prediction Script
Usage: python predict.py <path_to_wav_file> [model_dir]
"""
import os, sys, numpy as np, librosa
# Sampling rate handed to librosa.load and used as the default `sr` in extract_features.
SAMPLE_RATE = 16000
# Number of time frames each feature matrix is zero-padded or truncated to.
MAX_LEN = 200
# Number of mel bands requested for the mel spectrogram.
N_MELS = 128
# Number of MFCC coefficients requested.
N_MFCC = 40
# FFT size for the mel spectrogram and MFCCs; also the frame length for ZCR and RMS.
N_FFT = 2048
# Hop between successive analysis frames, shared by all four feature extractors.
HOP_LENGTH = 512
# Class names, indexed by position in the fusion model's output vector.
# NOTE(review): the order presumably has to match the label encoding used at training time — confirm.
EMOTION_LABELS = ["angry", "disgust", "fear", "happy", "neutral", "sad"]
def extract_features(wav, sr=SAMPLE_RATE, max_len=MAX_LEN):
    """Build the fixed-size spectral feature tensor for one waveform.

    Four librosa features are computed with a shared hop length and stacked
    along the feature axis: log-mel spectrogram, MFCCs, zero-crossing rate
    and RMS energy. Each feature row is standardised over time, then the
    time axis is zero-padded or truncated to ``max_len`` frames.

    Args:
        wav: Audio samples as accepted by the librosa feature functions.
        sr: Sampling rate of ``wav``.
        max_len: Number of time frames in the returned tensor.

    Returns:
        A float32 array with a trailing channel axis of size 1. For a mono
        signal and the module defaults this is (170, 200, 1).
    """
    spectral_kwargs = {"n_fft": N_FFT, "hop_length": HOP_LENGTH}
    framing_kwargs = {"frame_length": N_FFT, "hop_length": HOP_LENGTH}

    # Log-scaled mel power, referenced to the spectrogram's own peak.
    mel_power = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=N_MELS, **spectral_kwargs)
    log_mel = librosa.power_to_db(mel_power, ref=np.max)

    cepstral = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=N_MFCC, **spectral_kwargs)
    crossings = librosa.feature.zero_crossing_rate(wav, **framing_kwargs)
    energy = librosa.feature.rms(y=wav, **framing_kwargs)

    stacked = np.vstack([log_mel, cepstral, crossings, energy])

    # Standardise every feature row over time; the epsilon guards constant rows.
    row_mean = stacked.mean(axis=1, keepdims=True)
    row_std = stacked.std(axis=1, keepdims=True)
    stacked = (stacked - row_mean) / (row_std + 1e-8)

    # Force a fixed number of frames: crop long clips, right-pad short ones with zeros.
    n_frames = stacked.shape[1]
    if n_frames >= max_len:
        stacked = stacked[:, :max_len]
    else:
        stacked = np.pad(stacked, ((0, 0), (0, max_len - n_frames)), mode="constant")

    return np.expand_dims(stacked, axis=2).astype(np.float32)
def extract_emotion2vec_embedding(wav_path):
    """Return a 768-dimensional emotion2vec utterance embedding for a file.

    The ``iic/emotion2vec_base`` model is created through funasr on every
    call. The resulting features are flattened, cut to 768 values and
    zero-padded if fewer were produced.

    Any failure (funasr missing, model unavailable, inference error, ...)
    is reported on stdout and an all-zero vector is returned instead, so
    this function never raises.

    Args:
        wav_path: Path of the audio file passed to the model.

    Returns:
        A float32 array of shape (768,).
    """
    try:
        # Imported inside the try so a missing funasr install also takes the fallback.
        from funasr import AutoModel

        e2v_model = AutoModel(model="iic/emotion2vec_base", hub="hf", disable_update=True)
        outputs = e2v_model.generate(
            wav_path,
            output_dir=None,
            granularity="utterance",
            extract_embedding=True,
        )
        vector = np.array(outputs[0]["feats"]).flatten()[:768]
        shortfall = 768 - len(vector)
        if shortfall > 0:
            vector = np.pad(vector, (0, shortfall))
        return vector.astype(np.float32)
    except Exception as exc:
        print(f"emotion2vec failed: {exc}, using zeros")
        return np.zeros(768, dtype=np.float32)
def predict_emotion(file_path, model_dir="./outputs"):
    """Predict the emotion of one audio file and print a ranked report.

    The file is loaded at ``SAMPLE_RATE``, converted to the spectral feature
    tensor and the emotion2vec embedding, and both are fed to the fusion
    model stored as ``fusion_model.keras`` inside ``model_dir``.

    Args:
        file_path: Path of the audio file to classify.
        model_dir: Directory that contains ``fusion_model.keras``.

    Returns:
        A tuple ``(label, conf, scores)`` where ``label`` is the winning
        entry of ``EMOTION_LABELS``, ``conf`` is its probability in percent
        and ``scores`` maps every label to its probability in percent.
        ``conf`` and all ``scores`` values are plain Python floats.

    Raises:
        Whatever ``librosa.load``, ``tf.keras.models.load_model`` or
        ``Model.predict`` raise (unreadable audio, missing model file, ...).
    """
    # Imported here rather than at module level — presumably to keep the
    # heavy TensorFlow import off the import path of this module.
    import tensorflow as tf

    # Only the samples are needed: the rate is fixed by the `sr` argument.
    wav, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    spec = extract_features(wav)[np.newaxis]  # (1, 170, 200, 1)
    e2v = extract_emotion2vec_embedding(file_path)[np.newaxis]  # (1, 768)

    fusion = tf.keras.models.load_model(os.path.join(model_dir, "fusion_model.keras"))
    probs = fusion.predict({"spec_input": spec, "e2v_input": e2v}, verbose=0)[0]

    idx = int(np.argmax(probs))
    label = EMOTION_LABELS[idx]
    # Convert to a built-in float so the return value has the same type as
    # the per-class scores (and is e.g. JSON-serialisable).
    conf = float(probs[idx] * 100)

    print(f"\nPredicted Emotion: {label.upper()}")
    print(f"Confidence: {conf:.1f}%\n")

    bar_w = 40
    # Highest probability first; sorted() is stable, so ties keep label order.
    ranking = sorted(range(len(EMOTION_LABELS)), key=lambda i: probs[i], reverse=True)
    for i in ranking:
        filled = int(probs[i] * bar_w)
        bar = "█" * filled + "░" * (bar_w - filled)
        marker = " ◄" if i == idx else ""
        print(f" {EMOTION_LABELS[i]:>8s} [{bar}] {probs[i]*100:5.1f}%{marker}")

    scores = {name: float(probs[i] * 100) for i, name in enumerate(EMOTION_LABELS)}
    return label, conf, scores
if __name__ == "__main__":
    # Command line: predict.py <wav_file> [model_dir]
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python predict.py <wav_file> [model_dir]")
        sys.exit(1)

    # The model directory is optional and falls back to ./outputs.
    if len(cli_args) > 1:
        chosen_model_dir = cli_args[1]
    else:
        chosen_model_dir = "./outputs"
    predict_emotion(cli_args[0], chosen_model_dir)