Instructions to use SamOp224/speech-emotion-recognition with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Keras
How to use SamOp224/speech-emotion-recognition with Keras:
```python
# Available backend options are: "jax", "torch", "tensorflow".
import os
os.environ["KERAS_BACKEND"] = "jax"

import keras

model = keras.saving.load_model("hf://SamOp224/speech-emotion-recognition")
```
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| Speech Emotion Recognition - Prediction Script | |
| Usage: python predict.py <path_to_wav_file> [model_dir] | |
| """ | |
| import os, sys, numpy as np, librosa | |
# --- Audio loading -----------------------------------------------------------
# Sampling rate requested from librosa.load and used as the default `sr`
# of extract_features.
SAMPLE_RATE = 16_000

# --- Framing shared by every frame-level feature -----------------------------
# Passed as n_fft (mel / MFCC) and as frame_length (ZCR / RMS).
N_FFT = 2048
# Hop between successive frames, in samples.
HOP_LENGTH = 512

# --- Feature sizes -----------------------------------------------------------
# Number of mel bands requested from the mel spectrogram.
N_MELS = 128
# Number of MFCC coefficients requested.
N_MFCC = 40
# Default number of time frames kept per clip (shorter clips are zero-padded,
# longer ones truncated in extract_features).
MAX_LEN = 200

# --- Model output ------------------------------------------------------------
# Class names, indexed by the position of each probability in the model output.
EMOTION_LABELS = [
    "angry",
    "disgust",
    "fear",
    "happy",
    "neutral",
    "sad",
]
def extract_features(wav, sr=SAMPLE_RATE, max_len=MAX_LEN):
    """Build a fixed-width, per-row normalised feature map from a waveform.

    The log-mel spectrogram, MFCCs, zero-crossing rate and RMS energy are
    computed with the same window (N_FFT) and hop (HOP_LENGTH), stacked along
    the first axis, standardised row by row over time, and then zero-padded
    or truncated to exactly ``max_len`` frames.

    Args:
        wav: Audio samples as accepted by the librosa feature functions.
        sr: Sampling rate of ``wav``.
        max_len: Number of time frames in the returned array.

    Returns:
        A float32 array with a trailing singleton channel axis, i.e.
        ``(rows, max_len, 1)`` where ``rows`` is the stacked feature count.
    """
    spectral_kwargs = {"n_fft": N_FFT, "hop_length": HOP_LENGTH}
    window_kwargs = {"frame_length": N_FFT, "hop_length": HOP_LENGTH}

    log_mel = librosa.power_to_db(
        librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=N_MELS, **spectral_kwargs),
        ref=np.max,
    )
    cepstral = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=N_MFCC, **spectral_kwargs)
    crossings = librosa.feature.zero_crossing_rate(wav, **window_kwargs)
    energy = librosa.feature.rms(y=wav, **window_kwargs)

    stacked = np.vstack([log_mel, cepstral, crossings, energy])

    # Standardise every feature row over time; the epsilon keeps constant
    # rows (zero std) from dividing by zero.
    row_mean = stacked.mean(axis=1, keepdims=True)
    row_std = stacked.std(axis=1, keepdims=True)
    normalized = (stacked - row_mean) / (row_std + 1e-8)

    n_frames = normalized.shape[1]
    if n_frames >= max_len:
        fixed = normalized[:, :max_len]
    else:
        # Pad only the time axis, on the right, with zeros.
        fixed = np.pad(normalized, ((0, 0), (0, max_len - n_frames)), mode="constant")

    return fixed[:, :, np.newaxis].astype(np.float32)
def extract_emotion2vec_embedding(wav_path):
    """Return a 768-dimensional emotion2vec utterance embedding for a file.

    The ``iic/emotion2vec_base`` model is loaded through ``funasr`` on every
    call. The returned features are flattened, then truncated or right-padded
    with zeros to exactly 768 values.

    This is best-effort: if anything fails (``funasr`` not installed, model
    download error, unreadable audio, ...), a message is printed and an
    all-zero vector is returned instead of raising.

    Args:
        wav_path: Path of the audio file handed to the model.

    Returns:
        A float32 array of shape ``(768,)``.
    """
    embedding_dim = 768
    try:
        # Imported lazily so a missing funasr install hits the fallback below.
        from funasr import AutoModel

        e2v_model = AutoModel(model="iic/emotion2vec_base", hub="hf", disable_update=True)
        outputs = e2v_model.generate(
            wav_path,
            output_dir=None,
            granularity="utterance",
            extract_embedding=True,
        )
        vector = np.array(outputs[0]["feats"]).flatten()[:embedding_dim]
        shortfall = embedding_dim - len(vector)
        if shortfall > 0:
            vector = np.pad(vector, (0, shortfall))
        return vector.astype(np.float32)
    except Exception as e:
        print(f"emotion2vec failed: {e}, using zeros")
        return np.zeros(embedding_dim, dtype=np.float32)
def predict_emotion(file_path, model_dir="./outputs"):
    """Predict the emotion of one audio file and print a probability chart.

    The audio is loaded at SAMPLE_RATE, turned into the spectrogram-style
    feature map and the emotion2vec embedding, and both are fed to the fusion
    model stored as ``fusion_model.keras`` inside ``model_dir``.

    Args:
        file_path: Path of the audio file to classify.
        model_dir: Directory that contains ``fusion_model.keras``.

    Returns:
        A tuple ``(label, confidence, per_label)`` where ``label`` is the
        winning entry of EMOTION_LABELS, ``confidence`` is its probability in
        percent as a Python ``float``, and ``per_label`` maps every label to
        its probability in percent (also Python floats).
    """
    # Imported lazily so importing this module does not pull in TensorFlow.
    import tensorflow as tf

    wav, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    spec = extract_features(wav)[np.newaxis]  # (1, 170, 200, 1)
    e2v = extract_emotion2vec_embedding(file_path)[np.newaxis]  # (1, 768)

    fusion = tf.keras.models.load_model(os.path.join(model_dir, "fusion_model.keras"))
    probs = fusion.predict({"spec_input": spec, "e2v_input": e2v}, verbose=0)[0]

    # Convert NumPy scalars to builtins so the returned confidence has the
    # same type as the values in the per-label dict (and is JSON-friendly).
    idx = int(np.argmax(probs))
    label = EMOTION_LABELS[idx]
    conf = float(probs[idx] * 100)

    print(f"\nPredicted Emotion: {label.upper()}")
    print(f"Confidence: {conf:.1f}%\n")

    # One text bar per label, most probable first.
    bar_w = 40
    for i in sorted(range(len(EMOTION_LABELS)), key=lambda i: -probs[i]):
        bl = int(probs[i] * bar_w)
        bar = "█" * bl + "░" * (bar_w - bl)
        m = " ◄" if i == idx else ""
        print(f" {EMOTION_LABELS[i]:>8s} [{bar}] {probs[i]*100:5.1f}%{m}")

    per_label = {name: float(probs[i] * 100) for i, name in enumerate(EMOTION_LABELS)}
    return label, conf, per_label
if __name__ == "__main__":
    # Command line: predict.py <wav_file> [model_dir]
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python predict.py <wav_file> [model_dir]")
        sys.exit(1)
    # The model directory is optional and defaults to ./outputs.
    chosen_model_dir = cli_args[1] if len(cli_args) > 1 else "./outputs"
    predict_emotion(cli_args[0], chosen_model_dir)