SamOp224
/

speech-emotion-recognition

Audio Classification

speech-emotion-recognition

Model card Files Files and versions

speech-emotion-recognition / outputs /predict.py

SamOp224's picture

Upload SER models, predict script, and config

6cd0752 verified about 1 month ago

history blame contribute delete

3.08 kB

	#!/usr/bin/env python3
	"""
	Speech Emotion Recognition - Prediction Script
	Usage: python predict.py <path_to_wav_file> [model_dir]
	"""
	import os, sys, numpy as np, librosa

	# Sampling rate passed to librosa.load and used as the default `sr` for feature extraction.
	SAMPLE_RATE = 16000
	# Number of time frames each feature matrix is padded or truncated to.
	MAX_LEN = 200
	# Number of mel bands requested from librosa.feature.melspectrogram.
	N_MELS = 128
	# Number of MFCC coefficients requested from librosa.feature.mfcc.
	N_MFCC = 40
	# FFT window size; also used as the frame length for the ZCR and RMS features.
	N_FFT = 2048
	# Hop between successive analysis frames, shared by all four feature extractors.
	HOP_LENGTH = 512
	# Class names, looked up by the argmax index of the fusion model's output.
	# NOTE(review): order presumably matches the label order used in training - confirm.
	EMOTION_LABELS = ["angry", "disgust", "fear", "happy", "neutral", "sad"]

	def extract_features(wav, sr=SAMPLE_RATE, max_len=MAX_LEN):
	    """Build the fixed-length spectral feature tensor for one waveform.

	    The dB-scaled mel spectrogram, MFCCs, zero-crossing rate and RMS energy
	    are stacked row-wise, each row is standardised over time, and the time
	    axis is zero-padded or truncated to ``max_len`` frames.

	    Args:
	        wav: audio samples, handed unchanged to the librosa feature functions.
	        sr: sampling rate given to librosa (defaults to SAMPLE_RATE).
	        max_len: number of time frames in the returned tensor.

	    Returns:
	        A float32 array shaped ``(feature_rows, max_len, 1)``.
	    """
	    spectral_kw = dict(n_fft=N_FFT, hop_length=HOP_LENGTH)
	    framing_kw = dict(frame_length=N_FFT, hop_length=HOP_LENGTH)

	    mel_power = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=N_MELS, **spectral_kw)
	    stacked = np.vstack([
	        librosa.power_to_db(mel_power, ref=np.max),
	        librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=N_MFCC, **spectral_kw),
	        librosa.feature.zero_crossing_rate(wav, **framing_kw),
	        librosa.feature.rms(y=wav, **framing_kw),
	    ])

	    # Per-row standardisation; the epsilon guards rows with zero variance.
	    row_mean = stacked.mean(axis=1, keepdims=True)
	    row_std = stacked.std(axis=1, keepdims=True)
	    scaled = (stacked - row_mean) / (row_std + 1e-8)

	    n_frames = scaled.shape[1]
	    if n_frames >= max_len:
	        fixed = scaled[:, :max_len]
	    else:
	        # Zero padding is neutral here because each row is already zero-mean.
	        fixed = np.pad(scaled, ((0, 0), (0, max_len - n_frames)), mode="constant")

	    # Append the trailing channel axis.
	    return fixed[:, :, np.newaxis].astype(np.float32)

	def extract_emotion2vec_embedding(wav_path):
	    """Return a 768-element utterance embedding from emotion2vec.

	    ``funasr`` is imported lazily and the ``iic/emotion2vec_base`` model is
	    built on each call. The ``feats`` entry of the first result is flattened,
	    then truncated or zero-padded to exactly 768 values.

	    This is deliberately best-effort: if anything fails (funasr missing,
	    model unavailable, unreadable audio, unexpected result layout) a warning
	    is written to stderr and an all-zero vector is returned instead of
	    raising. The warning goes to stderr so that it does not mix with the
	    prediction report printed on stdout.

	    Args:
	        wav_path: path of the audio file, passed to ``AutoModel.generate``.

	    Returns:
	        A float32 array of shape ``(768,)``.
	    """
	    emb_dim = 768
	    try:
	        from funasr import AutoModel

	        model = AutoModel(model="iic/emotion2vec_base", hub="hf", disable_update=True)
	        res = model.generate(wav_path, output_dir=None, granularity="utterance", extract_embedding=True)
	        emb = np.array(res[0]["feats"]).flatten()[:emb_dim]
	        if len(emb) < emb_dim:
	            emb = np.pad(emb, (0, emb_dim - len(emb)))
	        return emb.astype(np.float32)
	    except Exception as e:
	        # Best-effort fallback: report on stderr, keep stdout for results.
	        print(f"emotion2vec failed: {e}, using zeros", file=sys.stderr)
	        return np.zeros(emb_dim, dtype=np.float32)

	def predict_emotion(file_path, model_dir="./outputs"):
	    """Classify the emotion in an audio file and print a ranked report.

	    The audio is loaded at SAMPLE_RATE, converted to the spectral feature
	    tensor and the emotion2vec embedding, and both are fed to the Keras
	    model stored as ``fusion_model.keras`` inside ``model_dir``.

	    Args:
	        file_path: path of the audio file to classify.
	        model_dir: directory that contains ``fusion_model.keras``.

	    Returns:
	        A ``(label, confidence, scores)`` tuple: the winning entry of
	        EMOTION_LABELS, its score as a percentage, and a dict mapping every
	        label to its percentage. All numbers are plain Python floats, so the
	        result can be serialised (e.g. with ``json``) without conversion.
	    """
	    # Imported here so that importing this module does not pull in TensorFlow.
	    import tensorflow as tf

	    # The returned sampling rate is not needed: it was requested explicitly.
	    wav, _ = librosa.load(file_path, sr=SAMPLE_RATE)
	    spec = extract_features(wav)[np.newaxis]  # (1, 170, 200, 1)
	    e2v = extract_emotion2vec_embedding(file_path)[np.newaxis]  # (1, 768)

	    fusion = tf.keras.models.load_model(os.path.join(model_dir, "fusion_model.keras"))
	    probs = fusion.predict({"spec_input": spec, "e2v_input": e2v}, verbose=0)[0]

	    idx = int(np.argmax(probs))
	    label = EMOTION_LABELS[idx]
	    # Convert to a built-in float so the return types match the scores dict.
	    conf = float(probs[idx] * 100)
	    scores = {name: float(probs[i] * 100) for i, name in enumerate(EMOTION_LABELS)}

	    print(f"\nPredicted Emotion: {label.upper()}")
	    print(f"Confidence: {conf:.1f}%\n")

	    # One bar per class, highest probability first; the winner gets a marker.
	    bar_w = 40
	    for i in sorted(range(len(EMOTION_LABELS)), key=lambda i: -probs[i]):
	        bl = int(probs[i] * bar_w)
	        bar = "█" * bl + "░" * (bar_w - bl)
	        m = " ◄" if i == idx else ""
	        print(f" {EMOTION_LABELS[i]:>8s} [{bar}] {probs[i]*100:5.1f}%{m}")

	    return label, conf, scores

	if __name__ == "__main__":
	    # Command-line entry point: <wav_file> is required, [model_dir] is optional.
	    cli_args = sys.argv[1:]
	    if not cli_args:
	        print("Usage: python predict.py <wav_file> [model_dir]")
	        sys.exit(1)
	    wav_file = cli_args[0]
	    chosen_dir = cli_args[1] if len(cli_args) > 1 else "./outputs"
	    predict_emotion(wav_file, chosen_dir)