# NOTE: the lines below are residue from the Hugging Face Spaces file
# viewer (status, file size, commit hashes, and a column of line numbers),
# not Python source. Kept as a comment so the script parses:
# Spaces: Sleeping / File size: 2,172 Bytes / 01568da 15a41ac 01568da
import gradio as gr
import numpy as np
import librosa
import tensorflow as tf
# Load trained models
model_female = tf.keras.models.load_model("emotion_recognition_female.h5")
model_male = tf.keras.models.load_model("emotion_recognition_male.h5")
# Constants
FRAME_LENGTH = 2048
HOP_LENGTH = 512
emotion_to_code = {
'neutral': 0, 'happy': 1, 'sad': 2,
'angry': 3, 'fear': 4, 'disgust': 5
}
code_to_emotion = {v: k for k, v in emotion_to_code.items()}
# Preprocess audio
def preprocess_audio(path):
try:
y, sr = librosa.load(path, sr=22050)
y, _ = librosa.effects.trim(y, top_db=25)
if len(y) > 180000:
y = y[:180000]
else:
y = np.pad(y, (0, 180000-len(y)))
return y, sr
except:
sr = 22050
y = np.random.randn(180000) * 0.1
return y, sr
# Extract features (ZCR + RMS + 13 MFCC)
def extract_features(y, sr):
zcr = librosa.feature.zero_crossing_rate(y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
rms = librosa.feature.rms(y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=HOP_LENGTH)
features = np.vstack([zcr, rms, mfcc])
features = features.T # shape (352, 15)
return features
# Predict function
def predict(audio_file, gender):
y, sr = preprocess_audio(audio_file)
features = extract_features(y, sr)
features = np.expand_dims(features, axis=0) # add batch dim
if gender == "female":
preds = model_female.predict(features)
else:
preds = model_male.predict(features)
pred_idx = np.argmax(preds)
emotion = code_to_emotion[pred_idx]
confidence = float(np.max(preds))
return {emotion: confidence}
# Gradio interface
demo = gr.Interface(
fn=predict,
inputs=[
gr.Audio(type="filepath", label="Upload Audio"),
gr.Radio(choices=["female", "male"], label="Select Gender")
],
outputs=gr.Label(num_top_classes=3, label="Predicted Emotion"),
title="🎙️ Speech Emotion Recognition Demo",
description="Upload an audio file and select gender to predict emotion."
)
demo.launch()
# (trailing "|" was file-viewer residue, not source)