import gradio as gr
import numpy as np
import librosa
import tensorflow as tf

# Load trained models
model_female = tf.keras.models.load_model("emotion_recognition_female.h5")
model_male = tf.keras.models.load_model("emotion_recognition_male.h5")
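# Both models are assumed to accept input of shape (batch_size, 352, 15),
# i.e. the framewise feature matrix produced by extract_features below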

# Constants
FRAME_LENGTH = 2048
HOP_LENGTH = 512
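# At sr=22050 these give ~93 ms analysis frames with a ~23 ms hop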
emotion_to_code = {
    'neutral': 0, 'happy': 1, 'sad': 2,
    'angry': 3, 'fear': 4, 'disgust': 5
}
code_to_emotion = {v: k for k, v in emotion_to_code.items()}

# Preprocess audio
def preprocess_audio(path):
    try:
        y, sr = librosa.load(path, sr=22050)
        y, _ = librosa.effects.trim(y, top_db=25)
        # Fix the length at 180,000 samples (~8.2 s at 22,050 Hz): truncate or zero-pad
        if len(y) > 180000:
            y = y[:180000]
        else:
            y = np.pad(y, (0, 180000 - len(y)))
        return y, sr
    except Exception:
        # Fallback when loading/decoding fails (e.g., no file provided):
        # return low-amplitude noise of the expected length so the app
        # still responds instead of crashing
        sr = 22050
        y = np.random.randn(180000) * 0.1
        return y, sr

# Extract features (ZCR + RMS + 13 MFCC)
def extract_features(y, sr):
    zcr = librosa.feature.zero_crossing_rate(y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    rms = librosa.feature.rms(y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=HOP_LENGTH)
    features = np.vstack([zcr, rms, mfcc])  # (15, n_frames): 1 ZCR + 1 RMS + 13 MFCC rows
    features = features.T  # time-major (n_frames, 15); 352 frames for 180,000 samples
    return features
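# Illustrative sanity check (uses silence as a stand-in input, so no audio
# file is needed); confirms the features match the shape the models consume:
#   _y = np.zeros(180000, dtype=np.float32)
#   assert extract_features(_y, 22050).shape == (352, 15)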

# Predict function
def predict(audio_file, gender):
    y, sr = preprocess_audio(audio_file)
    features = extract_features(y, sr)
    features = np.expand_dims(features, axis=0)  # add batch dim

    if gender == "female":
        preds = model_female.predict(features)
    else:
        preds = model_male.predict(features)

    # Return the full label -> probability mapping so gr.Label below can
    # display the top-3 classes rather than a single emotion
    return {code_to_emotion[i]: float(p) for i, p in enumerate(preds[0])}

# Gradio interface
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Radio(choices=["female", "male"], label="Select Gender")
    ],
    outputs=gr.Label(num_top_classes=3, label="Predicted Emotion"),
    title="🎙️ Speech Emotion Recognition Demo",
    description="Upload an audio file and select gender to predict emotion."
)

demo.launch()
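
# For a quick smoke test without the UI, predict() can also be called directly,
# e.g. print(predict("sample.wav", "female")) with a local file of your own
# ("sample.wav" is a placeholder name). Passing share=True to demo.launch()
# would additionally serve a temporary public URL.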