import gradio as gr
import torch
import librosa
import numpy as np
import os

# Define PyTorch model class (must match the structure used during conversion)
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_shape, num_classes):
        super().__init__()
        # Adjust this architecture to match your converted model
        self.flatten = torch.nn.Flatten()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(64, num_classes)
        )
        
    def forward(self, x):
        x = self.flatten(x)
        return self.layers(x)

# Create model instance
input_shape = 13 * 128  # n_mfcc * max_length
num_classes = 7  # Number of emotions
model = EmotionClassifier(input_shape, num_classes)

# Load the saved model weights
model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()
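# Note: load_state_dict raises a size-mismatch error if the architecture
# above differs from the one used when the checkpoint was written. The
# weights file is assumed to have been produced with something like
# (hypothetical export step, shown for reference only):
#
#     torch.save(model.state_dict(), 'emotion_model.pt')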

# Define emotions
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
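# The order of this list must match the label encoding used at training
# time: index i of the model output is reported as emotions[i].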

def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
    """Extract MFCC features from an audio file"""
    try:
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        
        # Pad or truncate to fixed length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        
        return mfccs
    except Exception as e:
        print(f"Error in feature extraction: {e}")
        return None
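# The returned array has shape (n_mfcc, max_length) = (13, 128); flattened,
# that is the 13 * 128 = 1664 values expected by input_shape above.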

def predict_emotion(audio):
    """Predict emotion from audio input"""
    try:
        # Gradio may pass None when the user submits without recording/uploading
        if audio is None:
            return {emotion: 0.0 for emotion in emotions}

        # Process audio input
        if isinstance(audio, str):  # File path (type="filepath")
            features = extract_features(audio)
        else:  # Raw audio array (only reached if the input type is "numpy")
            # Gradio passes numpy audio as a (sample_rate, data) tuple
            if isinstance(audio, tuple):
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000

            audio_array = np.asarray(audio_array)

            # Convert to mono if stereo
            if audio_array.ndim > 1:
                audio_array = np.mean(audio_array, axis=1)

            # librosa expects floating-point audio; microphone data arrives as int16
            if np.issubdtype(audio_array.dtype, np.integer):
                audio_array = audio_array.astype(np.float32) / 32768.0

            # Extract features
            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
            
            # Pad or truncate to fixed length
            max_length = 128
            if mfccs.shape[1] < max_length:
                pad_width = max_length - mfccs.shape[1]
                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]
                
            features = mfccs
        
        if features is None:
            return {emotion: 0.0 for emotion in emotions}
        
        # Flatten the features (adjust based on your model's input expectations)
        features_flat = features.reshape(1, -1)
        
        # Convert to PyTorch tensor
        features_tensor = torch.tensor(features_flat, dtype=torch.float32)
        
        # Get predictions
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
        
        # Format results
        # .item() already returns a Python float
        result = {emotion: probabilities[0][i].item() for i, emotion in enumerate(emotions)}
        return result
        
    except Exception as e:
        print(f"Error in prediction: {e}")
        import traceback
        traceback.print_exc()
        return {emotion: 1/len(emotions) for emotion in emotions}

# Create Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=7),
    title="Speech Emotion Recognition",
    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
)

demo.launch()
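
# To run locally (assuming gradio, torch, librosa and numpy are installed):
#
#     python app.py
#
# Gradio serves on http://127.0.0.1:7860 by default; demo.launch(share=True)
# can be used instead to get a temporary public link.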