import gradio as gr
import torch
import librosa
import numpy as np
import soundfile as sf
import os
import tempfile
import traceback

# PyTorch model class mirroring the architecture of the converted Keras model
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_features, hidden_sizes, num_classes):
        super().__init__()
        
        # Build the sequential model
        layers = []
        prev_size = input_features
        
        # Add hidden layers
        for size in hidden_sizes:
            layers.append(torch.nn.Linear(prev_size, size))
            layers.append(torch.nn.ReLU())
            prev_size = size
        
        # Add output layer
        layers.append(torch.nn.Linear(prev_size, num_classes))
        
        # Create the model
        self.model = torch.nn.Sequential(*layers)
        
    def forward(self, x):
        return self.model(x)
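
# For the parameters used below, EmotionClassifier(768, [256, 128, 64], 8)
# builds: Linear(768, 256) -> ReLU -> Linear(256, 128) -> ReLU
#         -> Linear(128, 64) -> ReLU -> Linear(64, 8)
# (this assumes the converted Keras model used ReLU activations throughout)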

# Emotion labels - the order must match the model's output classes
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised", "calm"]  # 8 classes, matching the model's output layer

# Load the PyTorch model
try:
    print("Loading PyTorch model...")
    
    # Parameters determined from the Keras model
    input_features = 768  # From the Keras model's first layer weights
    hidden_sizes = [256, 128, 64]  # From the Keras model architecture
    num_classes = 8  # From the Keras model's output layer
    
    model = EmotionClassifier(input_features, hidden_sizes, num_classes)
    
    model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    traceback.print_exc()
    model = None
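
# A rough sketch of how emotion_model.pt may have been produced from the
# original Keras model (hypothetical filenames; the actual conversion code
# may differ):
#
#   keras_model = tf.keras.models.load_model('emotion_model.h5')
#   pt_model = EmotionClassifier(768, [256, 128, 64], 8)
#   linears = [m for m in pt_model.model if isinstance(m, torch.nn.Linear)]
#   for linear, dense in zip(linears, keras_model.layers):
#       w, b = dense.get_weights()               # Keras kernel is (in, out)
#       linear.weight.data = torch.tensor(w.T)   # PyTorch Linear is (out, in)
#       linear.bias.data = torch.tensor(b)
#   torch.save(pt_model.state_dict(), 'emotion_model.pt')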

def extract_features(audio_path, sample_rate=16000):
    """Extract features from an audio file that match what your model expects"""
    try:
        print(f"Extracting features from {audio_path}")
        audio, sr = librosa.load(audio_path, sr=sample_rate)
        
        # The model expects a 768-dimensional feature vector, so extract a
        # rich set of statistics (MFCCs, chroma, mel spectrogram, spectral
        # contrast) and pad/truncate the concatenation to exactly 768 values
        
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs.T, axis=0)
        mfccs_var = np.var(mfccs.T, axis=0)
        
        # Extract spectral features
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        chroma_mean = np.mean(chroma.T, axis=0)
        chroma_var = np.var(chroma.T, axis=0)
        
        # Extract mel spectrogram
        mel = librosa.feature.melspectrogram(y=audio, sr=sr)
        mel_mean = np.mean(mel.T, axis=0)
        mel_var = np.var(mel.T, axis=0)
        
        # Extract spectral contrast
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        contrast_mean = np.mean(contrast.T, axis=0)
        contrast_var = np.var(contrast.T, axis=0)
        
        # Combine all features
        features = np.hstack([
            mfccs_mean, mfccs_var,
            chroma_mean, chroma_var,
            mel_mean[:200], mel_var[:200],  # Cap mel features at 200 each (a no-op with the default n_mels=128)
            contrast_mean, contrast_var
        ])
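
        # Size check with librosa defaults (n_mfcc=20, 12 chroma bins,
        # n_mels=128, 7 contrast bands):
        #   40 (MFCC) + 24 (chroma) + 256 (mel) + 14 (contrast) = 334,
        #   so the padding branch below brings the vector up to 768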
        
        # Ensure we have exactly 768 features
        if len(features) < 768:
            # Pad with zeros if needed
            features = np.pad(features, (0, 768 - len(features)))
        elif len(features) > 768:
            # Truncate if too many
            features = features[:768]
            
        print(f"Extracted {len(features)} features")
        return features
    except Exception as e:
        print(f"Error extracting features: {e}")
        traceback.print_exc()
        return None

def predict_emotion(audio):
    """Predict emotion from audio input"""
    if model is None:
        return {emotion: 1/len(emotions) for emotion in emotions}
    
    try:
        print(f"Processing audio input: {type(audio)}")
        
        # Gradio's Audio component with type="filepath" passes a path string,
        # but handle raw arrays too in case the input type is changed
        if audio is None:
            return {emotion: 1/len(emotions) for emotion in emotions}
        if isinstance(audio, str):  # File path
            features = extract_features(audio)
        else:  # Raw audio; Gradio's numpy format is (sample_rate, data)
            if isinstance(audio, tuple):
                sample_rate, audio_array = audio
            else:
                audio_array = audio
                sample_rate = 16000

            # Write to a temporary file for extract_features; close the
            # handle first so the write also works on Windows, and clean up
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_path = temp_file.name
            try:
                sf.write(temp_path, audio_array, sample_rate)
                features = extract_features(temp_path)
            finally:
                os.remove(temp_path)
        
        if features is None:
            return {emotion: 1/len(emotions) for emotion in emotions}
        
        # Convert to a float tensor and add a batch dimension: shape (1, 768)
        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
        
        # Make prediction
        with torch.no_grad():
            outputs = model(features_tensor)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
        
        # Map each class probability to its emotion label
        result = {emotion: probabilities[0][i].item() for i, emotion in enumerate(emotions)}
        print(f"Prediction result: {result}")
        return result
    except Exception as e:
        print(f"Error in prediction: {e}")
        traceback.print_exc()
        return {emotion: 1/len(emotions) for emotion in emotions}
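
# Quick sanity check outside the UI (hypothetical file path):
#   probs = predict_emotion('sample.wav')
#   print(max(probs, key=probs.get))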

# Create Gradio interface
demo = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=8),  # Show all 8 emotion classes
    title="Speech Emotion Recognition",
    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, surprised, and calm emotions."
)

if __name__ == "__main__":
    demo.launch()
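
# To run locally (assuming emotion_model.pt sits next to this script):
#   pip install gradio torch librosa soundfile numpy
#   python app.py
# Gradio serves the interface at http://127.0.0.1:7860 by default.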