HaryaniAnjali committed · verified
Commit 6069c51 · 1 Parent(s): 6c8529b

Update app.py

Files changed (1):
  1. app.py +51 -26

app.py CHANGED
@@ -1,14 +1,40 @@
  import gradio as gr
- import tensorflow as tf
+ import torch
  import librosa
  import numpy as np
  import os

- # Load the model directly from the .h5 file
- model_path = os.path.join(os.path.dirname(__file__), 'wav2vec_model.h5')
- model = tf.keras.models.load_model(model_path)
+ # Define PyTorch model class (must match the structure used during conversion)
+ class EmotionClassifier(torch.nn.Module):
+     def __init__(self, input_shape, num_classes):
+         super().__init__()
+         # Adjust this architecture to match your converted model
+         self.flatten = torch.nn.Flatten()
+         self.layers = torch.nn.Sequential(
+             torch.nn.Linear(input_shape, 128),
+             torch.nn.ReLU(),
+             torch.nn.Dropout(0.3),
+             torch.nn.Linear(128, 64),
+             torch.nn.ReLU(),
+             torch.nn.Dropout(0.3),
+             torch.nn.Linear(64, num_classes)
+         )
+
+     def forward(self, x):
+         x = self.flatten(x)
+         return self.layers(x)
+
+ # Create model instance
+ input_shape = 13 * 128  # n_mfcc * max_length
+ num_classes = 7  # Number of emotions
+ model = EmotionClassifier(input_shape, num_classes)

- # Define emotions list
+ # Load the saved model weights
+ model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
+ model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+ model.eval()
+
+ # Define emotions
  emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

  def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
@@ -32,30 +58,25 @@ def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
          return None

  def predict_emotion(audio):
-     """Predict emotion from audio input
-
-     This function accepts both file path (when uploading) and audio array
-     (when recording via microphone) as input
-     """
+     """Predict emotion from audio input"""
      try:
-         # Check if audio is a file path or audio array
+         # Process audio input
          if isinstance(audio, str):  # File path
              features = extract_features(audio)
          else:  # Audio array from microphone
-             # If audio is a tuple (audio array, sample rate)
+             # Handle microphone input
              if isinstance(audio, tuple):
                  audio_array, sample_rate = audio
              else:
-                 # If only audio array is provided, assume sample rate
                  audio_array = audio
                  sample_rate = 16000

              # Convert to mono if stereo
-             if len(audio_array.shape) > 1:
+             if len(np.array(audio_array).shape) > 1:
                  audio_array = np.mean(audio_array, axis=1)

              # Extract features
-             mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
+             mfccs = librosa.feature.mfcc(y=np.array(audio_array), sr=sample_rate, n_mfcc=13)

              # Pad or truncate to fixed length
              max_length = 128
@@ -70,30 +91,34 @@ def predict_emotion(audio):
          if features is None:
              return {emotion: 0.0 for emotion in emotions}

-         # Reshape for model input
-         features = np.expand_dims(features, axis=0)
+         # Flatten the features (adjust based on your model's input expectations)
+         features_flat = features.reshape(1, -1)
+
+         # Convert to PyTorch tensor
+         features_tensor = torch.tensor(features_flat, dtype=torch.float32)

-         # Make prediction
-         predictions = model.predict(features)
+         # Get predictions
+         with torch.no_grad():
+             outputs = model(features_tensor)
+             probabilities = torch.nn.functional.softmax(outputs, dim=1)

          # Format results
-         result = {emotion: float(predictions[0][i]) for i, emotion in enumerate(emotions)}
+         result = {emotion: float(probabilities[0][i].item()) for i, emotion in enumerate(emotions)}
          return result

      except Exception as e:
          print(f"Error in prediction: {e}")
-         return {emotion: 0.0 for emotion in emotions}
+         import traceback
+         traceback.print_exc()
+         return {emotion: 1/len(emotions) for emotion in emotions}

- # Create Gradio interface with both file upload and microphone
+ # Create Gradio interface
  demo = gr.Interface(
      fn=predict_emotion,
      inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
      outputs=gr.Label(num_top_classes=7),
      title="Speech Emotion Recognition",
-     description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions.",
-     examples=[
-         ["example1.wav"],  # Add example files here if you have them
-     ]
+     description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
  )

  demo.launch()
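Note on the weights file: load_state_dict only succeeds if emotion_model.pt was saved from a module with exactly this layer layout, which is what the "must match the structure used during conversion" comment is warning about. The conversion script itself is not part of this commit; the following is a minimal sketch of how the old wav2vec_model.h5 weights could have been ported, assuming (an assumption, not something the diff shows) that the Keras model was the same Dense(128) → Dense(64) → Dense(7) stack:

# Hypothetical one-off conversion script -- NOT part of this commit.
import tensorflow as tf
import torch

# EmotionClassifier copied verbatim from app.py
# (importing app.py directly would trigger demo.launch()).
class EmotionClassifier(torch.nn.Module):
    def __init__(self, input_shape, num_classes):
        super().__init__()
        self.flatten = torch.nn.Flatten()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 128), torch.nn.ReLU(), torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 64), torch.nn.ReLU(), torch.nn.Dropout(0.3),
            torch.nn.Linear(64, num_classes))

    def forward(self, x):
        return self.layers(self.flatten(x))

keras_model = tf.keras.models.load_model('wav2vec_model.h5')
pt_model = EmotionClassifier(13 * 128, 7)

# Pair up the trainable layers; Dropout/ReLU/Flatten carry no weights.
dense_layers = [l for l in keras_model.layers if isinstance(l, tf.keras.layers.Dense)]
linear_layers = [m for m in pt_model.layers if isinstance(m, torch.nn.Linear)]
assert len(dense_layers) == len(linear_layers), "architectures do not line up"

with torch.no_grad():
    for dense, linear in zip(dense_layers, linear_layers):
        kernel, bias = dense.get_weights()               # Keras kernel is (in, out)
        linear.weight.copy_(torch.from_numpy(kernel.T))  # nn.Linear stores (out, in)
        linear.bias.copy_(torch.from_numpy(bias))

torch.save(pt_model.state_dict(), 'emotion_model.pt')

The transpose is the step that is easy to get wrong: Keras stores Dense kernels as (in_features, out_features), while torch.nn.Linear stores weights as (out_features, in_features).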
 
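A quick way to exercise the new PyTorch path end to end, since gr.Audio(type="filepath") hands predict_emotion a plain path string: synthesize a short tone, write it to a WAV file, and classify it. A minimal sketch, assuming the soundfile package is installed and predict_emotion is in scope (e.g. run before demo.launch()); test_tone.wav is just a made-up name:

import numpy as np
import soundfile as sf

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
sf.write('test_tone.wav', 0.1 * np.sin(2 * np.pi * 440 * t), sr)  # 1 s, 440 Hz tone

scores = predict_emotion('test_tone.wav')
print(max(scores, key=scores.get), scores)  # top label plus all seven probabilities

A pure tone carries no real emotional content; the point is only to confirm that the MFCC → tensor → softmax path returns a well-formed probability dictionary rather than the uniform fallback from the except branch.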