THP2903 committed on
Commit
df52549
·
verified ·
1 Parent(s): 561f006

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -96
app.py CHANGED
@@ -5,91 +5,91 @@ import cv2
5
  import os
6
  import numpy as np
7
  import tensorflow as tf
8
-
9
- emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
10
-
11
- def trained_model(model_path):
12
-
13
- input_visual = tf.keras.Input((120, 120, 3, 10), name="input_visual") # 90 - 120
14
- input_audio_cnn = tf.keras.Input((150, 512, 1), name="input_audio_cnn")
15
- input_audio_wave = tf.keras.Input((20, 13077), name="input_audio_wave")
16
-
17
- # Visual branch
18
- x_v = tf.keras.layers.Conv3D(10, (3, 3, 3), strides=(2, 2, 1), padding='same')(input_visual)
19
- x_v = tf.keras.layers.BatchNormalization()(x_v)
20
- x_v = tf.keras.layers.ReLU()(x_v)
21
- x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
22
-
23
- x_v = tf.keras.layers.Conv3D(40, (3, 3, 3), strides=(2, 2, 1), padding='same')(x_v)
24
- x_v = tf.keras.layers.BatchNormalization()(x_v)
25
- x_v = tf.keras.layers.ReLU()(x_v)
26
- x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
27
-
28
- x_v = tf.keras.layers.Flatten()(x_v)
29
-
30
- x_v = tf.keras.layers.Dropout(0.2)(x_v)
31
- x_v = tf.keras.layers.Dense(500)(x_v)
32
- x_v = tf.keras.layers.BatchNormalization()(x_v)
33
- x_v = tf.keras.layers.ReLU()(x_v)
34
-
35
- # Audio cnn branch
36
- x_c = tf.keras.layers.Conv2D(5, (3, 3), strides=(2, 2), padding='same')(input_audio_cnn)
37
- x_c = tf.keras.layers.BatchNormalization()(x_c)
38
- x_c = tf.keras.layers.ReLU()(x_c)
39
- x_c = tf.keras.layers.MaxPooling2D((3, 3))(x_c)
40
-
41
- x_c = tf.keras.layers.Conv2D(30, (3, 3), strides=(2, 2), padding='same')(x_c)
42
- x_c = tf.keras.layers.BatchNormalization()(x_c)
43
- x_c = tf.keras.layers.ReLU()(x_c)
44
- x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
45
-
46
- x_c = tf.keras.layers.Conv2D(100, (3, 3), strides=(1, 1), padding='same')(x_c)
47
- x_c = tf.keras.layers.BatchNormalization()(x_c)
48
- x_c = tf.keras.layers.ReLU()(x_c)
49
- x_c = tf.keras.layers.Conv2D(200, (3, 3), strides=(1, 1), padding='same')(x_c)
50
- x_c = tf.keras.layers.BatchNormalization()(x_c)
51
- x_c = tf.keras.layers.ReLU()(x_c)
52
- x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
53
-
54
- x_c = tf.keras.layers.Flatten()(x_c)
55
-
56
- x_c = tf.keras.layers.Dropout(0.2)(x_c)
57
- x_c = tf.keras.layers.Dense(500)(x_c)
58
- x_c = tf.keras.layers.BatchNormalization()(x_c)
59
- x_c = tf.keras.layers.ReLU()(x_c)
60
-
61
- # Audio wave branch
62
- x_w = tf.keras.layers.LSTM(500)(input_audio_wave)
63
- x_w = tf.keras.layers.RepeatVector(20)(x_w)
64
- x_w = tf.keras.layers.LSTM(500)(x_w)
65
-
66
- x_w = tf.keras.layers.Flatten()(x_w)
67
-
68
- x_w = tf.keras.layers.Dropout(0.2)(x_w)
69
- x_w = tf.keras.layers.Dense(500)(x_w)
70
- x_w = tf.keras.layers.BatchNormalization()(x_w)
71
- x_w = tf.keras.layers.ReLU()(x_w)
72
-
73
- # Audio fusion
74
- x_a = x_c + x_w
75
- x_a = tf.keras.layers.Dense(500)(x_a)
76
- x_a = tf.keras.layers.BatchNormalization()(x_a)
77
- x_a = tf.keras.layers.ReLU()(x_a)
78
-
79
- # Fusion
80
- x = x_a + x_v
81
- x = tf.keras.layers.Dense(500)(x)
82
- x = tf.keras.layers.BatchNormalization()(x)
83
- x = tf.keras.layers.ReLU()(x)
84
-
85
- # Output
86
- x = tf.keras.layers.Dropout(0.1)(x)
87
- x = tf.keras.layers.Dense(6, activation='softmax', name='output_classification')(x) # 8 - 6
88
 
89
 
90
- model = model.load(model_path)
91
 
92
- return model
93
 
94
  def process_video_audio(video_path, audio_path):
95
 
@@ -136,28 +136,66 @@ def process_video_audio(video_path, audio_path):
136
  train_audio_cnn = tf.convert_to_tensor(train_audio_cnn, dtype=tf.float16)
137
 
138
  return train_visual, train_audio_wave, train_audio_cnn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
 
 
 
 
 
140
  # Định nghĩa giao diện Gradio
141
- def gradio_interface(video, audio):
 
 
 
 
 
 
142
 
143
- train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video, audio)
144
- model = trained_model("./model_vui_ve.h5")
145
- output = model.predict({"input_visual": train_visual, "input_audio_cnn": train_audio_cnn, "input_audio_wave": train_audio_wave})
146
- emo_index = tf.math.argmax(output)
147
 
148
- return emotion_labels[emo_index]
149
 
150
  iface = gr.Interface(
151
- fn=gradio_interface,
152
  inputs=[
153
- gr.Video(),
154
- gr.Audio()
155
  ],
156
- outputs=[
157
- gr.Text()
158
- ],
159
- live=True,
160
- title="Video and Audio Processing with Emotion Recognition"
161
  )
162
 
163
  iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import os
6
  import numpy as np
7
  import tensorflow as tf
8
# Load the saved Keras model once at import time so every prediction reuses it.
# The call is fully qualified through `tf` because that is the TensorFlow name
# this file imports; a bare `load_model` is not brought into scope by it.
model = tf.keras.models.load_model("./model_vui_ve.h5")
9
+ # emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
10
+
11
+ # def trained_model(model_path):
12
+
13
+ # input_visual = tf.keras.Input((120, 120, 3, 10), name="input_visual") # 90 - 120
14
+ # input_audio_cnn = tf.keras.Input((150, 512, 1), name="input_audio_cnn")
15
+ # input_audio_wave = tf.keras.Input((20, 13077), name="input_audio_wave")
16
+
17
+ # # Visual branch
18
+ # x_v = tf.keras.layers.Conv3D(10, (3, 3, 3), strides=(2, 2, 1), padding='same')(input_visual)
19
+ # x_v = tf.keras.layers.BatchNormalization()(x_v)
20
+ # x_v = tf.keras.layers.ReLU()(x_v)
21
+ # x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
22
+
23
+ # x_v = tf.keras.layers.Conv3D(40, (3, 3, 3), strides=(2, 2, 1), padding='same')(x_v)
24
+ # x_v = tf.keras.layers.BatchNormalization()(x_v)
25
+ # x_v = tf.keras.layers.ReLU()(x_v)
26
+ # x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
27
+
28
+ # x_v = tf.keras.layers.Flatten()(x_v)
29
+
30
+ # x_v = tf.keras.layers.Dropout(0.2)(x_v)
31
+ # x_v = tf.keras.layers.Dense(500)(x_v)
32
+ # x_v = tf.keras.layers.BatchNormalization()(x_v)
33
+ # x_v = tf.keras.layers.ReLU()(x_v)
34
+
35
+ # # Audio cnn branch
36
+ # x_c = tf.keras.layers.Conv2D(5, (3, 3), strides=(2, 2), padding='same')(input_audio_cnn)
37
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
38
+ # x_c = tf.keras.layers.ReLU()(x_c)
39
+ # x_c = tf.keras.layers.MaxPooling2D((3, 3))(x_c)
40
+
41
+ # x_c = tf.keras.layers.Conv2D(30, (3, 3), strides=(2, 2), padding='same')(x_c)
42
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
43
+ # x_c = tf.keras.layers.ReLU()(x_c)
44
+ # x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
45
+
46
+ # x_c = tf.keras.layers.Conv2D(100, (3, 3), strides=(1, 1), padding='same')(x_c)
47
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
48
+ # x_c = tf.keras.layers.ReLU()(x_c)
49
+ # x_c = tf.keras.layers.Conv2D(200, (3, 3), strides=(1, 1), padding='same')(x_c)
50
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
51
+ # x_c = tf.keras.layers.ReLU()(x_c)
52
+ # x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
53
+
54
+ # x_c = tf.keras.layers.Flatten()(x_c)
55
+
56
+ # x_c = tf.keras.layers.Dropout(0.2)(x_c)
57
+ # x_c = tf.keras.layers.Dense(500)(x_c)
58
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
59
+ # x_c = tf.keras.layers.ReLU()(x_c)
60
+
61
+ # # Audio wave branch
62
+ # x_w = tf.keras.layers.LSTM(500)(input_audio_wave)
63
+ # x_w = tf.keras.layers.RepeatVector(20)(x_w)
64
+ # x_w = tf.keras.layers.LSTM(500)(x_w)
65
+
66
+ # x_w = tf.keras.layers.Flatten()(x_w)
67
+
68
+ # x_w = tf.keras.layers.Dropout(0.2)(x_w)
69
+ # x_w = tf.keras.layers.Dense(500)(x_w)
70
+ # x_w = tf.keras.layers.BatchNormalization()(x_w)
71
+ # x_w = tf.keras.layers.ReLU()(x_w)
72
+
73
+ # # Audio fusion
74
+ # x_a = x_c + x_w
75
+ # x_a = tf.keras.layers.Dense(500)(x_a)
76
+ # x_a = tf.keras.layers.BatchNormalization()(x_a)
77
+ # x_a = tf.keras.layers.ReLU()(x_a)
78
+
79
+ # # Fusion
80
+ # x = x_a + x_v
81
+ # x = tf.keras.layers.Dense(500)(x)
82
+ # x = tf.keras.layers.BatchNormalization()(x)
83
+ # x = tf.keras.layers.ReLU()(x)
84
+
85
+ # # Output
86
+ # x = tf.keras.layers.Dropout(0.1)(x)
87
+ # x = tf.keras.layers.Dense(6, activation='softmax', name='output_classification')(x) # 8 - 6
88
 
89
 
90
+ # model = model.load(model_path)
91
 
92
+ # return model
93
 
94
  def process_video_audio(video_path, audio_path):
95
 
 
136
  train_audio_cnn = tf.convert_to_tensor(train_audio_cnn, dtype=tf.float16)
137
 
138
  return train_visual, train_audio_wave, train_audio_cnn
139
+
140
def predict_emotion(video_path, audio_path):
    """Run the module-level model on one video/audio pair.

    Args:
        video_path: Forwarded unchanged to ``process_video_audio``.
        audio_path: Forwarded unchanged to ``process_video_audio``.

    Returns:
        The value of ``np.argmax`` over the model's prediction array. No axis
        is given, so this is an index into the flattened predictions.
        NOTE(review): that equals the class index only if the model returns a
        single row -- confirm against ``process_video_audio``.
    """
    visual, audio_wave, audio_cnn = process_video_audio(video_path, audio_path)

    # The model is fed by input-layer name, so each tensor is keyed explicitly.
    model_inputs = {
        "input_visual": visual,
        "input_audio_cnn": audio_cnn,
        "input_audio_wave": audio_wave,
    }
    return np.argmax(model.predict(model_inputs))
152
+
153
 
154
+
155
# Maps the model's predicted class index to a human-readable emotion label.
# No prediction is run at import time: there is no video/audio input available
# at module level, so inference happens only inside the Gradio callback.
emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
159
  # Định nghĩa giao diện Gradio
160
+
161
def predict_emotion_gradio(video, audio):
    """Gradio callback: predict a class index and return its emotion label.

    Args:
        video: Value supplied by the Gradio video input; forwarded to
            ``predict_emotion``.
        audio: Value supplied by the Gradio audio input; forwarded to
            ``predict_emotion``.

    Returns:
        The ``emotion_dict`` entry for the predicted class index.
    """
    return emotion_dict[predict_emotion(video, audio)]
165
+
166
+ # def gradio_interface(video, audio):
167
 
168
+ # train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video, audio)
169
+ # model = trained_model("./model_vui_ve.h5")
170
+ # output = model.predict({"input_visual": train_visual, "input_audio_cnn": train_audio_cnn, "input_audio_wave": train_audio_wave})
171
+ # emo_index = tf.math.argmax(output)
172
 
173
+ # return emotion_labels[emo_index]
174
 
175
# Build the Gradio UI: one video and one audio input, one text output.
iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[
        gr.Video(label="Upload a video"),
        # The callback forwards this value as `audio_path`, so ask Gradio for
        # a file path instead of its default (sample_rate, ndarray) tuple.
        gr.Audio(label="Upload a audio", type="filepath"),
    ],
    outputs=gr.Textbox(label="Predicted Emotion"),
    title="Emotion Recognition from Video",
    description="Upload a video and get the predicted emotion."
)

iface.launch()
187
+
188
+ # iface = gr.Interface(
189
+ # fn=gradio_interface,
190
+ # inputs=[
191
+ # gr.Video(),
192
+ # gr.Audio()
193
+ # ],
194
+ # outputs=[
195
+ # gr.Text()
196
+ # ],
197
+ # live=True,
198
+ # title="Video and Audio Processing with Emotion Recognition"
199
+ # )
200
+
201
+ # iface.launch()