THP2903 committed on
Commit
a817465
·
verified ·
1 Parent(s): 1fd3c84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -128
app.py CHANGED
@@ -9,90 +9,89 @@ import tensorflow as tf
9
  from tensorflow.keras.models import load_model
10
 
11
 
12
- # emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
13
-
14
- # def trained_model(model_path):
15
-
16
- # input_visual = tf.keras.Input((120, 120, 3, 10), name="input_visual") # 90 - 120
17
- # input_audio_cnn = tf.keras.Input((150, 512, 1), name="input_audio_cnn")
18
- # input_audio_wave = tf.keras.Input((20, 13077), name="input_audio_wave")
19
-
20
- # # Visual branch
21
- # x_v = tf.keras.layers.Conv3D(10, (3, 3, 3), strides=(2, 2, 1), padding='same')(input_visual)
22
- # x_v = tf.keras.layers.BatchNormalization()(x_v)
23
- # x_v = tf.keras.layers.ReLU()(x_v)
24
- # x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
25
-
26
- # x_v = tf.keras.layers.Conv3D(40, (3, 3, 3), strides=(2, 2, 1), padding='same')(x_v)
27
- # x_v = tf.keras.layers.BatchNormalization()(x_v)
28
- # x_v = tf.keras.layers.ReLU()(x_v)
29
- # x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
30
-
31
- # x_v = tf.keras.layers.Flatten()(x_v)
32
-
33
- # x_v = tf.keras.layers.Dropout(0.2)(x_v)
34
- # x_v = tf.keras.layers.Dense(500)(x_v)
35
- # x_v = tf.keras.layers.BatchNormalization()(x_v)
36
- # x_v = tf.keras.layers.ReLU()(x_v)
37
-
38
- # # Audio cnn branch
39
- # x_c = tf.keras.layers.Conv2D(5, (3, 3), strides=(2, 2), padding='same')(input_audio_cnn)
40
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
41
- # x_c = tf.keras.layers.ReLU()(x_c)
42
- # x_c = tf.keras.layers.MaxPooling2D((3, 3))(x_c)
43
-
44
- # x_c = tf.keras.layers.Conv2D(30, (3, 3), strides=(2, 2), padding='same')(x_c)
45
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
46
- # x_c = tf.keras.layers.ReLU()(x_c)
47
- # x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
48
-
49
- # x_c = tf.keras.layers.Conv2D(100, (3, 3), strides=(1, 1), padding='same')(x_c)
50
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
51
- # x_c = tf.keras.layers.ReLU()(x_c)
52
- # x_c = tf.keras.layers.Conv2D(200, (3, 3), strides=(1, 1), padding='same')(x_c)
53
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
54
- # x_c = tf.keras.layers.ReLU()(x_c)
55
- # x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
56
-
57
- # x_c = tf.keras.layers.Flatten()(x_c)
58
-
59
- # x_c = tf.keras.layers.Dropout(0.2)(x_c)
60
- # x_c = tf.keras.layers.Dense(500)(x_c)
61
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
62
- # x_c = tf.keras.layers.ReLU()(x_c)
63
-
64
- # # Audio wave branch
65
- # x_w = tf.keras.layers.LSTM(500)(input_audio_wave)
66
- # x_w = tf.keras.layers.RepeatVector(20)(x_w)
67
- # x_w = tf.keras.layers.LSTM(500)(x_w)
68
-
69
- # x_w = tf.keras.layers.Flatten()(x_w)
70
-
71
- # x_w = tf.keras.layers.Dropout(0.2)(x_w)
72
- # x_w = tf.keras.layers.Dense(500)(x_w)
73
- # x_w = tf.keras.layers.BatchNormalization()(x_w)
74
- # x_w = tf.keras.layers.ReLU()(x_w)
75
-
76
- # # Audio fusion
77
- # x_a = x_c + x_w
78
- # x_a = tf.keras.layers.Dense(500)(x_a)
79
- # x_a = tf.keras.layers.BatchNormalization()(x_a)
80
- # x_a = tf.keras.layers.ReLU()(x_a)
81
-
82
- # # Fusion
83
- # x = x_a + x_v
84
- # x = tf.keras.layers.Dense(500)(x)
85
- # x = tf.keras.layers.BatchNormalization()(x)
86
- # x = tf.keras.layers.ReLU()(x)
87
-
88
- # # Output
89
- # x = tf.keras.layers.Dropout(0.1)(x)
90
- # x = tf.keras.layers.Dense(6, activation='softmax', name='output_classification')(x) # 8 - 6
91
 
92
 
93
- # model = model.load(model_path)
94
 
95
- # return model
96
 
97
  def process_video_audio(video_path, audio_path):
98
 
@@ -140,60 +139,60 @@ def process_video_audio(video_path, audio_path):
140
 
141
  return train_visual, train_audio_wave, train_audio_cnn
142
 
143
- def predict_emotion(video_path, audio_path):
144
- train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
145
 
146
- model = load_model("./model_vui_ve.h5")
147
- predictions = model.predict({
148
- "input_visual": train_visual,
149
- "input_audio_cnn": train_audio_cnn,
150
- "input_audio_wave": train_audio_wave
151
- })
152
 
153
- predicted_label = np.argmax(predictions)
154
- return predicted_label
155
-
156
- # Định nghĩa giao diện Gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- def predict_emotion_gradio(video, audio):
159
- emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
160
- predicted_label = predict_emotion(video, audio)
161
- predicted_emotion = emotion_dict[predicted_label]
162
- return predicted_emotion
 
 
 
 
 
 
163
 
164
- # def gradio_interface(video, audio):
165
-
166
- # train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video, audio)
167
- # model = trained_model("./model_vui_ve.h5")
168
- # output = model.predict({"input_visual": train_visual, "input_audio_cnn": train_audio_cnn, "input_audio_wave": train_audio_wave})
169
- # emo_index = tf.math.argmax(output)
170
-
171
- # return emotion_labels[emo_index]
172
 
173
  iface = gr.Interface(
174
- fn=predict_emotion_gradio,
175
  inputs=[
176
- gr.Video(label="Upload a video"
177
- ),
178
- gr.Audio(label="Upload a audio")
179
  ],
180
- outputs=gr.Textbox(label="Predicted Emotion"),
181
- title="Emotion Recognition from Video",
182
- description="Upload a video and get the predicted emotion."
 
 
183
  )
184
-
185
  iface.launch()
186
-
187
- # iface = gr.Interface(
188
- # fn=gradio_interface,
189
- # inputs=[
190
- # gr.Video(),
191
- # gr.Audio()
192
- # ],
193
- # outputs=[
194
- # gr.Text()
195
- # ],
196
- # live=True,
197
- # title="Video and Audio Processing with Emotion Recognition"
198
- # )
199
- # iface.launch()
 
9
  from tensorflow.keras.models import load_model
10
 
11
 
12
+
13
def trained_model(model_path):
    """Build the three-branch emotion-recognition network and load its trained
    weights from *model_path*.

    Branches:
      * visual        — Conv3D stack over (120, 120, 3, 10) frame volumes
      * audio (CNN)   — Conv2D stack over (150, 512, 1) spectrogram-like input
      * audio (wave)  — stacked LSTMs over (20, 13077) waveform windows

    The two audio branches are fused by element-wise addition, then fused with
    the visual branch the same way, ending in a 6-way softmax classifier.

    Returns the tf.keras.Model ready for inference.
    """
    input_visual = tf.keras.Input((120, 120, 3, 10), name="input_visual")  # 90 - 120
    input_audio_cnn = tf.keras.Input((150, 512, 1), name="input_audio_cnn")
    input_audio_wave = tf.keras.Input((20, 13077), name="input_audio_wave")

    # Visual branch
    x_v = tf.keras.layers.Conv3D(10, (3, 3, 3), strides=(2, 2, 1), padding='same')(input_visual)
    x_v = tf.keras.layers.BatchNormalization()(x_v)
    x_v = tf.keras.layers.ReLU()(x_v)
    x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)

    x_v = tf.keras.layers.Conv3D(40, (3, 3, 3), strides=(2, 2, 1), padding='same')(x_v)
    x_v = tf.keras.layers.BatchNormalization()(x_v)
    x_v = tf.keras.layers.ReLU()(x_v)
    x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)

    x_v = tf.keras.layers.Flatten()(x_v)

    x_v = tf.keras.layers.Dropout(0.2)(x_v)
    x_v = tf.keras.layers.Dense(500)(x_v)
    x_v = tf.keras.layers.BatchNormalization()(x_v)
    x_v = tf.keras.layers.ReLU()(x_v)

    # Audio cnn branch
    x_c = tf.keras.layers.Conv2D(5, (3, 3), strides=(2, 2), padding='same')(input_audio_cnn)
    x_c = tf.keras.layers.BatchNormalization()(x_c)
    x_c = tf.keras.layers.ReLU()(x_c)
    x_c = tf.keras.layers.MaxPooling2D((3, 3))(x_c)

    x_c = tf.keras.layers.Conv2D(30, (3, 3), strides=(2, 2), padding='same')(x_c)
    x_c = tf.keras.layers.BatchNormalization()(x_c)
    x_c = tf.keras.layers.ReLU()(x_c)
    x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)

    x_c = tf.keras.layers.Conv2D(100, (3, 3), strides=(1, 1), padding='same')(x_c)
    x_c = tf.keras.layers.BatchNormalization()(x_c)
    x_c = tf.keras.layers.ReLU()(x_c)
    x_c = tf.keras.layers.Conv2D(200, (3, 3), strides=(1, 1), padding='same')(x_c)
    x_c = tf.keras.layers.BatchNormalization()(x_c)
    x_c = tf.keras.layers.ReLU()(x_c)
    x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)

    x_c = tf.keras.layers.Flatten()(x_c)

    x_c = tf.keras.layers.Dropout(0.2)(x_c)
    x_c = tf.keras.layers.Dense(500)(x_c)
    x_c = tf.keras.layers.BatchNormalization()(x_c)
    x_c = tf.keras.layers.ReLU()(x_c)

    # Audio wave branch
    x_w = tf.keras.layers.LSTM(500)(input_audio_wave)
    x_w = tf.keras.layers.RepeatVector(20)(x_w)
    x_w = tf.keras.layers.LSTM(500)(x_w)

    x_w = tf.keras.layers.Flatten()(x_w)

    x_w = tf.keras.layers.Dropout(0.2)(x_w)
    x_w = tf.keras.layers.Dense(500)(x_w)
    x_w = tf.keras.layers.BatchNormalization()(x_w)
    x_w = tf.keras.layers.ReLU()(x_w)

    # Audio fusion (element-wise addition of the two 500-dim audio embeddings)
    x_a = x_c + x_w
    x_a = tf.keras.layers.Dense(500)(x_a)
    x_a = tf.keras.layers.BatchNormalization()(x_a)
    x_a = tf.keras.layers.ReLU()(x_a)

    # Fusion (audio embedding + visual embedding)
    x = x_a + x_v
    x = tf.keras.layers.Dense(500)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.ReLU()(x)

    # Output
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(6, activation='softmax', name='output_classification')(x)  # 8 - 6

    # BUG FIX: the original did `model = model.load(model_path)`, but `model`
    # was never defined at that point (NameError) and Keras models have no
    # `.load()` method. Build the Model from the graph above and load the
    # trained weights into it instead.
    model = tf.keras.Model(
        inputs=[input_visual, input_audio_cnn, input_audio_wave],
        outputs=x,
    )
    # NOTE(review): assumes model_path points to weights saved from this exact
    # architecture (e.g. "./model_vui_ve.h5") — confirm against the checkpoint.
    model.load_weights(model_path)

    return model
95
 
96
  def process_video_audio(video_path, audio_path):
97
 
 
139
 
140
  return train_visual, train_audio_wave, train_audio_cnn
141
 
142
+ # def predict_emotion(video_path, audio_path):
143
+ # train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
144
 
145
+ # model = load_model("./model_vui_ve.h5")
146
+ # predictions = model.predict({
147
+ # "input_visual": train_visual,
148
+ # "input_audio_cnn": train_audio_cnn,
149
+ # "input_audio_wave": train_audio_wave
150
+ # })
151
 
152
+ # predicted_label = np.argmax(predictions)
153
+ # return predicted_label
154
+
155
+ # # Định nghĩa giao diện Gradio
156
+
157
+ # def predict_emotion_gradio(video, audio):
158
+ # emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
159
+ # predicted_label = predict_emotion(video, audio)
160
+ # predicted_emotion = emotion_dict[predicted_label]
161
+ # return predicted_emotion
162
+
163
def gradio_interface(video, audio):
    """Predict the emotion shown in a (video, audio) pair.

    Args:
        video: path to the uploaded video file (from the Gradio Video widget).
        audio: path to the uploaded audio file (from the Gradio Audio widget).

    Returns:
        The predicted emotion label as a string.
    """
    emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
    train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video, audio)
    model = trained_model("./model_vui_ve.h5")
    output = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave,
    })
    # BUG FIX: `tf.math.argmax(output)` reduces over axis 0 (the batch axis of
    # the (1, 6) prediction) and returns a Tensor, which is not a valid key
    # for the plain-int `emotion_labels` dict. Take the argmax over the class
    # scores of the single sample and convert it to a Python int.
    emo_index = int(tf.math.argmax(output[0]))

    return emotion_labels[emo_index]
171
 
172
+ # iface = gr.Interface(
173
+ # fn=predict_emotion_gradio,
174
+ # inputs=[
175
+ # gr.Video(label="Upload a video"
176
+ # ),
177
+ # gr.Audio(label="Upload a audio")
178
+ # ],
179
+ # outputs=gr.Textbox(label="Predicted Emotion"),
180
+ # title="Emotion Recognition from Video",
181
+ # description="Upload a video and get the predicted emotion."
182
+ # )
183
 
184
+ # iface.launch()
 
 
 
 
 
 
 
185
 
186
# Wire the end-to-end pipeline into a simple Gradio UI: a video upload and an
# audio upload in, the predicted emotion label out. `live=True` re-runs the
# prediction automatically whenever an input changes.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Video(),
        gr.Audio(),
    ],
    outputs=[
        gr.Text(),
    ],
    live=True,
    title="Video and Audio Processing with Emotion Recognition",
)
iface.launch()