THP2903 committed on
Commit
6cd4025
·
verified ·
1 Parent(s): a817465

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -127
app.py CHANGED
@@ -10,88 +10,88 @@ from tensorflow.keras.models import load_model
10
 
11
 
12
 
13
- def trained_model(model_path):
14
-
15
- input_visual = tf.keras.Input((120, 120, 3, 10), name="input_visual") # 90 - 120
16
- input_audio_cnn = tf.keras.Input((150, 512, 1), name="input_audio_cnn")
17
- input_audio_wave = tf.keras.Input((20, 13077), name="input_audio_wave")
18
-
19
- # Visual branch
20
- x_v = tf.keras.layers.Conv3D(10, (3, 3, 3), strides=(2, 2, 1), padding='same')(input_visual)
21
- x_v = tf.keras.layers.BatchNormalization()(x_v)
22
- x_v = tf.keras.layers.ReLU()(x_v)
23
- x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
24
-
25
- x_v = tf.keras.layers.Conv3D(40, (3, 3, 3), strides=(2, 2, 1), padding='same')(x_v)
26
- x_v = tf.keras.layers.BatchNormalization()(x_v)
27
- x_v = tf.keras.layers.ReLU()(x_v)
28
- x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
29
-
30
- x_v = tf.keras.layers.Flatten()(x_v)
31
-
32
- x_v = tf.keras.layers.Dropout(0.2)(x_v)
33
- x_v = tf.keras.layers.Dense(500)(x_v)
34
- x_v = tf.keras.layers.BatchNormalization()(x_v)
35
- x_v = tf.keras.layers.ReLU()(x_v)
36
-
37
- # Audio cnn branch
38
- x_c = tf.keras.layers.Conv2D(5, (3, 3), strides=(2, 2), padding='same')(input_audio_cnn)
39
- x_c = tf.keras.layers.BatchNormalization()(x_c)
40
- x_c = tf.keras.layers.ReLU()(x_c)
41
- x_c = tf.keras.layers.MaxPooling2D((3, 3))(x_c)
42
-
43
- x_c = tf.keras.layers.Conv2D(30, (3, 3), strides=(2, 2), padding='same')(x_c)
44
- x_c = tf.keras.layers.BatchNormalization()(x_c)
45
- x_c = tf.keras.layers.ReLU()(x_c)
46
- x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
47
-
48
- x_c = tf.keras.layers.Conv2D(100, (3, 3), strides=(1, 1), padding='same')(x_c)
49
- x_c = tf.keras.layers.BatchNormalization()(x_c)
50
- x_c = tf.keras.layers.ReLU()(x_c)
51
- x_c = tf.keras.layers.Conv2D(200, (3, 3), strides=(1, 1), padding='same')(x_c)
52
- x_c = tf.keras.layers.BatchNormalization()(x_c)
53
- x_c = tf.keras.layers.ReLU()(x_c)
54
- x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
55
-
56
- x_c = tf.keras.layers.Flatten()(x_c)
57
-
58
- x_c = tf.keras.layers.Dropout(0.2)(x_c)
59
- x_c = tf.keras.layers.Dense(500)(x_c)
60
- x_c = tf.keras.layers.BatchNormalization()(x_c)
61
- x_c = tf.keras.layers.ReLU()(x_c)
62
-
63
- # Audio wave branch
64
- x_w = tf.keras.layers.LSTM(500)(input_audio_wave)
65
- x_w = tf.keras.layers.RepeatVector(20)(x_w)
66
- x_w = tf.keras.layers.LSTM(500)(x_w)
67
-
68
- x_w = tf.keras.layers.Flatten()(x_w)
69
-
70
- x_w = tf.keras.layers.Dropout(0.2)(x_w)
71
- x_w = tf.keras.layers.Dense(500)(x_w)
72
- x_w = tf.keras.layers.BatchNormalization()(x_w)
73
- x_w = tf.keras.layers.ReLU()(x_w)
74
-
75
- # Audio fusion
76
- x_a = x_c + x_w
77
- x_a = tf.keras.layers.Dense(500)(x_a)
78
- x_a = tf.keras.layers.BatchNormalization()(x_a)
79
- x_a = tf.keras.layers.ReLU()(x_a)
80
-
81
- # Fusion
82
- x = x_a + x_v
83
- x = tf.keras.layers.Dense(500)(x)
84
- x = tf.keras.layers.BatchNormalization()(x)
85
- x = tf.keras.layers.ReLU()(x)
86
-
87
- # Output
88
- x = tf.keras.layers.Dropout(0.1)(x)
89
- x = tf.keras.layers.Dense(6, activation='softmax', name='output_classification')(x) # 8 - 6
90
 
91
 
92
- model = model.load(model_path)
93
 
94
- return model
95
 
96
  def process_video_audio(video_path, audio_path):
97
 
@@ -139,60 +139,60 @@ def process_video_audio(video_path, audio_path):
139
 
140
  return train_visual, train_audio_wave, train_audio_cnn
141
 
142
- # def predict_emotion(video_path, audio_path):
143
- # train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
144
 
145
- # model = load_model("./model_vui_ve.h5")
146
- # predictions = model.predict({
147
- # "input_visual": train_visual,
148
- # "input_audio_cnn": train_audio_cnn,
149
- # "input_audio_wave": train_audio_wave
150
- # })
151
 
152
- # predicted_label = np.argmax(predictions)
153
- # return predicted_label
154
-
155
- # # Định nghĩa giao diện Gradio
156
-
157
- # def predict_emotion_gradio(video, audio):
158
- # emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
159
- # predicted_label = predict_emotion(video, audio)
160
- # predicted_emotion = emotion_dict[predicted_label]
161
- # return predicted_emotion
162
-
163
- def gradio_interface(video, audio):
164
- emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
165
- train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video, audio)
166
- model = trained_model("./model_vui_ve.h5")
167
- output = model.predict({"input_visual": train_visual, "input_audio_cnn": train_audio_cnn, "input_audio_wave": train_audio_wave})
168
- emo_index = tf.math.argmax(output)
169
 
170
- return emotion_labels[emo_index]
171
-
172
- # iface = gr.Interface(
173
- # fn=predict_emotion_gradio,
174
- # inputs=[
175
- # gr.Video(label="Upload a video"
176
- # ),
177
- # gr.Audio(label="Upload a audio")
178
- # ],
179
- # outputs=gr.Textbox(label="Predicted Emotion"),
180
- # title="Emotion Recognition from Video",
181
- # description="Upload a video and get the predicted emotion."
182
- # )
183
-
184
- # iface.launch()
185
 
186
  iface = gr.Interface(
187
- fn=gradio_interface,
188
  inputs=[
189
- gr.Video(),
190
- gr.Audio()
191
- ],
192
- outputs=[
193
- gr.Text()
194
  ],
195
- live=True,
196
- title="Video and Audio Processing with Emotion Recognition"
 
197
  )
 
198
  iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
 
13
+ # def trained_model(model_path):
14
+
15
+ # input_visual = tf.keras.Input((120, 120, 3, 10), name="input_visual") # 90 - 120
16
+ # input_audio_cnn = tf.keras.Input((150, 512, 1), name="input_audio_cnn")
17
+ # input_audio_wave = tf.keras.Input((20, 13077), name="input_audio_wave")
18
+
19
+ # # Visual branch
20
+ # x_v = tf.keras.layers.Conv3D(10, (3, 3, 3), strides=(2, 2, 1), padding='same')(input_visual)
21
+ # x_v = tf.keras.layers.BatchNormalization()(x_v)
22
+ # x_v = tf.keras.layers.ReLU()(x_v)
23
+ # x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
24
+
25
+ # x_v = tf.keras.layers.Conv3D(40, (3, 3, 3), strides=(2, 2, 1), padding='same')(x_v)
26
+ # x_v = tf.keras.layers.BatchNormalization()(x_v)
27
+ # x_v = tf.keras.layers.ReLU()(x_v)
28
+ # x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
29
+
30
+ # x_v = tf.keras.layers.Flatten()(x_v)
31
+
32
+ # x_v = tf.keras.layers.Dropout(0.2)(x_v)
33
+ # x_v = tf.keras.layers.Dense(500)(x_v)
34
+ # x_v = tf.keras.layers.BatchNormalization()(x_v)
35
+ # x_v = tf.keras.layers.ReLU()(x_v)
36
+
37
+ # # Audio cnn branch
38
+ # x_c = tf.keras.layers.Conv2D(5, (3, 3), strides=(2, 2), padding='same')(input_audio_cnn)
39
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
40
+ # x_c = tf.keras.layers.ReLU()(x_c)
41
+ # x_c = tf.keras.layers.MaxPooling2D((3, 3))(x_c)
42
+
43
+ # x_c = tf.keras.layers.Conv2D(30, (3, 3), strides=(2, 2), padding='same')(x_c)
44
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
45
+ # x_c = tf.keras.layers.ReLU()(x_c)
46
+ # x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
47
+
48
+ # x_c = tf.keras.layers.Conv2D(100, (3, 3), strides=(1, 1), padding='same')(x_c)
49
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
50
+ # x_c = tf.keras.layers.ReLU()(x_c)
51
+ # x_c = tf.keras.layers.Conv2D(200, (3, 3), strides=(1, 1), padding='same')(x_c)
52
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
53
+ # x_c = tf.keras.layers.ReLU()(x_c)
54
+ # x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
55
+
56
+ # x_c = tf.keras.layers.Flatten()(x_c)
57
+
58
+ # x_c = tf.keras.layers.Dropout(0.2)(x_c)
59
+ # x_c = tf.keras.layers.Dense(500)(x_c)
60
+ # x_c = tf.keras.layers.BatchNormalization()(x_c)
61
+ # x_c = tf.keras.layers.ReLU()(x_c)
62
+
63
+ # # Audio wave branch
64
+ # x_w = tf.keras.layers.LSTM(500)(input_audio_wave)
65
+ # x_w = tf.keras.layers.RepeatVector(20)(x_w)
66
+ # x_w = tf.keras.layers.LSTM(500)(x_w)
67
+
68
+ # x_w = tf.keras.layers.Flatten()(x_w)
69
+
70
+ # x_w = tf.keras.layers.Dropout(0.2)(x_w)
71
+ # x_w = tf.keras.layers.Dense(500)(x_w)
72
+ # x_w = tf.keras.layers.BatchNormalization()(x_w)
73
+ # x_w = tf.keras.layers.ReLU()(x_w)
74
+
75
+ # # Audio fusion
76
+ # x_a = x_c + x_w
77
+ # x_a = tf.keras.layers.Dense(500)(x_a)
78
+ # x_a = tf.keras.layers.BatchNormalization()(x_a)
79
+ # x_a = tf.keras.layers.ReLU()(x_a)
80
+
81
+ # # Fusion
82
+ # x = x_a + x_v
83
+ # x = tf.keras.layers.Dense(500)(x)
84
+ # x = tf.keras.layers.BatchNormalization()(x)
85
+ # x = tf.keras.layers.ReLU()(x)
86
+
87
+ # # Output
88
+ # x = tf.keras.layers.Dropout(0.1)(x)
89
+ # x = tf.keras.layers.Dense(6, activation='softmax', name='output_classification')(x) # 8 - 6
90
 
91
 
92
+ # model = model.load(model_path)
93
 
94
+ # return model
95
 
96
  def process_video_audio(video_path, audio_path):
97
 
 
139
 
140
  return train_visual, train_audio_wave, train_audio_cnn
141
 
142
def predict_emotion(video_path, audio_path):
    """Predict the emotion class index for a video/audio pair.

    Args:
        video_path: Path to the input video file.
        audio_path: Path to the matching audio file.

    Returns:
        int: Index of the most probable emotion class
        (0..5, mapped to names by the caller).
    """
    train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)

    # Load the Keras model once and cache it on the function object:
    # reloading the .h5 file from disk on every request is the dominant
    # per-call cost and is wholly avoidable.
    if not hasattr(predict_emotion, "_model"):
        predict_emotion._model = load_model("model_vui_ve.h5")
    model = predict_emotion._model

    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave
    })

    # argmax over the softmax output; cast np.int64 -> int for clean lookups.
    predicted_label = int(np.argmax(predictions))
    return predicted_label
154
+
155
+ # Define the Gradio interface
156
+
157
def predict_emotion_gradio(video, audio):
    """Run the full pipeline on the uploaded files and return the emotion name.

    Args:
        video: Uploaded video file (as handed over by Gradio).
        audio: Uploaded audio file (as handed over by Gradio).

    Returns:
        str: Human-readable emotion label for the predicted class.
    """
    index_to_emotion = {
        0: 'neutral',
        1: 'calm',
        2: 'happy',
        3: 'sad',
        4: 'angry',
        5: 'fearful',
    }
    return index_to_emotion[predict_emotion(video, audio)]
162
+
163
+ # def gradio_interface(video, audio):
164
+ # emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
165
+ # train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video, audio)
166
+ # model = trained_model("./model_vui_ve.h5")
167
+ # output = model.predict({"input_visual": train_visual, "input_audio_cnn": train_audio_cnn, "input_audio_wave": train_audio_wave})
168
+ # emo_index = tf.math.argmax(output)
169
 
170
+ # return emotion_labels[emo_index]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
# Gradio UI: upload a video plus its audio track, get the predicted emotion back.
iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[
        gr.Video(label="Upload a video"),
        gr.Audio(label="Upload an audio"),  # fixed grammar: "a audio" -> "an audio"
    ],
    outputs=gr.Textbox(label="Predicted Emotion"),
    title="Emotion Recognition from Video",
    description="Upload a video and get the predicted emotion.",
)

iface.launch()
185
+
186
+ # iface = gr.Interface(
187
+ # fn=gradio_interface,
188
+ # inputs=[
189
+ # gr.Video(),
190
+ # gr.Audio()
191
+ # ],
192
+ # outputs=[
193
+ # gr.Text()
194
+ # ],
195
+ # live=True,
196
+ # title="Video and Audio Processing with Emotion Recognition"
197
+ # )
198
+ # iface.launch()