THP2903 committed on
Commit
f0d3073
·
verified ·
1 Parent(s): 2f39b71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -115
app.py CHANGED
@@ -8,91 +8,7 @@ import numpy as np
8
  import tensorflow as tf
9
  from tensorflow.keras.models import load_model
10
 
11
- # def trained_model(model_path):
12
-
13
- # input_visual = tf.keras.Input((120, 120, 3, 10), name="input_visual") # 90 - 120
14
- # input_audio_cnn = tf.keras.Input((150, 512, 1), name="input_audio_cnn")
15
- # input_audio_wave = tf.keras.Input((20, 13077), name="input_audio_wave")
16
-
17
- # # Visual branch
18
- # x_v = tf.keras.layers.Conv3D(10, (3, 3, 3), strides=(2, 2, 1), padding='same')(input_visual)
19
- # x_v = tf.keras.layers.BatchNormalization()(x_v)
20
- # x_v = tf.keras.layers.ReLU()(x_v)
21
- # x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
22
-
23
- # x_v = tf.keras.layers.Conv3D(40, (3, 3, 3), strides=(2, 2, 1), padding='same')(x_v)
24
- # x_v = tf.keras.layers.BatchNormalization()(x_v)
25
- # x_v = tf.keras.layers.ReLU()(x_v)
26
- # x_v = tf.keras.layers.MaxPooling3D((3, 3, 1))(x_v)
27
-
28
- # x_v = tf.keras.layers.Flatten()(x_v)
29
-
30
- # x_v = tf.keras.layers.Dropout(0.2)(x_v)
31
- # x_v = tf.keras.layers.Dense(500)(x_v)
32
- # x_v = tf.keras.layers.BatchNormalization()(x_v)
33
- # x_v = tf.keras.layers.ReLU()(x_v)
34
-
35
- # # Audio cnn branch
36
- # x_c = tf.keras.layers.Conv2D(5, (3, 3), strides=(2, 2), padding='same')(input_audio_cnn)
37
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
38
- # x_c = tf.keras.layers.ReLU()(x_c)
39
- # x_c = tf.keras.layers.MaxPooling2D((3, 3))(x_c)
40
-
41
- # x_c = tf.keras.layers.Conv2D(30, (3, 3), strides=(2, 2), padding='same')(x_c)
42
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
43
- # x_c = tf.keras.layers.ReLU()(x_c)
44
- # x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
45
-
46
- # x_c = tf.keras.layers.Conv2D(100, (3, 3), strides=(1, 1), padding='same')(x_c)
47
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
48
- # x_c = tf.keras.layers.ReLU()(x_c)
49
- # x_c = tf.keras.layers.Conv2D(200, (3, 3), strides=(1, 1), padding='same')(x_c)
50
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
51
- # x_c = tf.keras.layers.ReLU()(x_c)
52
- # x_c = tf.keras.layers.MaxPooling2D((2, 2))(x_c)
53
-
54
- # x_c = tf.keras.layers.Flatten()(x_c)
55
-
56
- # x_c = tf.keras.layers.Dropout(0.2)(x_c)
57
- # x_c = tf.keras.layers.Dense(500)(x_c)
58
- # x_c = tf.keras.layers.BatchNormalization()(x_c)
59
- # x_c = tf.keras.layers.ReLU()(x_c)
60
-
61
- # # Audio wave branch
62
- # x_w = tf.keras.layers.LSTM(500)(input_audio_wave)
63
- # x_w = tf.keras.layers.RepeatVector(20)(x_w)
64
- # x_w = tf.keras.layers.LSTM(500)(x_w)
65
-
66
- # x_w = tf.keras.layers.Flatten()(x_w)
67
-
68
- # x_w = tf.keras.layers.Dropout(0.2)(x_w)
69
- # x_w = tf.keras.layers.Dense(500)(x_w)
70
- # x_w = tf.keras.layers.BatchNormalization()(x_w)
71
- # x_w = tf.keras.layers.ReLU()(x_w)
72
-
73
- # # Audio fusion
74
- # x_a = x_c + x_w
75
- # x_a = tf.keras.layers.Dense(500)(x_a)
76
- # x_a = tf.keras.layers.BatchNormalization()(x_a)
77
- # x_a = tf.keras.layers.ReLU()(x_a)
78
-
79
- # # Fusion
80
- # x = x_a + x_v
81
- # x = tf.keras.layers.Dense(500)(x)
82
- # x = tf.keras.layers.BatchNormalization()(x)
83
- # x = tf.keras.layers.ReLU()(x)
84
-
85
- # # Output
86
- # x = tf.keras.layers.Dropout(0.1)(x)
87
- # x = tf.keras.layers.Dense(6, activation='softmax', name='output_classification')(x) # 8 - 6
88
-
89
-
90
- # model = model.load(model_path)
91
-
92
- # return model
93
-
94
  def process_video_audio(video_path, audio_path):
95
-
96
  wav = pt.tensor(list(audio_path[1]))
97
 
98
  train_visual = pt.zeros([1, 120, 120, 3, 10])
@@ -133,16 +49,16 @@ def process_video_audio(video_path, audio_path):
133
  frame_idx += 1
134
  cap.release()
135
 
136
- train_visual = tf.convert_to_tensor(train_visual, dtype=tf.float16)
137
- train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave, dtype=tf.float16), (1, 20, 13077))
138
- train_audio_cnn = tf.convert_to_tensor(train_audio_cnn, dtype=tf.float16)
139
 
140
  return last_frame, train_visual, train_audio_wave, train_audio_cnn
141
 
142
  def predict_emotion(video_path, audio_path):
143
  last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
144
 
145
- model = load_model("model_vui_ve.h5")
146
  predictions = model.predict({
147
  "input_visual": train_visual,
148
  "input_audio_cnn": train_audio_cnn,
@@ -152,23 +68,12 @@ def predict_emotion(video_path, audio_path):
152
  predicted_label = np.argmax(predictions)
153
  return last_frame, predicted_label
154
 
155
- # Định nghĩa giao diện Gradio
156
-
157
  def predict_emotion_gradio(video_path, audio_path):
158
  emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
159
  last_frame, predicted_label = predict_emotion(video_path, audio_path)
160
  predicted_emotion = emotion_dict[predicted_label]
161
  return last_frame, predicted_emotion
162
 
163
- # def gradio_interface(video, audio):
164
- # emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
165
- # train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video, audio)
166
- # model = trained_model("./model_vui_ve.h5")
167
- # output = model.predict({"input_visual": train_visual, "input_audio_cnn": train_audio_cnn, "input_audio_wave": train_audio_wave})
168
- # emo_index = tf.math.argmax(output)
169
-
170
- # return emotion_labels[emo_index]
171
-
172
  iface = gr.Interface(
173
  fn=predict_emotion_gradio,
174
  inputs=[
@@ -176,24 +81,11 @@ iface = gr.Interface(
176
  gr.Audio(label="Upload a audio")
177
  ],
178
  outputs=[
179
- gr.Textbox(label="Predicted Emotion"),
180
- gr.Image(label = "image frame last")],
 
181
  title="Emotion Recognition from Video",
182
  description="Upload a video and get the predicted emotion."
183
  )
184
 
185
  iface.launch()
186
-
187
- # iface = gr.Interface(
188
- # fn=gradio_interface,
189
- # inputs=[
190
- # gr.Video(),
191
- # gr.Audio()
192
- # ],
193
- # outputs=[
194
- # gr.Text()
195
- # ],
196
- # live=True,
197
- # title="Video and Audio Processing with Emotion Recognition"
198
- # )
199
- # iface.launch()
 
8
  import tensorflow as tf
9
  from tensorflow.keras.models import load_model
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def process_video_audio(video_path, audio_path):
 
12
  wav = pt.tensor(list(audio_path[1]))
13
 
14
  train_visual = pt.zeros([1, 120, 120, 3, 10])
 
49
  frame_idx += 1
50
  cap.release()
51
 
52
+ train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
53
+ train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
54
+ train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
55
 
56
  return last_frame, train_visual, train_audio_wave, train_audio_cnn
57
 
58
  def predict_emotion(video_path, audio_path):
59
  last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
60
 
61
+ model = load_model("model_vui_ve.h5", compile=False)
62
  predictions = model.predict({
63
  "input_visual": train_visual,
64
  "input_audio_cnn": train_audio_cnn,
 
68
  predicted_label = np.argmax(predictions)
69
  return last_frame, predicted_label
70
 
 
 
71
def predict_emotion_gradio(video_path, audio_path):
    """Gradio entry point: classify the emotion in an uploaded video/audio pair.

    Delegates to predict_emotion() for preprocessing and model inference,
    then maps the integer class index onto a human-readable label.

    Args:
        video_path: Path to the uploaded video (as provided by gr.Video).
        audio_path: Audio input (as provided by gr.Audio); forwarded unchanged.

    Returns:
        Tuple of (last video frame, emotion label string).
    """
    # Class-index -> label mapping; order must match the model's output head.
    label_names = ('neutral', 'calm', 'happy', 'sad', 'angry', 'fearful')
    emotion_dict = dict(enumerate(label_names))
    frame, class_idx = predict_emotion(video_path, audio_path)
    return frame, emotion_dict[class_idx]
76
 
 
 
 
 
 
 
 
 
 
77
  iface = gr.Interface(
78
  fn=predict_emotion_gradio,
79
  inputs=[
 
81
  gr.Audio(label="Upload a audio")
82
  ],
83
  outputs=[
84
+ gr.Image(label="Last Frame"),
85
+ gr.Textbox(label="Predicted Emotion")
86
+ ],
87
  title="Emotion Recognition from Video",
88
  description="Upload a video and get the predicted emotion."
89
  )
90
 
91
  iface.launch()