Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -117,6 +117,7 @@ def process_video_audio(video_path, audio_path):
|
|
| 117 |
|
| 118 |
cap = cv2.VideoCapture(video_path)
|
| 119 |
frame_idx = 0
|
|
|
|
| 120 |
for i in range(100):
|
| 121 |
ret, frame = cap.read()
|
| 122 |
if ret and (i % 10 == 0):
|
|
@@ -130,17 +131,18 @@ def process_video_audio(video_path, audio_path):
|
|
| 130 |
else:
|
| 131 |
resized_frame = cv2.resize(frame, (120, 120))
|
| 132 |
train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_frame)
|
|
|
|
| 133 |
frame_idx += 1
|
| 134 |
cap.release()
|
| 135 |
-
|
| 136 |
train_visual = tf.convert_to_tensor(train_visual, dtype=tf.float16)
|
| 137 |
train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave, dtype=tf.float16), (1, 20, 13077))
|
| 138 |
train_audio_cnn = tf.convert_to_tensor(train_audio_cnn, dtype=tf.float16)
|
| 139 |
|
| 140 |
-
return train_visual, train_audio_wave, train_audio_cnn
|
| 141 |
|
| 142 |
def predict_emotion(video_path, audio_path):
|
| 143 |
-
train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
|
| 144 |
|
| 145 |
model = load_model("model_vui_ve.h5")
|
| 146 |
predictions = model.predict({
|
|
@@ -150,15 +152,15 @@ def predict_emotion(video_path, audio_path):
|
|
| 150 |
})
|
| 151 |
|
| 152 |
predicted_label = np.argmax(predictions)
|
| 153 |
-
return predicted_label
|
| 154 |
|
| 155 |
# Define the Gradio interface
|
| 156 |
|
| 157 |
-
def predict_emotion_gradio(
|
| 158 |
emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
|
| 159 |
-
predicted_label = predict_emotion(
|
| 160 |
predicted_emotion = emotion_dict[predicted_label]
|
| 161 |
-
return predicted_emotion
|
| 162 |
|
| 163 |
# def gradio_interface(video, audio):
|
| 164 |
# emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
|
|
@@ -172,11 +174,12 @@ def predict_emotion_gradio(video, audio):
|
|
| 172 |
iface = gr.Interface(
|
| 173 |
fn=predict_emotion_gradio,
|
| 174 |
inputs=[
|
| 175 |
-
gr.Video(label="Upload a video"
|
| 176 |
-
),
|
| 177 |
gr.Audio(label="Upload a audio")
|
| 178 |
],
|
| 179 |
-
outputs=
|
|
|
|
|
|
|
| 180 |
title="Emotion Recognition from Video",
|
| 181 |
description="Upload a video and get the predicted emotion."
|
| 182 |
)
|
|
|
|
| 117 |
|
| 118 |
cap = cv2.VideoCapture(video_path)
|
| 119 |
frame_idx = 0
|
| 120 |
+
last_frame = None
|
| 121 |
for i in range(100):
|
| 122 |
ret, frame = cap.read()
|
| 123 |
if ret and (i % 10 == 0):
|
|
|
|
| 131 |
else:
|
| 132 |
resized_frame = cv2.resize(frame, (120, 120))
|
| 133 |
train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_frame)
|
| 134 |
+
last_frame = frame
|
| 135 |
frame_idx += 1
|
| 136 |
cap.release()
|
| 137 |
+
|
| 138 |
train_visual = tf.convert_to_tensor(train_visual, dtype=tf.float16)
|
| 139 |
train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave, dtype=tf.float16), (1, 20, 13077))
|
| 140 |
train_audio_cnn = tf.convert_to_tensor(train_audio_cnn, dtype=tf.float16)
|
| 141 |
|
| 142 |
+
return last_frame, train_visual, train_audio_wave, train_audio_cnn
|
| 143 |
|
| 144 |
def predict_emotion(video_path, audio_path):
|
| 145 |
+
last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
|
| 146 |
|
| 147 |
model = load_model("model_vui_ve.h5")
|
| 148 |
predictions = model.predict({
|
|
|
|
| 152 |
})
|
| 153 |
|
| 154 |
predicted_label = np.argmax(predictions)
|
| 155 |
+
return last_frame, predicted_label
|
| 156 |
|
| 157 |
# Define the Gradio interface
|
| 158 |
|
| 159 |
+
def predict_emotion_gradio(video_path, audio_path):
    """Gradio callback: predict an emotion and return a frame with its label.

    Delegates to ``predict_emotion`` and translates the numeric class index
    it returns into a human-readable emotion name.

    Args:
        video_path: Video input forwarded unchanged to ``predict_emotion``.
        audio_path: Audio input forwarded unchanged to ``predict_emotion``.

    Returns:
        A ``(last_frame, predicted_emotion)`` tuple, in that order: the frame
        handed back by ``predict_emotion`` and the emotion name as a string.

    Raises:
        KeyError: If the predicted class index is not one of 0-5.
    """
    # NOTE(review): the frame presumably comes straight from OpenCV (BGR
    # channel order) -- confirm before displaying it as an RGB image.
    frame, label_index = predict_emotion(video_path, audio_path)

    # Class index -> emotion name lookup.
    label_names = {
        0: 'neutral',
        1: 'calm',
        2: 'happy',
        3: 'sad',
        4: 'angry',
        5: 'fearful',
    }
    return frame, label_names[label_index]
|
| 164 |
|
| 165 |
# def gradio_interface(video, audio):
|
| 166 |
# emotion_labels = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
|
|
|
|
| 174 |
# Gradio UI: the user uploads a video and an audio clip; the callback's
# results are shown as an image and a text box.
iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[
        gr.Video(label="Upload a video"),
        # NOTE(review): the callback names this argument ``audio_path``;
        # confirm the Audio component is configured to deliver a file path
        # rather than raw sample data.
        gr.Audio(label="Upload a audio"),
    ],
    # Output components are filled positionally from the callback's return
    # value, and predict_emotion_gradio returns (last_frame, predicted_emotion),
    # so the image component must come first and the text box second.
    outputs=[
        gr.Image(label="Last frame"),
        gr.Textbox(label="Predicted Emotion"),
    ],
    title="Emotion Recognition from Video",
    description="Upload a video and get the predicted emotion."
)
|