# DPL-Project / app.py
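"""Gradio demo: emotion recognition from a video plus an audio clip.

The model is multimodal — it consumes 10 sampled face crops (120x120x3),
the raw audio waveform, and an MFCC feature map computed from that waveform.
"""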
import gradio as gr
import torch as pt
import torchaudio
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
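# Fixed input sizes expected by the model (inferred from the buffers below):
#   input_visual:     (1, 120, 120, 3, 10) — 10 face crops of 120x120x3
#   input_audio_wave: (1, 261540) raw samples, later reshaped to (1, 20, 13077)
#   input_audio_cnn:  (1, 150, 512, 1)     — MFCC features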
def process_video_audio(video_path, audio_data):
    # gr.Audio(type="numpy") delivers a (sample_rate, samples) tuple; take the samples.
    wav = pt.tensor(audio_data[1], dtype=pt.float32)
    if wav.ndim > 1:
        # Collapse stereo to mono by averaging channels (assumption: the model
        # was trained on mono audio).
        wav = wav.mean(dim=1)
    train_visual = pt.zeros([1, 120, 120, 3, 10])
    train_audio_wave = pt.zeros([1, 261540])
    train_audio_cnn = pt.zeros([1, 150, 512, 1])
    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
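    # Shape check: with n_fft=1022 the default hop length is 511, so a
    # 261540-sample waveform yields 261540 // 511 + 1 = 512 frames, matching
    # the (1, 150, 512, 1) buffer above. The transform uses torchaudio's
    # default sample_rate of 16000 for the mel filterbank, as in the original.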
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Truncate or zero-pad the waveform to the fixed length the model expects.
    if len(wav) > 261540:
        train_audio_wave[0, :] = wav[:261540]
    else:
        train_audio_wave[0, :len(wav)] = wav
    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
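    # Sample every 10th frame from the first 100, yielding the 10 crops the
    # visual branch expects. Slots past the end of a short video stay zero.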
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    for i in range(100):
        ret, frame = cap.read()
        if not ret:
            break
        if i % 10 == 0:
            # Crop the first detected face; fall back to the whole frame.
            # Note: OpenCV frames are BGR; kept as-is to match the original
            # preprocessing.
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                (x, y, w, h) = faces[0]
                face = frame[y:y + h, x:x + w]
                resized = cv2.resize(face, (120, 120))
            else:
                resized = cv2.resize(frame, (120, 120))
            train_visual[0, :, :, :, frame_idx] = pt.tensor(resized)
            last_frame = frame
            frame_idx += 1
    cap.release()
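    # Convert to float16 TF tensors for Keras. The waveform is reshaped to
    # (1, 20, 13077) — 20 chunks of 13077 samples (20 * 13077 = 261540) — to
    # match the model's input_audio_wave shape.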
    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, train_visual, train_audio_wave, train_audio_cnn
# Load the trained model once at startup; reloading it inside the prediction
# function would add significant latency to every request.
model = load_model("./model_vui_ve")


def predict_emotion(video_path, audio_data):
    last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_data)
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave
    })
    predicted_label = int(np.argmax(predictions))
    return last_frame, predicted_label
def predict_emotion_gradio(video_path, audio_data):
    # Class indices map to emotion names (the order matches the first six
    # RAVDESS emotion classes).
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
    last_frame, predicted_label = predict_emotion(video_path, audio_data)
    predicted_emotion = emotion_dict.get(predicted_label, "unknown")
    return last_frame, predicted_emotion
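# Minimal local smoke test (hypothetical — assumes a sample clip at
# "sample.mp4" and 16 kHz mono audio; adjust paths and rate to your data):
#
#   sr = 16000
#   silence = (sr, np.zeros(sr * 3, dtype=np.float32))  # 3 s of silence
#   frame, emotion = predict_emotion_gradio("sample.mp4", silence)
#   print(emotion)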
iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[
        gr.Video(label="Upload a video"),
        # type="numpy" (the default) delivers audio as a (sample_rate, samples)
        # tuple, which process_video_audio relies on.
        gr.Audio(type="numpy", label="Upload an audio clip")
    ],
    outputs=[
        gr.Image(label="Last Frame"),
        gr.Textbox(label="Predicted Emotion")
    ],
    title="Emotion Recognition from Video",
    description="Upload a video and an audio clip to get the predicted emotion."
)

if __name__ == "__main__":
    iface.launch()