import os

import cv2
import gradio as gr
import numpy as np
import tensorflow as tf
import torch as pt
import torchaudio
from moviepy.editor import VideoFileClip  # MoviePy 1.x; in 2.x this moved to `from moviepy import VideoFileClip`
from tensorflow.keras.models import load_model

# Fixed input sizes expected by the trained model.
WAVE_LEN = 261540  # raw waveform length (= 20 * 13077 samples)
N_FRAMES = 10      # face frames sampled from the video
EMOTION_DICT = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad',
                4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}

# Load the model once at start-up instead of on every prediction.
model = load_model("model_vui_ve2392.keras")


def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
    """Converts video to audio using the MoviePy library, which uses `ffmpeg` under the hood."""
    filename, _ = os.path.splitext(video_file)
    audio_path = f"{filename}.{output_ext}"
    clip = VideoFileClip(video_file)
    clip.audio.write_audiofile(audio_path)
    clip.close()
    return audio_path


def process_video_audio(video_path):
    """Build the three model inputs (visual frames, raw waveform, MFCC) from one video."""
    audio_path = convert_video_to_audio_moviepy(video_path)
    wav, sr = torchaudio.load(audio_path)

    train_visual = pt.zeros([1, 120, 120, 3, N_FRAMES])
    train_audio_wave = pt.zeros([1, WAVE_LEN])
    train_audio_cnn = pt.zeros([1, 150, 512, 1])

    # With n_fft=1022 the default hop length is 511, so WAVE_LEN samples
    # yield exactly 512 MFCC frames, matching the (150, 512) CNN input.
    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    # Truncate or zero-pad the first audio channel to the fixed waveform length.
    if len(wav[0]) > WAVE_LEN:
        train_audio_wave[0, :] = wav[0][:WAVE_LEN]
    else:
        train_audio_wave[0, :len(wav[0])] = wav[0]

    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])

    # Sample every 10th frame from the first 100 frames; crop the first
    # detected face (falling back to the full frame) and resize to 120x120.
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    for i in range(100):
        ret, frame = cap.read()
        if ret and i % 10 == 0:
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                x, y, w, h = faces[0]
                crop = frame[y:y + h, x:x + w]
            else:
                crop = frame
            train_visual[0, :, :, :, frame_idx] = pt.tensor(cv2.resize(crop, (120, 120)))
            last_frame = frame
            frame_idx += 1
    cap.release()

    if last_frame is not None:
        # OpenCV frames are BGR; Gradio's Image component expects RGB.
        last_frame = cv2.cvtColor(last_frame, cv2.COLOR_BGR2RGB)

    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn


def predict_emotion(video_path):
    last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave,
    })
    predicted_label = int(np.argmax(predictions))
    return last_frame, audio_path, predicted_label


def predict_emotion_gradio(video_path):
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    return last_frame, audio_path, EMOTION_DICT[predicted_label]


iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[gr.Video(label="Upload a video")],
    outputs=[
        gr.Image(label="Last Frame"),
        gr.Audio(label="Audio"),
        gr.Textbox(label="Predicted Emotion"),
    ],
    title="Emotion recognition with multi-modal neural network",
    description="Upload a video and get the predicted emotion.",
)

if __name__ == "__main__":
    iface.launch()
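
# Optional sanity check (a sketch, not part of the app): the dict keys passed
# to model.predict must match the model's input layer names exactly. Listing
# them once at start-up catches a naming or shape mismatch early:
#
#     for tensor in model.inputs:
#         print(tensor.name, tensor.shape)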
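
# Minimal smoke test (a sketch): run the full pipeline on one local file
# before launching the UI. "sample_video.mp4" is a hypothetical path; replace
# it with any short clip that contains a face and an audio track.
#
#     frame, audio, emotion = predict_emotion_gradio("sample_video.mp4")
#     print(emotion)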