Spaces:
Sleeping
Sleeping
File size: 3,784 Bytes
1a638f1 bbaddce e6e97e9 bbaddce b249890 4303f04 bd3db00 d997553 46da3ad d997553 1fcfc81 836cfdf d997553 836cfdf d997553 836cfdf cb15e76 b5476f5 5037272 b249890 d8cd545 62b8b49 f981299 1fcfc81 836cfdf 2eef7a9 b249890 1fcfc81 836cfdf 74a0dce 769127d 1fcfc81 b249890 16e2339 b249890 1f82907 b249890 1f82907 16e2339 b249890 16e2339 f0d3073 83034e9 6fc4cb7 df52549 836cfdf 6fc4cb7 df52549 5037272 5e76cfb 6cd4025 df52549 6cd4025 6fc4cb7 6cd4025 22dfecc 5037272 6fc4cb7 6cd4025 22dfecc e115a53 a0c8815 1a638f1 22dfecc 1a638f1 22dfecc 1a638f1 2eef7a9 22dfecc 1a638f1 6cd4025 22dfecc 7082540 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import gradio as gr
import torch as pt
import torchaudio
import cv2
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from moviepy.editor import VideoFileClip
def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
    """Extract the audio track of a video into a sibling audio file.

    Uses the MoviePy library (which drives ``ffmpeg`` under the hood). The
    audio file is written next to the input video, reusing its base name
    with ``output_ext`` as the new extension.

    Args:
        video_file: Path of the input video file.
        output_ext: Extension (without the dot) of the audio file to write.

    Returns:
        Path of the audio file that was written.

    Raises:
        ValueError: If the video contains no audio track.
    """
    base_name, _ = os.path.splitext(video_file)
    audio_path = f"{base_name}.{output_ext}"
    clip = VideoFileClip(video_file)
    try:
        if clip.audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_file}")
        clip.audio.write_audiofile(audio_path)
    finally:
        # Always release the readers held by the clip, even when the
        # export fails part-way through.
        clip.close()
    return audio_path
def process_video_audio(video_path):
    """Build the three model inputs (faces, raw waveform, MFCC) for a video.

    Audio: the audio track is extracted to a file, the first channel is
    truncated or zero-padded to a fixed number of samples, and an MFCC
    representation is computed from that fixed-length waveform.

    Video: the first 100 frames are read and every 10th one is kept. For
    each kept frame the first detected frontal face is cropped (the whole
    frame is used when no face is found) and resized to 120x120. Slots for
    frames that could not be read stay zero-filled.

    Args:
        video_path: Path of the input video file.

    Returns:
        A tuple ``(last_frame, audio_path, train_visual, train_audio_wave,
        train_audio_cnn)`` where

        * ``last_frame`` is the last sampled frame exactly as returned by
          ``cv2.VideoCapture.read`` (``None`` if no frame was sampled),
        * ``audio_path`` is the path of the extracted audio file,
        * ``train_visual`` is a float16 TF tensor of shape (1, 120, 120, 3, 10),
        * ``train_audio_wave`` is a float16 TF tensor of shape (1, 20, 13077),
        * ``train_audio_cnn`` is a float16 TF tensor of shape (1, 150, 512, 1).
    """
    num_samples = 261540   # fixed waveform length; equals 20 * 13077
    num_slots = 10         # number of frame slots in the visual tensor
    frame_stride = 10      # keep one frame out of every `frame_stride`
    face_side = 120        # side length of the square face/frame crop

    audio_path = convert_video_to_audio_moviepy(video_path)
    wav, _ = torchaudio.load(audio_path)

    train_visual = pt.zeros([1, face_side, face_side, 3, num_slots])
    train_audio_wave = pt.zeros([1, num_samples])
    train_audio_cnn = pt.zeros([1, 150, 512, 1])
    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    print(wav.shape)
    # Only the first channel is used. Copy at most `num_samples` samples;
    # whatever is not overwritten stays zero, which pads short clips.
    usable = min(wav.shape[1], num_samples)
    train_audio_wave[0, :usable] = wav[0, :usable]

    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
    print(train_audio_cnn[0].shape)

    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    try:
        for i in range(num_slots * frame_stride):
            ret, frame = cap.read()
            if not ret or i % frame_stride != 0:
                continue

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                # Use the first detection only.
                x, y, w, h = faces[0]
                region = frame[y:y + h, x:x + w]
            else:
                # No face found: fall back to the whole frame.
                region = frame

            resized = cv2.resize(region, (face_side, face_side))
            train_visual[0, :, :, :, frame_idx] = pt.from_numpy(resized)
            last_frame = frame
            frame_idx += 1
    finally:
        # Release the capture even if detection or resizing raises.
        cap.release()

    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(
        tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16),
        (1, 20, 13077),
    )
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn
# Loaded Keras models keyed by file path, so the model file is read from
# disk once per process instead of once per request.
_EMOTION_MODEL_CACHE = {}


def _get_emotion_model(model_path="model_vui_ve2392.keras"):
    """Return the Keras model stored at *model_path*, loading it on first use."""
    if model_path not in _EMOTION_MODEL_CACHE:
        _EMOTION_MODEL_CACHE[model_path] = load_model(model_path)
    return _EMOTION_MODEL_CACHE[model_path]


def predict_emotion(video_path):
    """Run the multi-modal model on a video and return the predicted class.

    Args:
        video_path: Path of the input video file.

    Returns:
        A tuple ``(last_frame, audio_path, predicted_label)``: the last
        sampled video frame and the extracted audio path as produced by
        ``process_video_audio``, and the index of the highest-scoring
        model output as a plain ``int``.
    """
    last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
    model = _get_emotion_model()
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave,
    })
    # A single sample is predicted, so the argmax over the flattened output
    # is the class index.
    predicted_label = int(np.argmax(predictions))
    return last_frame, audio_path, predicted_label
def predict_emotion_gradio(video_path):
    """Gradio callback: predict the emotion shown in an uploaded video.

    Args:
        video_path: Path of the uploaded video file.

    Returns:
        A tuple ``(last_frame, audio_path, predicted_emotion)`` matching the
        interface outputs: the last sampled frame converted to RGB for
        display (``None`` if no frame could be read), the path of the
        extracted audio file, and the emotion name.
    """
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    predicted_emotion = emotion_dict[int(predicted_label)]
    if last_frame is not None:
        # OpenCV decodes frames as BGR while the Gradio image component
        # interprets arrays as RGB; convert so colours are not swapped.
        last_frame = cv2.cvtColor(last_frame, cv2.COLOR_BGR2RGB)
    return last_frame, audio_path, predicted_emotion
# Gradio front end: one uploaded video in; the last sampled frame, the
# extracted audio track and the predicted emotion name out.
_input_components = [gr.Video(label="Upload a video")]
_output_components = [
    gr.Image(label="Last Frame"),
    gr.Audio(label="Audio"),
    gr.Textbox(label="Predicted Emotion"),
]

iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=_input_components,
    outputs=_output_components,
    title="Emotion recognition with multi-modal neural network",
    description="Upload a video and get the predicted emotion.",
)

# Start the web server for the interface.
iface.launch()
|