# DPL-Project / app.py
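"""Gradio demo: emotion recognition from a video plus an audio clip.

The model is multimodal — it consumes 10 sampled face crops (120x120x3),
the raw audio waveform, and an MFCC feature map computed from that waveform.
"""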
import gradio as gr
import torch as pt
import torchaudio
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
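# Fixed input sizes expected by the model (inferred from the buffers below):
#   input_visual:     (1, 120, 120, 3, 10) — 10 face crops of 120x120x3
#   input_audio_wave: (1, 261540) raw samples, later reshaped to (1, 20, 13077)
#   input_audio_cnn:  (1, 150, 512, 1)     — MFCC features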
def process_video_audio(video_path, audio_data):
    # gr.Audio(type="numpy") delivers a (sample_rate, samples) tuple; take the samples.
    wav = pt.tensor(audio_data[1], dtype=pt.float32)
    if wav.ndim > 1:
        # Collapse stereo to mono by averaging channels (assumption: the model
        # was trained on mono audio).
        wav = wav.mean(dim=1)
    train_visual = pt.zeros([1, 120, 120, 3, 10])
    train_audio_wave = pt.zeros([1, 261540])
    train_audio_cnn = pt.zeros([1, 150, 512, 1])
    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
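    # Shape check: with n_fft=1022 the default hop length is 511, so a
    # 261540-sample waveform yields 261540 // 511 + 1 = 512 frames, matching
    # the (1, 150, 512, 1) buffer above. The transform uses torchaudio's
    # default sample_rate of 16000 for the mel filterbank, as in the original.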
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Truncate or zero-pad the waveform to the fixed length the model expects.
    if len(wav) > 261540:
        train_audio_wave[0, :] = wav[:261540]
    else:
        train_audio_wave[0, :len(wav)] = wav
    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
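    # Sample every 10th frame from the first 100, yielding the 10 crops the
    # visual branch expects. Slots past the end of a short video stay zero.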
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    for i in range(100):
        ret, frame = cap.read()
        if not ret:
            break
        if i % 10 == 0:
            # Crop the first detected face; fall back to the whole frame.
            # Note: OpenCV frames are BGR; kept as-is to match the original
            # preprocessing.
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                (x, y, w, h) = faces[0]
                face = frame[y:y + h, x:x + w]
                resized = cv2.resize(face, (120, 120))
            else:
                resized = cv2.resize(frame, (120, 120))
            train_visual[0, :, :, :, frame_idx] = pt.tensor(resized)
            last_frame = frame
            frame_idx += 1
    cap.release()
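    # Convert to float16 TF tensors for Keras. The waveform is reshaped to
    # (1, 20, 13077) — 20 chunks of 13077 samples (20 * 13077 = 261540) — to
    # match the model's input_audio_wave shape.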
    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, train_visual, train_audio_wave, train_audio_cnn
# Load the trained model once at startup; reloading it inside the prediction
# function would add significant latency to every request.
model = load_model("./model_vui_ve")


def predict_emotion(video_path, audio_data):
    last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_data)
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave
    })
    predicted_label = int(np.argmax(predictions))
    return last_frame, predicted_label
def predict_emotion_gradio(video_path, audio_data):
    # Class indices map to emotion names (the order matches the first six
    # RAVDESS emotion classes).
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
    last_frame, predicted_label = predict_emotion(video_path, audio_data)
    predicted_emotion = emotion_dict.get(predicted_label, "unknown")
    return last_frame, predicted_emotion
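# Minimal local smoke test (hypothetical — assumes a sample clip at
# "sample.mp4" and 16 kHz mono audio; adjust paths and rate to your data):
#
#   sr = 16000
#   silence = (sr, np.zeros(sr * 3, dtype=np.float32))  # 3 s of silence
#   frame, emotion = predict_emotion_gradio("sample.mp4", silence)
#   print(emotion)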
iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[
        gr.Video(label="Upload a video"),
        # type="numpy" (the default) delivers audio as a (sample_rate, samples)
        # tuple, which process_video_audio relies on.
        gr.Audio(type="numpy", label="Upload an audio clip")
    ],
    outputs=[
        gr.Image(label="Last Frame"),
        gr.Textbox(label="Predicted Emotion")
    ],
    title="Emotion Recognition from Video",
    description="Upload a video and an audio clip to get the predicted emotion."
)

if __name__ == "__main__":
    iface.launch()