# DPL-Project / app.py
import gradio as gr
import torch as pt
import torchaudio
import cv2
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from moviepy.editor import VideoFileClip
import threading
import socketio  # python-socketio: client used to push predictions to the chat server
from flask import Flask, request
from flask_socketio import SocketIO  # Flask-SocketIO: server half of the chat

def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
"""Converts video to audio using MoviePy library that uses `ffmpeg` under the hood"""
filename, ext = os.path.splitext(video_file)
clip = VideoFileClip(video_file)
audio_path = f"{filename}.{output_ext}"
clip.audio.write_audiofile(audio_path)
return audio_path
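
# Example (hypothetical input path):
#   audio_path = convert_video_to_audio_moviepy("clip.mp4")  # writes "clip.wav"
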
def process_video_audio(video_path):
audio_path = convert_video_to_audio_moviepy(video_path)
wav, sr = torchaudio.load(audio_path)
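    # Fixed-size buffers matching the model's input shapes:
    #   - visual: 10 sampled frames of 120x120x3 crops (frame axis last)
    #   - waveform: 261540 samples (~5.9 s if the audio is 44.1 kHz -- an
    #     assumption about the training data)
    #   - MFCC "image": 150 coefficients x 512 frames (n_fft=1022 gives a hop
    #     of 511, and 261540 // 511 + 1 = 512)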
train_visual = pt.zeros([1, 120, 120, 3, 10])
train_audio_wave = pt.zeros([1, 261540])
train_audio_cnn = pt.zeros([1, 150, 512, 1])
mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    # Truncate or zero-pad the waveform to exactly 261540 samples.
    if len(wav[0]) > 261540:
        train_audio_wave[0, :] = wav[0][:261540]
    else:
        train_audio_wave[0, :len(wav[0])] = wav[0][:]
    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
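
    # Sample every 10th frame from the first 100 (10 frames total). Crop the
    # first face the Haar cascade finds; otherwise fall back to the resized
    # full frame. Frames stay in OpenCV's BGR channel order.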
cap = cv2.VideoCapture(video_path)
frame_idx = 0
last_frame = None
for i in range(100):
ret, frame = cap.read()
if ret and (i % 10 == 0):
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
if len(faces) > 0:
(x, y, w, h) = faces[0]
face = frame[y:y+h, x:x+w]
resized_face = cv2.resize(face, (120, 120))
train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_face)
else:
resized_frame = cv2.resize(frame, (120, 120))
train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_frame)
last_frame = frame
frame_idx += 1
cap.release()
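
    # Convert to float16 TensorFlow tensors (the dtype the saved model appears
    # to expect); the waveform is reshaped to (1, 20, 13077) since
    # 20 * 13077 = 261540.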
train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn

def predict_emotion(video_path):
last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
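    # NOTE: the model is reloaded from disk on every call; caching it at module
    # level would likely be faster. "model_vui_ve.keras" ("vui vẻ" is
    # Vietnamese for "cheerful") must sit next to app.py, and the input names
    # below must match the ones used when the model was built.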
model = load_model("model_vui_ve.keras")
predictions = model.predict({
"input_visual": train_visual,
"input_audio_cnn": train_audio_cnn,
"input_audio_wave": train_audio_wave
})
predicted_label = np.argmax(predictions)
return last_frame, audio_path, predicted_label

# Label mapping for the model's six output classes (recovered from the
# original commented-out Gradio wiring).
emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}

def run_chat_server(app):
    """Registers chat routes and Socket.IO handlers on the given Flask app.

    Expects `app.socketio` to be a flask_socketio.SocketIO instance; the
    server itself is started at the bottom of this file.
    """
    clients = []
    messages = []

    @app.route('/chat', methods=['GET'])
    def chat():
        # Plain-HTTP view of the chat history. (SocketIO.send() only works
        # inside a socket context, so return the log as JSON instead.)
        return {'messages': messages}

    @app.socketio.on('message')
    def handle_message(message):
        clients.append(message['client'])
        messages.append(message)
        app.logger.info(f'Received message: {message}')
        # Broadcast to everyone except the sender.
        app.socketio.emit('message', message, skip_sid=request.sid)

    @app.socketio.on('connect')
    def handle_connect():
        app.logger.info('Client connected')

    @app.socketio.on('disconnect')
    def handle_disconnect():
        app.logger.info('Client disconnected')

def predict_emotion_with_chat(video_path):
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    predicted_emotion = emotion_dict[predicted_label]
    chat_log = []  # (sender, text) pairs for gr.Chatbot

    # Connect to the local chat server started at the bottom of this file.
    client = socketio.Client()

    @client.on('message')
    def collect(message):
        # Collect anything broadcast while we are connected.
        chat_log.append((message.get('client'), message.get('message')))

    client.connect('http://localhost:5000')  # '/chat' is an HTTP route, not a Socket.IO namespace
    # Send the predicted emotion to the chat server.
    payload = {'client': 'Emotion Recognition',
               'message': f'Predicted emotion: {predicted_emotion}'}
    client.emit('message', payload)
    chat_log.append((payload['client'], payload['message']))
    client.disconnect()
    return last_frame, audio_path, predicted_emotion, chat_log

iface = gr.Interface(
fn=predict_emotion_with_chat,
inputs=[
gr.Video(label="Upload a video")
],
outputs=[
gr.Image(label="Last Frame"),
gr.Audio(label="Audio"),
gr.Textbox(label="Predicted Emotion"),
gr.Chatbot(label="Chat")
],
title="Emotion Recognition with Chat",
description="Upload a video and get the predicted emotion. Chat with others in real-time."
)

# Start the chat server and the Gradio interface. flask_socketio's run()
# blocks, so it goes on a daemon thread; port 5000 matches the URL the
# prediction client connects to above. (Recent Flask-SocketIO versions may
# additionally require allow_unsafe_werkzeug=True for the Werkzeug dev server.)
app = Flask(__name__)
app.config['SECRET_KEY'] = 'secret'
app.socketio = SocketIO(app)
run_chat_server(app)
threading.Thread(target=lambda: app.socketio.run(app, port=5000), daemon=True).start()
iface.launch()