"""Emotion recognition from an uploaded video, served through Gradio.

The pipeline extracts the audio track, builds the three model inputs
(face crops, raw waveform, MFCC features), runs a Keras model and shows
the predicted emotion.  The prediction is also posted to a socket.IO
chat that is hosted by a Flask app in the same process.
"""
import functools
import logging
import os
import threading

import cv2
import gradio as gr
import numpy as np
import socketIO_client as sio
import tensorflow as tf
import torch as pt
import torchaudio
from flask import Flask, jsonify
from moviepy.editor import VideoFileClip
from tensorflow.keras.models import load_model

logger = logging.getLogger(__name__)

MODEL_PATH = "model_vui_ve.keras"
CHAT_SERVER_URL = 'http://localhost:5000/chat'

# Number of waveform samples fed to the model (20 * 13077, see the reshape
# in process_video_audio).
AUDIO_SAMPLES = 261540

# Class index -> label shown to the user.
EMOTION_LABELS = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
}

# Chat state shared between the socket.IO handlers and the Gradio callback,
# which both live in this process.
_chat_clients = []
_chat_messages = []


def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
    """Extract the audio track of *video_file* next to it and return its path.

    Uses MoviePy, which relies on ``ffmpeg`` under the hood.

    Args:
        video_file: Path of the source video.
        output_ext: Extension (without dot) of the audio file to write.

    Returns:
        Path of the written audio file: the video path with its extension
        replaced by ``output_ext``.

    Raises:
        ValueError: If the video has no audio track.
    """
    filename, _ = os.path.splitext(video_file)
    audio_path = f"{filename}.{output_ext}"
    clip = VideoFileClip(video_file)
    try:
        if clip.audio is None:
            raise ValueError(f"Video has no audio track: {video_file}")
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release the reader held open by the clip.
        clip.close()
    return audio_path


def process_video_audio(video_path):
    """Build the model inputs for one video.

    Args:
        video_path: Path of the video to analyse.

    Returns:
        A tuple ``(last_frame, audio_path, train_visual, train_audio_wave,
        train_audio_cnn)`` where

        * ``last_frame`` is the last sampled frame exactly as OpenCV decoded
          it, or ``None`` when no frame could be read;
        * ``audio_path`` is the extracted audio file;
        * ``train_visual`` is a float16 tensor of shape (1, 120, 120, 3, 10);
        * ``train_audio_wave`` is a float16 tensor of shape (1, 20, 13077);
        * ``train_audio_cnn`` is a float16 tensor of shape (1, 150, 512, 1).
    """
    audio_path = convert_video_to_audio_moviepy(video_path)
    wav, _sample_rate = torchaudio.load(audio_path)
    logger.debug("Loaded waveform with shape %s", tuple(wav.shape))

    # Only the first channel is used; it is truncated or zero-padded to
    # exactly AUDIO_SAMPLES samples.
    train_audio_wave = pt.zeros([1, AUDIO_SAMPLES])
    usable = min(wav.shape[1], AUDIO_SAMPLES)
    train_audio_wave[0, :usable] = wav[0, :usable]

    # The MFCC output must be 150 x 512 to fit the buffer below.
    mfcc = torchaudio.transforms.MFCC(
        n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150}
    )
    train_audio_cnn = pt.zeros([1, 150, 512, 1])
    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
    logger.debug("MFCC features with shape %s", tuple(train_audio_cnn[0].shape))

    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )
    train_visual = pt.zeros([1, 120, 120, 3, 10])
    last_frame = None
    frame_idx = 0
    cap = cv2.VideoCapture(video_path)
    try:
        # Look at the first 100 frames and keep every 10th one, which gives
        # at most the 10 slots available in train_visual.
        for i in range(100):
            ret, frame = cap.read()
            if not ret or i % 10 != 0:
                continue
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(
                gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
            )
            # `faces` may be a NumPy array, so test its length rather than
            # its truthiness.
            if len(faces) > 0:
                x, y, w, h = faces[0]
                region = frame[y:y + h, x:x + w]
            else:
                # No face found: fall back to the whole frame.
                region = frame
            train_visual[0, :, :, :, frame_idx] = pt.tensor(
                cv2.resize(region, (120, 120))
            )
            last_frame = frame
            frame_idx += 1
    finally:
        cap.release()

    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(
        tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16),
        (1, 20, 13077),
    )
    train_audio_cnn = tf.convert_to_tensor(
        train_audio_cnn.numpy(), dtype=tf.float16
    )
    return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn


@functools.lru_cache(maxsize=1)
def _load_emotion_model():
    """Load the Keras model once and reuse it for every later prediction."""
    return load_model(MODEL_PATH)


def predict_emotion(video_path):
    """Run the emotion model on one video.

    Args:
        video_path: Path of the video to analyse.

    Returns:
        A tuple ``(last_frame, audio_path, predicted_label)``;
        ``predicted_label`` is the arg-max index of the model output.
    """
    (last_frame, audio_path, train_visual,
     train_audio_wave, train_audio_cnn) = process_video_audio(video_path)
    model = _load_emotion_model()
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave,
    })
    predicted_label = np.argmax(predictions)
    return last_frame, audio_path, predicted_label


def run_chat_server(app):
    """Register the chat route and socket.IO handlers on *app*.

    When the module is executed as a script, the Flask server is also
    started in a background daemon thread, so this function returns and the
    caller can go on to launch the Gradio interface.

    Args:
        app: Flask application carrying a socket.IO server object in
            ``app.socketio``.
    """

    @app.route('/chat', methods=['GET', 'POST'])
    def chat():
        # Push the history to connected clients and also return it, because
        # a Flask view has to return a response object.
        app.socketio.send(_chat_messages)
        return jsonify(_chat_messages)

    @app.socketio.on('message')
    def handle_message(message):
        _chat_clients.append(message['client'])
        _chat_messages.append(message)
        app.logger.info(f'Received message: {message}')
        # NOTE(review): skip_sid normally names a session id to exclude;
        # True looks unintended -- confirm.
        app.socketio.emit('message', message, skip_sid=True)

    @app.socketio.on('connect')
    def handle_connect():
        app.logger.info('Client connected')

    @app.socketio.on('disconnect')
    def handle_disconnect():
        app.logger.info('Client disconnected')

    if __name__ == '__main__':
        # app.run() blocks, so it runs in its own thread.  The reloader is
        # disabled because it only works from the main thread.
        threading.Thread(
            target=app.run,
            kwargs={'debug': True, 'use_reloader': False},
            name='chat-server',
            daemon=True,
        ).start()


def _format_chat_history(messages):
    """Turn stored chat payloads into ``[user, bot]`` pairs for gr.Chatbot.

    NOTE(review): assumes the pair-based Chatbot value format; newer Gradio
    releases may expect role/content dicts instead -- confirm against the
    installed version.
    """
    return [
        [None, f"{entry.get('client', 'unknown')}: {entry.get('message', '')}"]
        for entry in messages
    ]


def predict_emotion_with_chat(video_path):
    """Gradio callback: predict the emotion and announce it in the chat.

    Args:
        video_path: Path of the uploaded video.

    Returns:
        A tuple ``(last_frame, audio_path, predicted_emotion, history)``
        matching the four outputs of the interface: an RGB preview frame (or
        ``None``), the extracted audio file, the emotion label and the chat
        history.
    """
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    label_index = int(predicted_label)
    predicted_emotion = EMOTION_LABELS.get(label_index, f'unknown ({label_index})')

    if last_frame is not None:
        # OpenCV decodes frames as BGR; convert to RGB for display.
        last_frame = cv2.cvtColor(last_frame, cv2.COLOR_BGR2RGB)

    announcement = {
        'client': 'Emotion Recognition',
        'message': f'Predicted emotion: {predicted_emotion}',
    }
    # The chat is a best-effort side channel: a failure here must not throw
    # away a prediction that was already computed, so it is only logged.
    # NOTE(review): `socketIO_client` does not appear to provide `Client`;
    # this call sequence matches python-socketio's `socketio.Client` --
    # confirm which package is intended.
    try:
        client = sio.Client()
        client.connect(CHAT_SERVER_URL)
        try:
            client.emit('message', announcement)
        finally:
            client.disconnect()
    except Exception:
        logger.exception('Could not post the prediction to the chat server')

    return last_frame, audio_path, predicted_emotion, _format_chat_history(_chat_messages)


iface = gr.Interface(
    fn=predict_emotion_with_chat,
    inputs=[
        gr.Video(label="Upload a video")
    ],
    outputs=[
        gr.Image(label="Last Frame"),
        gr.Audio(label="Audio"),
        gr.Textbox(label="Predicted Emotion"),
        gr.Chatbot(label="Chat")
    ],
    title="Emotion Recognition with Chat",
    description="Upload a video and get the predicted emotion. Chat with others in real-time."
)

# Start the Gradio interface and the chat server
app = Flask(__name__)
# The hard-coded value stays as the default; set SECRET_KEY in the
# environment for anything beyond local experiments.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'secret')
# NOTE(review): `socketIO_client.SocketIO` is a socket.IO *client*; wrapping a
# Flask app like this matches `flask_socketio.SocketIO` instead.  Left as
# written because switching packages adds a dependency -- confirm and fix the
# import, otherwise this line is expected to fail at startup.
app.socketio = sio.SocketIO(app)
run_chat_server(app)

iface.launch()