Spaces:
Sleeping
Sleeping
File size: 6,008 Bytes
e115a53 1a638f1 bbaddce e6e97e9 bbaddce b249890 4303f04 bd3db00 d997553 e115a53 46da3ad d997553 836cfdf d997553 836cfdf d997553 836cfdf cb15e76 b5476f5 b249890 d8cd545 f981299 1a638f1 836cfdf b249890 1a638f1 836cfdf 74a0dce 769127d 1a638f1 b249890 16e2339 b249890 1f82907 b249890 1f82907 16e2339 b249890 16e2339 f0d3073 83034e9 6fc4cb7 df52549 836cfdf 6fc4cb7 df52549 025dc88 5e76cfb 6cd4025 df52549 6cd4025 6fc4cb7 6cd4025 e115a53 6fc4cb7 6cd4025 e115a53 689cf50 e115a53 6cd4025 1a638f1 e115a53 1a638f1 e115a53 689cf50 1a638f1 e115a53 1a638f1 6cd4025 e115a53 1a638f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# import gradio as gr
# import torch as pt
# import torchaudio
# import cv2
# import os
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras.models import load_model
# from moviepy.editor import VideoFileClip
import gradio as gr
import torch as pt
import torchaudio
import cv2
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from moviepy.editor import VideoFileClip
import socketIO_client as sio
def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
    """Convert a video file to an audio file using MoviePy (ffmpeg under the hood).

    Parameters
    ----------
    video_file : str
        Path to the input video.
    output_ext : str
        Extension/format of the output audio (default ``"wav"``).

    Returns
    -------
    str
        Path of the written audio file (input path with its extension swapped).
    """
    # Derive the output path from the input video's basename. Previously the
    # basename was a hard-coded placeholder string, so the computed `filename`
    # was unused and every call clobbered the same output file.
    filename, _ = os.path.splitext(video_file)
    audio_path = f"{filename}.{output_ext}"
    clip = VideoFileClip(video_file)
    try:
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release ffmpeg readers/file handles even if writing fails.
        clip.close()
    return audio_path
def process_video_audio(video_path):
    """Extract the audio track and sampled frames from a video and pack them
    into the three fixed-size tensors the emotion model consumes.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    tuple
        ``(last_frame, audio_path, train_visual, train_audio_wave,
        train_audio_cnn)`` where:
        - ``last_frame``: last BGR frame read (``None`` if none was read),
        - ``audio_path``: path of the extracted audio file,
        - ``train_visual``: tf.float16 tensor ``(1, 120, 120, 3, 10)`` of up
          to 10 face crops (or whole frames),
        - ``train_audio_wave``: tf.float16 tensor ``(1, 20, 13077)`` holding
          the truncated/zero-padded waveform,
        - ``train_audio_cnn``: tf.float16 tensor ``(1, 150, 512, 1)`` of
          MFCC features.
    """
    audio_path = convert_video_to_audio_moviepy(video_path)
    # wav is (channels, samples); only channel 0 is used below.
    wav, sr = torchaudio.load(audio_path)
    # Fixed-size buffers matching the model's expected input shapes.
    train_visual = pt.zeros([1, 120, 120, 3, 10])
    train_audio_wave = pt.zeros([1, 261540])  # 261540 = 20 * 13077 samples
    train_audio_cnn = pt.zeros([1, 150, 512, 1])
    # 150 MFCC coefficients; with n_fft=1022 (hop = n_fft // 2 = 511) a
    # 261540-sample input yields 512 time frames, matching train_audio_cnn.
    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    # Truncate or zero-pad channel 0 to exactly 261540 samples.
    if len(wav[0]) > 261540:
        print(wav.shape)
        train_audio_wave[0, :] = wav[0][:261540]
    else:
        print(wav.shape)
        train_audio_wave[0, :len(wav[0])] = wav[0][:]
    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
    print(train_audio_cnn[0].shape)
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    # Sample every 10th of the first 100 frames -> at most 10 frames,
    # matching the last dimension of train_visual. Shorter videos simply
    # leave the remaining slots zero-filled.
    for i in range(100):
        ret, frame = cap.read()
        if ret and (i % 10 == 0):
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                # Crop the first detected face and resize to the model input size.
                (x, y, w, h) = faces[0]
                face = frame[y:y+h, x:x+w]
                resized_face = cv2.resize(face, (120, 120))
                train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_face)
            else:
                # No face detected: fall back to the whole resized frame.
                resized_frame = cv2.resize(frame, (120, 120))
                train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_frame)
            last_frame = frame
            frame_idx += 1
    cap.release()
    # Convert the torch buffers to TensorFlow float16 tensors for the model.
    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn
def predict_emotion(video_path):
    """Run the multimodal emotion model on a video.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    tuple
        ``(last_frame, audio_path, predicted_label)`` where
        ``predicted_label`` is the argmax index of the model's output
        (an integer class id; the caller maps it to an emotion name).
    """
    last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
    # Load the Keras model once and cache it on the function object so that
    # repeated predictions do not re-read the model file from disk.
    model = getattr(predict_emotion, "_model", None)
    if model is None:
        model = load_model("model_vui_ve.keras")
        predict_emotion._model = model
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave
    })
    predicted_label = np.argmax(predictions)
    return last_frame, audio_path, predicted_label
# def predict_emotion_gradio(video_path):
# emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
# last_frame, audio_path, predicted_label = predict_emotion(video_path)
# predicted_emotion = emotion_dict[predicted_label]
# return last_frame, audio_path, predicted_emotion
# iface = gr.Interface(
# fn=predict_emotion_gradio,
# inputs=[
# gr.Video(label="Upload a video")
# ],
# outputs=[
# gr.Image(label="Last Frame"),
# gr.Audio(label = "Audio"),
# gr.Textbox(label="Predicted Emotion")
# ],
# title="Emotion Recognition from Video",
# description="Upload a video and get the predicted emotion."
# )
# iface.launch()
def run_chat_server(app):
    """Register a chat endpoint and Socket.IO event handlers on *app*.

    Parameters
    ----------
    app : Flask
        A Flask application that already carries a ``socketio`` attribute
        (the caller assigns it before invoking this function).

    Side effects: registers the ``/chat`` route and ``message`` /
    ``connect`` / ``disconnect`` handlers; when the module is executed as
    a script, also starts the (blocking) Flask development server.
    """
    clients = []   # client names seen so far (one entry per received message)
    messages = []  # full chat history, in arrival order

    @app.route('/chat', methods=['GET', 'POST'])
    def chat():
        # Return the chat history as JSON. The original returned the result
        # of socketio.send(...), which is None — Flask raises a TypeError
        # for a view function that returns None.
        return {'messages': messages}

    @app.socketio.on('message')
    def handle_message(message):
        clients.append(message['client'])
        messages.append(message)
        app.logger.info(f'Received message: {message}')
        # NOTE(review): skip_sid normally expects a session id, not True —
        # confirm against the Socket.IO server library actually in use.
        app.socketio.emit('message', message, skip_sid=True)

    @app.socketio.on('connect')
    def handle_connect():
        app.logger.info('Client connected')

    @app.socketio.on('disconnect')
    def handle_disconnect():
        app.logger.info('Client disconnected')

    # NOTE(review): this guard sits inside the function, so it fires whenever
    # run_chat_server() is called during a script run; app.run() then blocks,
    # preventing any code after the call site from executing — verify the
    # intended startup order.
    if __name__ == '__main__':
        app.run(debug=True)
def predict_emotion_with_chat(video_path):
    """Predict the emotion in a video and broadcast it to the chat server.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    tuple
        ``(last_frame, audio_path, predicted_emotion, messages)`` where
        ``predicted_emotion`` is the human-readable label and ``messages``
        is the list of chat payloads produced by this call.
    """
    # Map model class ids to emotion names. Previously this relied on a
    # module-level `emotion_dict` that exists only in commented-out code,
    # so the function raised NameError at runtime.
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    predicted_emotion = emotion_dict[predicted_label]
    # Collect the payloads sent during this call; `messages` was previously
    # an undefined name at the return statement.
    messages = []
    client = sio.Client()
    try:
        # Connect to the chat server and announce the prediction.
        client.connect('http://localhost:5000/chat')
        payload = {'client': 'Emotion Recognition',
                   'message': f'Predicted emotion: {predicted_emotion}'}
        client.emit('message', payload)
        messages.append(payload)
        # NOTE(review): the original iterated `client.events` to "receive"
        # messages, but that is not an iterable of incoming messages in any
        # Socket.IO client API; real-time receipt requires registering an
        # event handler — confirm the intended chat-feed behavior.
    finally:
        client.disconnect()
    return last_frame, audio_path, predicted_emotion, messages
# Gradio front-end: a video goes in; the last frame, the extracted audio,
# the predicted emotion, and the chat feed come out.
video_input = gr.Video(label="Upload a video")
frame_output = gr.Image(label="Last Frame")
audio_output = gr.Audio(label="Audio")
emotion_output = gr.Textbox(label="Predicted Emotion")
chat_output = gr.Chatbot(label="Chat")

iface = gr.Interface(
    fn=predict_emotion_with_chat,
    inputs=[video_input],
    outputs=[frame_output, audio_output, emotion_output, chat_output],
    title="Emotion Recognition with Chat",
    description="Upload a video and get the predicted emotion. Chat with others in real-time.",
)
# Start the Gradio interface and the chat server
from flask import Flask

app = Flask(__name__)
app.config['SECRET_KEY'] = 'secret'
# NOTE(review): `sio` is the socketIO_client package (a Socket.IO *client*
# library); its SocketIO class expects a host/port, not a Flask app. A
# server-side integration such as flask_socketio looks intended here —
# confirm and correct the dependency.
app.socketio = sio.SocketIO(app)
# Registers the chat route/handlers. NOTE(review): run_chat_server contains
# an `if __name__ == '__main__'` guard that calls app.run(), which blocks
# when this file is executed as a script, so iface.launch() below may never
# be reached — verify the intended startup order.
run_chat_server(app)
iface.launch()
|