THP2903 committed on
Commit
1fcfc81
·
verified ·
1 Parent(s): 0e64918

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -176
app.py CHANGED
@@ -8,179 +8,6 @@
8
  # # from tensorflow.keras.models import load_model
9
  # # from moviepy.editor import VideoFileClip
10
 
11
- # import gradio as gr
12
- # import torch as pt
13
- # import torchaudio
14
- # import cv2
15
- # import os
16
- # import numpy as np
17
- # import tensorflow as tf
18
- # from tensorflow.keras.models import load_model
19
- # from moviepy.editor import VideoFileClip
20
- # import socketIO_client as sio
21
-
22
- # def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
23
- # """Converts video to audio using MoviePy library that uses `ffmpeg` under the hood"""
24
- # filename, ext = os.path.splitext(video_file)
25
- # clip = VideoFileClip(video_file)
26
- # audio_path = f"{filename}.{output_ext}"
27
- # clip.audio.write_audiofile(audio_path)
28
- # return audio_path
29
-
30
- # def process_video_audio(video_path):
31
- # audio_path = convert_video_to_audio_moviepy(video_path)
32
-
33
- # wav, sr = torchaudio.load(audio_path)
34
-
35
- # train_visual = pt.zeros([1, 120, 120, 3, 10])
36
- # train_audio_wave = pt.zeros([1, 261540])
37
- # train_audio_cnn = pt.zeros([1, 150, 512, 1])
38
-
39
- # mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
40
-
41
- # face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
42
-
43
- # if len(wav[0]) > 261540:
44
- # print(wav.shape)
45
- # train_audio_wave[0, :] = wav[0][:261540]
46
- # else:
47
- # print(wav.shape)
48
- # train_audio_wave[0, :len(wav[0])] = wav[0][:]
49
- # train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
50
-
51
- # print(train_audio_cnn[0].shape)
52
-
53
- # cap = cv2.VideoCapture(video_path)
54
- # frame_idx = 0
55
- # last_frame = None
56
- # for i in range(100):
57
- # ret, frame = cap.read()
58
- # if ret and (i % 10 == 0):
59
- # gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
60
- # faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
61
- # if len(faces) > 0:
62
- # (x, y, w, h) = faces[0]
63
- # face = frame[y:y+h, x:x+w]
64
- # resized_face = cv2.resize(face, (120, 120))
65
- # train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_face)
66
- # else:
67
- # resized_frame = cv2.resize(frame, (120, 120))
68
- # train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_frame)
69
- # last_frame = frame
70
- # frame_idx += 1
71
- # cap.release()
72
-
73
- # train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
74
- # train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
75
- # train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
76
-
77
- # return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn
78
-
79
- # def predict_emotion(video_path):
80
- # last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
81
-
82
- # model = load_model("model_vui_ve.keras")
83
-
84
- # predictions = model.predict({
85
- # "input_visual": train_visual,
86
- # "input_audio_cnn": train_audio_cnn,
87
- # "input_audio_wave": train_audio_wave
88
- # })
89
-
90
- # predicted_label = np.argmax(predictions)
91
- # return last_frame, audio_path, predicted_label
92
-
93
- # # def predict_emotion_gradio(video_path):
94
- # # emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
95
- # # last_frame, audio_path, predicted_label = predict_emotion(video_path)
96
- # # predicted_emotion = emotion_dict[predicted_label]
97
- # # return last_frame, audio_path, predicted_emotion
98
-
99
- # # iface = gr.Interface(
100
- # # fn=predict_emotion_gradio,
101
- # # inputs=[
102
- # # gr.Video(label="Upload a video")
103
- # # ],
104
- # # outputs=[
105
- # # gr.Image(label="Last Frame"),
106
- # # gr.Audio(label = "Audio"),
107
- # # gr.Textbox(label="Predicted Emotion")
108
- # # ],
109
- # # title="Emotion Recognition from Video",
110
- # # description="Upload a video and get the predicted emotion."
111
- # # )
112
-
113
- # # iface.launch()
114
-
115
- # def run_chat_server(app):
116
- # """Runs a chat server using socket.IO"""
117
- # clients = []
118
- # messages = []
119
-
120
- # @app.route('/chat', methods=['GET', 'POST'])
121
- # def chat():
122
- # return app.socketio.send(messages)
123
-
124
- # @app.socketio.on('message')
125
- # def handle_message(message):
126
- # clients.append(message['client'])
127
- # messages.append(message)
128
- # app.logger.info(f'Received message: {message}')
129
- # app.socketio.emit('message', message, skip_sid=True)
130
-
131
- # @app.socketio.on('connect')
132
- # def handle_connect():
133
- # app.logger.info('Client connected')
134
-
135
- # @app.socketio.on('disconnect')
136
- # def handle_disconnect():
137
- # app.logger.info('Client disconnected')
138
-
139
- # if __name__ == '__main__':
140
- # app.run(debug=True)
141
-
142
- # def predict_emotion_with_chat(video_path):
143
- # last_frame, audio_path, predicted_label = predict_emotion(video_path)
144
- # predicted_emotion = emotion_dict[predicted_label]
145
-
146
- # # Connect to the chat server
147
- # client = sio.Client()
148
- # client.connect('http://localhost:5000/chat')
149
-
150
- # # Send the predicted emotion to the chat server
151
- # client.emit('message', {'client': 'Emotion Recognition', 'message'
152
- # : f'Predicted emotion: {predicted_emotion}'})
153
-
154
- # # Receive messages from the chat server
155
- # for msg in client.events:
156
- # print(msg)
157
-
158
- # return last_frame, audio_path, predicted_emotion, messages
159
-
160
- # iface = gr.Interface(
161
- # fn=predict_emotion_with_chat,
162
- # inputs=[
163
- # gr.Video(label="Upload a video")
164
- # ],
165
- # outputs=[
166
- # gr.Image(label="Last Frame"),
167
- # gr.Audio(label="Audio"),
168
- # gr.Textbox(label="Predicted Emotion"),
169
- # gr.Chatbot(label="Chat")
170
- # ],
171
- # title="Emotion Recognition with Chat",
172
- # description="Upload a video and get the predicted emotion. Chat with others in real-time."
173
- # )
174
-
175
- # # Start the Gradio interface and the chat server
176
- # from flask import Flask
177
- # app = Flask(__name__)
178
- # app.config['SECRET_KEY'] = 'secret'
179
- # app.socketio = sio.SocketIO(app)
180
- # run_chat_server(app)
181
- # iface.launch()
182
-
183
-
184
  import gradio as gr
185
  import torch as pt
186
  import torchaudio
@@ -193,15 +20,14 @@ from moviepy.editor import VideoFileClip
193
  from flask import Flask
194
  from flask_socketio import SocketIO, emit
195
 
196
- # Function to convert video to audio
197
  def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
 
198
  filename, ext = os.path.splitext(video_file)
199
  clip = VideoFileClip(video_file)
200
  audio_path = f"{filename}.{output_ext}"
201
  clip.audio.write_audiofile(audio_path)
202
  return audio_path
203
 
204
- # Process video and audio
205
  def process_video_audio(video_path):
206
  audio_path = convert_video_to_audio_moviepy(video_path)
207
 
@@ -216,11 +42,16 @@ def process_video_audio(video_path):
216
  face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
217
 
218
  if len(wav[0]) > 261540:
 
219
  train_audio_wave[0, :] = wav[0][:261540]
220
  else:
 
 
221
  train_audio_wave[0, :len(wav[0])] = wav[0][:]
222
  train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
223
 
 
 
224
  cap = cv2.VideoCapture(video_path)
225
  frame_idx = 0
226
  last_frame = None
@@ -247,7 +78,6 @@ def process_video_audio(video_path):
247
 
248
  return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn
249
 
250
- # Predict emotion from video
251
  def predict_emotion(video_path):
252
  last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
253
 
@@ -262,6 +92,29 @@ def predict_emotion(video_path):
262
  predicted_label = np.argmax(predictions)
263
  return last_frame, audio_path, predicted_label
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  # Integrate chat functionality with emotion prediction
266
  def predict_emotion_with_chat(video_path):
267
  emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
 
8
  # # from tensorflow.keras.models import load_model
9
  # # from moviepy.editor import VideoFileClip
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  import gradio as gr
12
  import torch as pt
13
  import torchaudio
 
20
  from flask import Flask
21
  from flask_socketio import SocketIO, emit
22
 
 
23
  def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
24
+ """Converts video to audio using MoviePy library that uses `ffmpeg` under the hood"""
25
  filename, ext = os.path.splitext(video_file)
26
  clip = VideoFileClip(video_file)
27
  audio_path = f"{filename}.{output_ext}"
28
  clip.audio.write_audiofile(audio_path)
29
  return audio_path
30
 
 
31
  def process_video_audio(video_path):
32
  audio_path = convert_video_to_audio_moviepy(video_path)
33
 
 
42
  face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
43
 
44
  if len(wav[0]) > 261540:
45
+ print(wav.shape)
46
  train_audio_wave[0, :] = wav[0][:261540]
47
  else:
48
+
49
+ print(wav.shape)
50
  train_audio_wave[0, :len(wav[0])] = wav[0][:]
51
  train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
52
 
53
+ print(train_audio_cnn[0].shape)
54
+
55
  cap = cv2.VideoCapture(video_path)
56
  frame_idx = 0
57
  last_frame = None
 
78
 
79
  return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn
80
 
 
81
  def predict_emotion(video_path):
82
  last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
83
 
 
92
  predicted_label = np.argmax(predictions)
93
  return last_frame, audio_path, predicted_label
94
 
95
+ # def predict_emotion_gradio(video_path):
96
+ # emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
97
+ # last_frame, audio_path, predicted_label = predict_emotion(video_path)
98
+ # predicted_emotion = emotion_dict[predicted_label]
99
+ # return last_frame, audio_path, predicted_emotion
100
+
101
+ # iface = gr.Interface(
102
+ # fn=predict_emotion_gradio,
103
+ # inputs=[
104
+ # gr.Video(label="Upload a video")
105
+ # ],
106
+ # outputs=[
107
+ # gr.Image(label="Last Frame"),
108
+ # gr.Audio(label = "Audio"),
109
+ # gr.Textbox(label="Predicted Emotion")
110
+ # ],
111
+ # title="Emotion Recognition from Video",
112
+ # description="Upload a video and get the predicted emotion."
113
+ # )
114
+
115
+ # iface.launch()
116
+
117
+
118
  # Integrate chat functionality with emotion prediction
119
  def predict_emotion_with_chat(video_path):
120
  emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}