THP2903 committed on
Commit
836cfdf
·
verified ·
1 Parent(s): 9355adb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -12
app.py CHANGED
@@ -3,13 +3,24 @@ import torch as pt
3
  import torchaudio
4
  import cv2
5
  import os
6
-
7
  import numpy as np
8
  import tensorflow as tf
9
  from tensorflow.keras.models import load_model
10
 
11
- def process_video_audio(video_path, audio_path):
12
- wav = pt.tensor(list(audio_path[1]))
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  train_visual = pt.zeros([1, 120, 120, 3, 10])
15
  train_audio_wave = pt.zeros([1, 261540])
@@ -19,12 +30,12 @@ def process_video_audio(video_path, audio_path):
19
 
20
  face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
21
 
22
- if len(wav) > 261540:
23
  print(wav.shape)
24
- train_audio_wave[0, :] = wav[:261540]
25
  else:
26
  print(wav.shape)
27
- train_audio_wave[0, :len(wav)] = wav[:]
28
  train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
29
 
30
  print(train_audio_cnn[0].shape)
@@ -55,8 +66,8 @@ def process_video_audio(video_path, audio_path):
55
 
56
  return last_frame, train_visual, train_audio_wave, train_audio_cnn
57
 
58
- def predict_emotion(video_path, audio_path):
59
- last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
60
 
61
  model = load_model("model_vui_ve.keras")
62
 
@@ -69,17 +80,16 @@ def predict_emotion(video_path, audio_path):
69
  predicted_label = np.argmax(predictions)
70
  return last_frame, predicted_label
71
 
72
- def predict_emotion_gradio(video_path, audio_path):
73
  emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
74
- last_frame, predicted_label = predict_emotion(video_path, audio_path)
75
  predicted_emotion = emotion_dict[predicted_label]
76
  return last_frame, predicted_emotion
77
 
78
  iface = gr.Interface(
79
  fn=predict_emotion_gradio,
80
  inputs=[
81
- gr.Video(label="Upload a video"),
82
- gr.Audio(label="Upload a audio")
83
  ],
84
  outputs=[
85
  gr.Image(label="Last Frame"),
 
3
  import torchaudio
4
  import cv2
5
  import os
6
+ import subprocess
7
  import numpy as np
8
  import tensorflow as tf
9
  from tensorflow.keras.models import load_model
10
 
11
def convert_video_to_audio_ffmpeg(video_file, output_ext="wav"):
    """Extract the audio track of *video_file* into a sibling file via ffmpeg.

    Invokes the external ``ffmpeg`` binary through :mod:`subprocess`, with all
    console output suppressed. The audio file is written next to the input,
    keeping the same base name and using *output_ext* as the extension.

    Args:
        video_file: Path to the input video file.
        output_ext: Extension (and implied container) of the audio output;
            defaults to ``"wav"``.

    Returns:
        str: Path of the generated audio file.
    """
    filename, ext = os.path.splitext(video_file)
    # Build the output path from the input's base name. The previous code had
    # a garbled placeholder ("(unknown)") here, which left `filename` unused
    # and wrote every conversion to the same literal path.
    audio_file = f"{filename}.{output_ext}"
    # -y: overwrite an existing output file without prompting, so repeated
    # uploads of the same video do not hang on ffmpeg's interactive prompt.
    subprocess.call(
        ["ffmpeg", "-y", "-i", video_file, audio_file],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
    return audio_file
19
+
20
+ def process_video_audio(video_path):
21
+ audio_path = convert_video_to_audio_ffmpeg(video_path)
22
+
23
+ wav, sr = torchaudio.load(audio_path)
24
 
25
  train_visual = pt.zeros([1, 120, 120, 3, 10])
26
  train_audio_wave = pt.zeros([1, 261540])
 
30
 
31
  face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
32
 
33
+ if len(wav[0]) > 261540:
34
  print(wav.shape)
35
+ train_audio_wave[0, :] = wav[0][:261540]
36
  else:
37
  print(wav.shape)
38
+ train_audio_wave[0, :len(wav[0])] = wav[0][:]
39
  train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
40
 
41
  print(train_audio_cnn[0].shape)
 
66
 
67
  return last_frame, train_visual, train_audio_wave, train_audio_cnn
68
 
69
+ def predict_emotion(video_path):
70
+ last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
71
 
72
  model = load_model("model_vui_ve.keras")
73
 
 
80
  predicted_label = np.argmax(predictions)
81
  return last_frame, predicted_label
82
 
83
def predict_emotion_gradio(video_path):
    """Gradio entry point: classify the emotion shown in *video_path*.

    Runs the full prediction pipeline and converts the numeric class index
    into a human-readable emotion name.

    Returns:
        tuple: (last extracted video frame, emotion label string).
    """
    # Maps the model's class index to the label shown in the UI.
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
    frame, label_idx = predict_emotion(video_path)
    return frame, emotion_dict[label_idx]
88
 
89
  iface = gr.Interface(
90
  fn=predict_emotion_gradio,
91
  inputs=[
92
+ gr.Video(label="Upload a video")
 
93
  ],
94
  outputs=[
95
  gr.Image(label="Last Frame"),