Spaces:
Sleeping
Update app.py
app.py
CHANGED
```diff
@@ -3,13 +3,24 @@ import torch as pt
 import torchaudio
 import cv2
 import os
-
+import subprocess
 import numpy as np
 import tensorflow as tf
 from tensorflow.keras.models import load_model
 
-def process_video_audio(video_path, audio_path):
-
+def convert_video_to_audio_ffmpeg(video_file, output_ext="wav"):
+    """Converts video to audio directly using `ffmpeg` command with the help of subprocess module"""
+    filename, ext = os.path.splitext(video_file)
+    audio_file = f"{filename}.{output_ext}"
+    subprocess.call(["ffmpeg", "-y", "-i", video_file, audio_file],
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.STDOUT)
+    return audio_file
+
+def process_video_audio(video_path):
+    audio_path = convert_video_to_audio_ffmpeg(video_path)
+
+    wav, sr = torchaudio.load(audio_path)
 
     train_visual = pt.zeros([1, 120, 120, 3, 10])
     train_audio_wave = pt.zeros([1, 261540])
@@ -19,12 +30,12 @@ def process_video_audio(video_path, audio_path):
 
     face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
 
-    if len(wav) > 261540:
+    if len(wav[0]) > 261540:
         print(wav.shape)
-        train_audio_wave[0, :] = wav[:261540]
+        train_audio_wave[0, :] = wav[0][:261540]
     else:
         print(wav.shape)
-        train_audio_wave[0, :len(wav)] = wav[:]
+        train_audio_wave[0, :len(wav[0])] = wav[0][:]
     train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
 
     print(train_audio_cnn[0].shape)
@@ -55,8 +66,8 @@ def process_video_audio(video_path, audio_path):
 
     return last_frame, train_visual, train_audio_wave, train_audio_cnn
 
-def predict_emotion(video_path, audio_path):
-    last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path, audio_path)
+def predict_emotion(video_path):
+    last_frame, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
 
     model = load_model("model_vui_ve.keras")
 
@@ -69,17 +80,16 @@ def predict_emotion(video_path, audio_path):
     predicted_label = np.argmax(predictions)
     return last_frame, predicted_label
 
-def predict_emotion_gradio(video_path, audio_path):
+def predict_emotion_gradio(video_path):
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful'}
-    last_frame, predicted_label = predict_emotion(video_path, audio_path)
+    last_frame, predicted_label = predict_emotion(video_path)
     predicted_emotion = emotion_dict[predicted_label]
     return last_frame, predicted_emotion
 
 iface = gr.Interface(
     fn=predict_emotion_gradio,
     inputs=[
-        gr.Video(label="Upload a video"),
-        gr.Audio(label="Upload a audio")
+        gr.Video(label="Upload a video")
     ],
     outputs=[
         gr.Image(label="Last Frame"),
```
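For reference, here is a minimal sketch of the audio path this commit introduces: the new `convert_video_to_audio_ffmpeg` helper extracts the soundtrack with ffmpeg, and `torchaudio.load` returns a `[channels, frames]` tensor, which is why the indexing changes from `wav` to `wav[0]`. This is illustrative only: the `clip.mp4` filename is a placeholder, ffmpeg is assumed to be on PATH, and the diff's if/else truncate-or-pad logic is condensed here into a single `min()`-based copy.

```python
# Sketch of the commit's new audio flow. Assumes ffmpeg is installed and on PATH;
# "clip.mp4" is a placeholder filename, not part of the original code.
import os
import subprocess

import torch as pt
import torchaudio


def convert_video_to_audio_ffmpeg(video_file, output_ext="wav"):
    """Extract the audio track of a video into a sibling .wav file via ffmpeg."""
    filename, _ext = os.path.splitext(video_file)
    audio_file = f"{filename}.{output_ext}"
    subprocess.call(["ffmpeg", "-y", "-i", video_file, audio_file],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
    return audio_file


audio_path = convert_video_to_audio_ffmpeg("clip.mp4")  # hypothetical input video
wav, sr = torchaudio.load(audio_path)                   # wav shape: [channels, frames]

# Fit the first channel into a fixed 261540-sample buffer:
# truncate if longer, leave trailing zeros if shorter (same effect as the diff's if/else).
train_audio_wave = pt.zeros([1, 261540])
mono = wav[0]
n = min(len(mono), 261540)
train_audio_wave[0, :n] = mono[:n]
```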