import os

import cv2
import gradio as gr
import numpy as np
import tensorflow as tf
import torch as pt
import torchaudio
from moviepy.editor import VideoFileClip  # MoviePy 1.x; in 2.x this moved to `from moviepy import VideoFileClip`
from tensorflow.keras.models import load_model

# Fixed input sizes expected by the trained model.
WAVE_LEN = 261540  # raw waveform length (= 20 * 13077 samples)
N_FRAMES = 10      # face frames sampled from the video
EMOTION_DICT = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad',
                4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}

# Load the model once at start-up instead of on every prediction.
model = load_model("model_vui_ve2392.keras")


def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
    """Converts video to audio using the MoviePy library, which uses `ffmpeg` under the hood."""
    filename, _ = os.path.splitext(video_file)
    audio_path = f"{filename}.{output_ext}"
    clip = VideoFileClip(video_file)
    clip.audio.write_audiofile(audio_path)
    clip.close()
    return audio_path


def process_video_audio(video_path):
    """Build the three model inputs (visual frames, raw waveform, MFCC) from one video."""
    audio_path = convert_video_to_audio_moviepy(video_path)
    wav, sr = torchaudio.load(audio_path)

    train_visual = pt.zeros([1, 120, 120, 3, N_FRAMES])
    train_audio_wave = pt.zeros([1, WAVE_LEN])
    train_audio_cnn = pt.zeros([1, 150, 512, 1])

    # With n_fft=1022 the default hop length is 511, so WAVE_LEN samples
    # yield exactly 512 MFCC frames, matching the (150, 512) CNN input.
    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    # Truncate or zero-pad the first audio channel to the fixed waveform length.
    if len(wav[0]) > WAVE_LEN:
        train_audio_wave[0, :] = wav[0][:WAVE_LEN]
    else:
        train_audio_wave[0, :len(wav[0])] = wav[0]

    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])

    # Sample every 10th frame from the first 100 frames; crop the first
    # detected face (falling back to the full frame) and resize to 120x120.
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    for i in range(100):
        ret, frame = cap.read()
        if ret and i % 10 == 0:
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                x, y, w, h = faces[0]
                crop = frame[y:y + h, x:x + w]
            else:
                crop = frame
            train_visual[0, :, :, :, frame_idx] = pt.tensor(cv2.resize(crop, (120, 120)))
            last_frame = frame
            frame_idx += 1
    cap.release()

    if last_frame is not None:
        # OpenCV frames are BGR; Gradio's Image component expects RGB.
        last_frame = cv2.cvtColor(last_frame, cv2.COLOR_BGR2RGB)

    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn


def predict_emotion(video_path):
    last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave,
    })
    predicted_label = int(np.argmax(predictions))
    return last_frame, audio_path, predicted_label


def predict_emotion_gradio(video_path):
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    return last_frame, audio_path, EMOTION_DICT[predicted_label]


iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[gr.Video(label="Upload a video")],
    outputs=[
        gr.Image(label="Last Frame"),
        gr.Audio(label="Audio"),
        gr.Textbox(label="Predicted Emotion"),
    ],
    title="Emotion recognition with multi-modal neural network",
    description="Upload a video and get the predicted emotion.",
)

if __name__ == "__main__":
    iface.launch()
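
# Optional sanity check (a sketch, not part of the app): the dict keys passed
# to model.predict must match the model's input layer names exactly. Listing
# them once at start-up catches a naming or shape mismatch early:
#
#     for tensor in model.inputs:
#         print(tensor.name, tensor.shape)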
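
# Minimal smoke test (a sketch): run the full pipeline on one local file
# before launching the UI. "sample_video.mp4" is a hypothetical path; replace
# it with any short clip that contains a face and an audio track.
#
#     frame, audio, emotion = predict_emotion_gradio("sample_video.mp4")
#     print(emotion)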