import numpy as np
import librosa
from pydub import AudioSegment, effects
import noisereduce as nr
import tensorflow as tf
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

saved_model_path = r'./model8723.json'
saved_weights_path = r'./model8723_weights.h5'

# Reading the model from the JSON file
with open(saved_model_path, 'r') as json_file:
    json_savedModel = json_file.read()

# Loading the model architecture and weights
model = tf.keras.models.model_from_json(json_savedModel)
model.load_weights(saved_weights_path)

# Compiling the model with similar parameters as the original model.
model.compile(loss='categorical_crossentropy',
              optimizer='RMSProp',
              metrics=['categorical_accuracy'])
model.summary()


def convert(y, sr):
    # Convert float samples to int16 and wrap them in a mono AudioSegment.
    y = np.array(y * (1 << 15), dtype=np.int16)
    audio_segment = AudioSegment(
        y.tobytes(),
        frame_rate=sr,
        sample_width=y.dtype.itemsize,
        channels=1
    )
    return audio_segment


def preprocess(y, sr):
    '''
    Preprocess raw audio samples before executing a prediction.

    Arguments:
    - y  - Raw audio samples as a float array.
    - sr - Sample rate of the audio.

    Speech features are computed with a frame length of 2048 samples
    and a hop length of 512 samples.

    Return:
        'X_3D' variable, containing a shape of: (batch, timesteps, feature)
        for a single file (batch = 1).
    '''
    frame_length = 2048
    hop_length = 512

    # Wrap the raw samples in an AudioSegment.
    rawsound = convert(y, sr)

    # Normalize to 5 dB headroom.
    normalizedsound = effects.normalize(rawsound, headroom=5.0)

    # Transform the normalized audio into an np.array of samples.
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype='float32')

    final_x = nr.reduce_noise(normal_x, sr=sr)  # updated 03/03/22

    # Feature extraction
    f1 = librosa.feature.rms(y=final_x, frame_length=frame_length,
                             hop_length=hop_length, center=True,
                             pad_mode='reflect').T  # Energy - Root Mean Square
    f2 = librosa.feature.zero_crossing_rate(final_x, frame_length=frame_length,
                                            hop_length=hop_length,
                                            center=True).T  # ZCR
    f3 = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13,
                              hop_length=hop_length).T  # MFCC

    X = np.concatenate((f1, f2, f3), axis=1)

    # Pad (or truncate) the array to exactly 448 rows of 15 features.
    padding_rows = 448 - len(X)
    if padding_rows < 0:
        X = X[:448, :15]
    if padding_rows > 0:
        X = np.vstack((X, np.zeros((padding_rows, 15))))

    X_3D = np.expand_dims(X, axis=0)
    return X_3D


emotions = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
    6: 'disgust',
    7: 'surprised'
}
emo_list = list(emotions.values())


def is_silent(data):
    # Returns 'True' if the peak amplitude is below the 'silent' threshold.
    # (Currently unused; could guard against dividing by zero on silence.)
    return max(data) < 100


# Streaming parameters.
RATE = 24414
CHUNK = 512
RECORD_SECONDS = 7.1
CHANNELS = 1
WAVE_OUTPUT_FILE = "./output.wav"
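# --- Hedged sanity check (not part of the original script) ---------------
# preprocess() yields 15 features per frame (1 RMS + 1 ZCR + 13 MFCCs),
# padded or truncated to 448 frames, with a leading batch axis, so the
# model always sees a (1, 448, 15) tensor. Call this helper manually to
# verify the pipeline on a synthetic tone.
def _preprocess_smoke_test():
    t = np.linspace(0, 1.0, RATE, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s, 440 Hz
    x = preprocess(y=tone, sr=RATE)
    assert x.shape == (1, 448, 15), x.shape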
def EmotionRecogniser(stream, new_chunk):
    # Ideally, predict only once the stream reaches RECORD_SECONDS (7.1 s)
    # of audio; until then, do not update the prediction yet.
    sr, y = new_chunk
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    # SESSION START
    print("** session started")
    total_predictions = []  # A list for all predictions in the session.

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    # if len(stream) < int(RATE * RECORD_SECONDS):
    #     return stream, 'neutral'

    x = preprocess(y=stream, sr=sr)  # Buffered-audio preprocessing.
    print('x shape:', x.shape)

    # Model's prediction => an 8-emotion probabilities array.
    predictions = model.predict(x, use_multiprocessing=True)
    pred_np = np.squeeze(predictions, axis=0)  # Drop the batch axis.
    total_predictions.append(pred_np)

    # Dict of emotions with their respective probabilities.
    emotions_prob = {emo: float(p) for emo, p in zip(emo_list, pred_np)}
    max_emo = np.argmax(predictions)
    print('max emotion:', emotions.get(max_emo, -1))

    # Drop the oldest len(y) samples so the buffer acts as a sliding window.
    stream = stream[len(y):]
    return stream, emotions_prob

    # Present the emotion distribution for the whole session.
    # total_predictions_np = np.mean(np.array(total_predictions).tolist(), axis=0)
    # fig = plt.figure(figsize=(10, 5))
    # plt.bar(emo_list, total_predictions_np, color='indigo')
    # plt.ylabel("Mean probability (%)")
    # plt.title("Session Summary")
    # plt.show()
    # print(f"Emotions analyzed for: {(toc - tic):0.4f} seconds")
    # return str(emotions.get(np.argmax(total_predictions_np), -1))


##################################################

import gradio as gr

# Streaming transcription example from the Gradio docs, kept for reference:
# from transformers import pipeline
# transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
#
# def transcribe(stream, new_chunk):
#     sr, y = new_chunk
#     y = y.astype(np.float32)
#     y /= np.max(np.abs(y))
#     if stream is not None:
#         stream = np.concatenate([stream, y])
#     else:
#         stream = y
#     return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]

demo = gr.Interface(
    EmotionRecogniser,
    ["state", gr.Audio(sources=["microphone"], streaming=True, every=1.0)],
    ["state", 'label'],
    live=True,
)

demo.launch()
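# --- Hedged usage sketch (not part of the original script) ---------------
# gr.Interface streams microphone audio into EmotionRecogniser as
# (state, (sample_rate, int16_samples)). One call can be simulated offline
# like this (run before demo.launch(), or in a separate session):
#
#   chunk = (RATE, (np.random.randn(RATE) * 0.1 * (1 << 15)).astype(np.int16))
#   state, probs = EmotionRecogniser(None, chunk)
#   print(max(probs, key=probs.get))  # most likely emotion for the chunk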