import streamlit as st
import cv2
from keras.models import load_model
import numpy as np
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Flatten,Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector,Concatenate
from keras.models import Sequential, Model
from keras.utils import np_utils
from keras.utils import load_img
from keras.preprocessing import image, sequence
import cv2
from keras_preprocessing.sequence import pad_sequences
from tqdm import tqdm
from gtts import gTTS
import os
from pathlib import Path


vocab = np.load('./vocab.npy', allow_pickle=True)

vocab = vocab.item()

inv_vocab = {v:k for k,v in vocab.items()}


print("+"*50)
print("vocabulary loaded")

embedding_size = 128
vocab_size = len(vocab)
max_len = 40


image_model = Sequential()

image_model.add(Dense(embedding_size, input_shape=(2048,), activation='relu'))
image_model.add(RepeatVector(max_len))


language_model = Sequential()

language_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_len))
language_model.add(LSTM(256, return_sequences=True))
language_model.add(TimeDistributed(Dense(embedding_size)))


conca = Concatenate()([image_model.output, language_model.output])
x = LSTM(128, return_sequences=True)(conca)
x = LSTM(512, return_sequences=False)(x)
x = Dense(vocab_size)(x)
out = Activation('softmax')(x)
model = Model(inputs=[image_model.input, language_model.input], outputs = out)

model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

model.load_weights('./mine_model_weights.h5')

print("="*150)
print("MODEL LOADED")

#resnet = ResNet50(include_top=False,weights='imagenet',input_shape=(224,224,3),pooling='avg')
resnet = load_model('./resnet.h5')

print("="*150)
print("RESNET MODEL LOADED")


st.header("Image Caption Generator📸")
st.subheader("Image")
image_file = st.file_uploader("Upload Images",type=["png","jpg","jpeg"])
save_folder = './static'
save_path = Path(save_folder,"file.jpg")
if image_file is not None:
	with open(save_path, mode='wb') as w:
	    w.write(image_file.getvalue())
if image_file is not None:
    # TO See details
    file_details = {"filename":image_file.name, "filetype":image_file.type,"filesize":image_file.size}
	#st.write(file_details)
    st.image(load_img(image_file), width=250)
    if st.button('predict'):
        image = cv2.imread('static/file.jpg')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (224,224))
        image = np.reshape(image, (1,224,224,3))
        incept = resnet.predict(image).reshape(1,2048)
        print("="*50)
        print("Predict Features")

        text_in = ['startofseq']

        final = ''

        print("="*50)
        print("GETING Captions")

        count = 0
        while tqdm(count < 20):

            count += 1

            encoded = []
            for i in text_in:
                encoded.append(vocab[i])

            padded = pad_sequences([encoded], maxlen=max_len, padding='post', truncating='post').reshape(1,max_len)

            sampled_index = np.argmax(model.predict([incept, padded]))

            sampled_word = inv_vocab[sampled_index]

            if sampled_word != 'endofseq' and sampled_word != '.' :
                final = final + ' ' + sampled_word

            text_in.append(sampled_word)
        st.subheader(final)
        language = 'en'
        myobj = gTTS(text=final, lang=language, slow=False)
        myobj.save("static/caption.mp3")
        audio_file = open('./static/caption.mp3', 'rb')
        audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/mp3')