"""Streamlit app that captions an uploaded image and reads the caption aloud.

Pipeline, as implemented below:
1. Load the word<->index vocabulary from ``./vocab.npy``.
2. Rebuild the captioning network (image branch + language branch) and load
   its weights from ``./mine_model_weights.h5``.
3. Load the feature extractor from ``./resnet.h5``.
4. Let the user upload an image, store it under ``./static`` and, when the
   ``predict`` button is pressed, greedily decode a caption and synthesise it
   to ``./static/caption.mp3`` with gTTS.
"""
import os
from pathlib import Path

import cv2
import numpy as np
import streamlit as st
from gtts import gTTS
from keras.layers import Dense, Flatten, Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector, Concatenate
from keras.models import load_model
from keras.models import Sequential, Model
from keras.preprocessing import image, sequence
from keras.utils import np_utils
from keras.utils import load_img
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm

# NOTE(review): Streamlit re-executes this script on every interaction, so the
# vocabulary and both models below are reloaded each time; consider caching.

# The .npy file stores a pickled dict (word -> index); .item() unwraps the
# 0-d object array that np.load returns for it.
vocab = np.load('./vocab.npy', allow_pickle=True)
vocab = vocab.item()
inv_vocab = {index: word for word, index in vocab.items()}

print("+" * 50)
print("vocabulary loaded")

embedding_size = 128
vocab_size = len(vocab)
max_len = 40
# Upper bound on decoding steps for one caption.
MAX_CAPTION_WORDS = 20

# Image branch: 2048-d feature vector -> embedding repeated for every time step.
image_model = Sequential()
image_model.add(Dense(embedding_size, input_shape=(2048,), activation='relu'))
image_model.add(RepeatVector(max_len))

# Language branch: padded token ids -> per-step embedding-sized vectors.
language_model = Sequential()
language_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_len))
language_model.add(LSTM(256, return_sequences=True))
language_model.add(TimeDistributed(Dense(embedding_size)))

# Merge both branches and predict a distribution over the next word.
conca = Concatenate()([image_model.output, language_model.output])
x = LSTM(128, return_sequences=True)(conca)
x = LSTM(512, return_sequences=False)(x)
x = Dense(vocab_size)(x)
out = Activation('softmax')(x)
model = Model(inputs=[image_model.input, language_model.input], outputs=out)

model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
model.load_weights('./mine_model_weights.h5')

print("=" * 150)
print("MODEL LOADED")

resnet = load_model('./resnet.h5')

print("=" * 150)
print("RESNET MODEL LOADED")


def generate_caption(features, max_words=MAX_CAPTION_WORDS):
    """Greedily decode a caption for one image feature vector.

    Args:
        features: Array of shape (1, 2048) produced by ``resnet``.
        max_words: Maximum number of decoding steps.

    Returns:
        The caption as a single space-separated string; empty if the model
        emits ``endofseq`` (or only ``.``) straight away.
    """
    tokens = ['startofseq']
    words = []
    for _ in tqdm(range(max_words)):
        encoded = [vocab[token] for token in tokens]
        padded = pad_sequences(
            [encoded], maxlen=max_len, padding='post', truncating='post'
        ).reshape(1, max_len)
        sampled_index = np.argmax(model.predict([features, padded]))
        sampled_word = inv_vocab[sampled_index]
        # Stop at the end marker instead of feeding it back into the model.
        if sampled_word == 'endofseq':
            break
        # '.' is kept in the decoder context but left out of the caption text.
        if sampled_word != '.':
            words.append(sampled_word)
        tokens.append(sampled_word)
    return ' '.join(words)


st.header("Image Caption Generator📸")
st.subheader("Image")
image_file = st.file_uploader("Upload Images", type=["png", "jpg", "jpeg"])

save_folder = './static'
save_path = Path(save_folder, "file.jpg")
audio_path = Path(save_folder, "caption.mp3")

if image_file is not None:
    # Make sure the target directory exists before writing the upload.
    Path(save_folder).mkdir(parents=True, exist_ok=True)
    save_path.write_bytes(image_file.getvalue())

    st.image(load_img(image_file), width=250)

    if st.button('predict'):
        pixels = cv2.imread(str(save_path))
        if pixels is None:
            # cv2.imread signals an unreadable/undecodable file with None.
            st.error("The uploaded file could not be read as an image.")
        else:
            pixels = cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)
            pixels = cv2.resize(pixels, (224, 224))
            pixels = np.reshape(pixels, (1, 224, 224, 3))

            incept = resnet.predict(pixels).reshape(1, 2048)

            print("=" * 50)
            print("Predict Features")
            print("=" * 50)
            print("GETING Captions")

            final = generate_caption(incept)
            st.subheader(final)

            if final.strip():
                myobj = gTTS(text=final, lang='en', slow=False)
                myobj.save(str(audio_path))
                # read_bytes opens and closes the file for us.
                audio_bytes = audio_path.read_bytes()
                st.audio(audio_bytes, format='audio/mp3')
            else:
                # gTTS rejects empty text, so skip speech synthesis.
                st.warning("No caption could be generated for this image.")