import os
import pickle
import re

import numpy as np
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# Load VGG16 and drop its final classification layer so the model outputs
# the 4096-dim fc2 feature vector instead of ImageNet class scores.
model = VGG16()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Pre-extracted image features — presumably {image_id: feature array}
# produced by an earlier extraction pass; TODO confirm against the writer.
with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

# Captions file: first line is a header, remaining lines are "image.jpg,caption".
with open('captions.txt', 'r') as f:
    next(f)  # skip the header row
    captions_doc = f.read()

# Build a mapping of image_id -> list of captions.
mapping = {}
for line in captions_doc.split('\n'):
    tokens = line.split(',')
    # BUG FIX: the original tested len(line) < 2, which let comma-less
    # lines through and produced empty captions; we need at least an
    # image id token and one caption token.
    if len(tokens) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # remove the file extension from the image ID
    image_id = image_id.split('.')[0]
    # the caption itself may contain commas — rejoin the remaining tokens
    caption = " ".join(caption)
    mapping.setdefault(image_id, []).append(caption)


def clean(mapping):
    """Normalize every caption in *mapping* in place.

    Steps: lowercase; remove non-letter characters; collapse runs of
    whitespace; drop single-character words; wrap with the 'startseq'
    and 'endseq' tags used by the decoder.
    """
    for key, captions in mapping.items():
        for i in range(len(captions)):
            caption = captions[i].lower()
            # BUG FIX: str.replace() does not interpret regular
            # expressions, so the original '[^A-Za-z]' / '\s+' calls were
            # silent no-ops. Use re.sub for the intended cleaning.
            caption = re.sub(r'[^a-z ]', '', caption)
            caption = re.sub(r'\s+', ' ', caption)
            # keep only words longer than one character, add start/end tags
            caption = 'startseq ' + " ".join(
                word for word in caption.split() if len(word) > 1
            ) + ' endseq'
            captions[i] = caption


clean(mapping)

# Flatten all captions into one list for tokenizer fitting.
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

# Fit the tokenizer on the full caption corpus. +1 because Keras word
# indices start at 1 (index 0 is reserved for padding).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
vocab_size = len(tokenizer.word_index) + 1

# Longest caption in words — fixes the padded sequence length everywhere.
max_length = max(len(caption.split()) for caption in all_captions)


def data_generator(data_keys, mapping, features, tokenizer, max_length,
                   vocab_size, batch_size):
    """Yield training batches of ([image_features, padded_prefix], one-hot next word).

    Streams samples instead of materializing every (image, caption-prefix)
    pair at once, which avoids exhausting memory / crashing the session.
    NOTE: batch_size counts IMAGES per batch, not individual (X, y) samples.
    """
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            captions = mapping[key]
            for caption in captions:
                # encode the caption into a sequence of token ids
                seq = tokenizer.texts_to_sequences([caption])[0]
                # one sample per prefix: predict token i from tokens [:i]
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    # left-pad to a fixed length (must match inference padding)
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # one-hot encode the target word
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield [X1, X2], y
                X1, X2, y = list(), list(), list()
                n = 0


# --- caption model: image branch + text branch merged by element-wise add ---
# image feature layers (VGG16 fc2 output is 4096-dim)
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers (mask_zero=True so padding is ignored by the LSTM)
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)
# decoder: merge the two 256-dim representations and predict the next word
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Replace the freshly-built model with pretrained weights from disk.
# BUG FIX: import load_model from tensorflow.keras, not the standalone
# `keras` package — mixing the two can fail to deserialize the model.
from tensorflow.keras.models import load_model
model = load_model("best_model.h5")


def idx_to_word(integer, tokenizer):
    """Return the word for a token index, or None if the index is unknown."""
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for a pre-extracted image feature vector.

    Starts from the 'startseq' tag and repeatedly appends the most
    probable next word until 'endseq' is produced, an unknown index is
    predicted, or max_length words have been generated. Returns the
    full text including the start/end tags.
    """
    in_text = 'startseq'
    for _ in range(max_length):
        # encode and left-pad the partial caption (same padding as training)
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # next-word distribution; take the argmax (greedy decoding)
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx_to_word(yhat, tokenizer)
        # stop if the predicted index maps to no known word
        if word is None:
            break
        in_text += " " + word
        if word == 'endseq':
            break
    return in_text


# Feature extractor for inference: VGG16 truncated at the fc2 layer,
# matching the features the caption model was trained on.
vgg_model = VGG16()
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)


def generate_caption(image_path):
    """Load an image from disk and return its generated caption, tags stripped."""
    image = load_img(image_path, target_size=(224, 224))
    image = img_to_array(image)
    # add the batch dimension expected by VGG16
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = vgg_model.predict(image, verbose=0)
    caption = predict_caption(model, feature, tokenizer, max_length)
    # BUG FIX: the original hard-coded slice [9:-7] chopped real words off
    # whenever decoding ended without producing 'endseq'. Strip the tags
    # only when they are actually present.
    if caption.startswith('startseq'):
        caption = caption[len('startseq'):]
    if caption.endswith('endseq'):
        caption = caption[:-len('endseq')]
    return caption.strip()