import os
import re
import pickle
import numpy as np
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
# load vgg16 model
model = VGG16()
# restructure the model to output the 4096-d fc2 features
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# load the precomputed image features
with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

# load the captions file, skipping the header line
with open('captions.txt', 'r') as f:
    next(f)
    captions_doc = f.read()
# create mapping of image to captions
mapping = {}
# process lines
for line in captions_doc.split('\n'):
    # split the line by comma(,)
    tokens = line.split(',')
    # skip empty or malformed lines (no comma means no caption)
    if len(tokens) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # remove extension from image ID
    image_id = image_id.split('.')[0]
    # convert caption list back to string (captions may themselves contain commas)
    caption = " ".join(caption)
    # create list if needed
    if image_id not in mapping:
        mapping[image_id] = []
    # store the caption
    mapping[image_id].append(caption)
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # take one caption at a time
            caption = captions[i]
            # preprocessing steps
            # convert to lowercase
            caption = caption.lower()
            # delete digits, special chars, etc.
            # (str.replace does not take a regex, so use re.sub)
            caption = re.sub('[^a-z ]', '', caption)
            # collapse additional spaces
            caption = re.sub(r'\s+', ' ', caption)
            # add start and end tags, dropping single-character tokens
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word) > 1]) + ' endseq'
            captions[i] = caption

clean(mapping)
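# hedged sanity check (not in the original): inspect one image's captions
# after cleaning; next(iter(...)) just grabs an arbitrary key
sample_key = next(iter(mapping))
print(sample_key, mapping[sample_key][:2])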
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
# get maximum length of the caption available
max_length = max(len(caption.split()) for caption in all_captions)
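# the data generator below takes a list of image IDs; a minimal split sketch,
# assuming a 90/10 train/test split (the ratio is an assumption, not from the
# original, which loads pre-trained weights further down)
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.90)
train_keys = image_ids[:split]
test_keys = image_ids[split:]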
# create data generator to get data in batches (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    X1, X2, y = list(), list(), list()
    n = 0
    while True:
        # loop over images
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pairs
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output word
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield [X1, X2], y
                X1, X2, y = list(), list(), list()
                n = 0
# encoder model
# image feature layers
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)
# decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
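# optional training sketch, an assumption not in the original: the script
# loads pre-trained weights just below, so training is skipped by default;
# the epoch count and batch size here are illustrative values only
TRAIN_FROM_SCRATCH = False
if TRAIN_FROM_SCRATCH:
    epochs, batch_size = 20, 32
    steps = len(train_keys) // batch_size
    for _ in range(epochs):
        # recreate the generator each epoch so iteration restarts at the first key
        generator = data_generator(train_keys, mapping, features, tokenizer,
                                   max_length, vocab_size, batch_size)
        model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('best_model.h5')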
# load the pre-trained weights (use tensorflow.keras to match the imports above
# rather than mixing in the standalone keras package)
from tensorflow.keras.models import load_model
model = load_model("best_model.h5")
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
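# idx_to_word scans the whole vocabulary on every call; a hedged alternative
# (equivalent behavior, not in the original) builds the reverse index once:
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
# then index_to_word.get(integer) replaces the linear scan during generation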
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # get index with highest probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        # append word as input for generating the next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break
    return in_text
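# hedged evaluation sketch (not in the original): corpus BLEU on the held-out
# split; assumes nltk is installed and test_keys comes from the split above
from nltk.translate.bleu_score import corpus_bleu

actual, predicted = list(), list()
for key in test_keys:
    # tokenized reference captions for this image
    references = [caption.split() for caption in mapping[key]]
    # caption generated from the precomputed feature (shape (1, 4096))
    yhat = predict_caption(model, features[key], tokenizer, max_length)
    actual.append(references)
    predicted.append(yhat.split())
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))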
# feature extractor for new images
vgg_model = VGG16()
# restructure the model
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)
def generate_caption(image_path):
    # load image
    image = load_img(image_path, target_size=(224, 224))
    # convert image pixels to numpy array
    image = img_to_array(image)
    # reshape data for the model: add a batch dimension
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    feature = vgg_model.predict(image, verbose=0)
    # predict from the trained model; the slice strips the leading
    # 'startseq ' (9 chars) and trailing ' endseq' (7 chars) tags
    return predict_caption(model, feature, tokenizer, max_length)[9:-7]
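# example usage; 'sample.jpg' is a hypothetical placeholder path
print(generate_caption('sample.jpg'))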