|
|
|
|
|
"""img-captioning-project.ipynb |
|
|
|
|
|
Automatically generated by Colaboratory. |
|
|
|
|
|
Original file is located at |
|
|
https://colab.research.google.com/drive/1wZSpAsjNfRWhMdsclYJT6o2B-YBZc1yH |
|
|
""" |
|
|
|
|
|
from os import listdir |
|
|
from numpy import array |
|
|
from keras.models import Model |
|
|
from pickle import dump |
|
|
from keras.applications.vgg16 import VGG16 |
|
|
from tensorflow.keras.preprocessing.image import load_img |
|
|
from tensorflow.keras.preprocessing.image import img_to_array |
|
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
from keras.preprocessing.text import Tokenizer |
|
|
from keras.utils import to_categorical |
|
|
from keras.utils import plot_model |
|
|
from keras.models import Model |
|
|
from keras.layers import Input |
|
|
from keras.layers import Dense |
|
|
from keras.layers import LSTM |
|
|
from keras.layers import Embedding |
|
|
from keras.layers import Dropout |
|
|
from tensorflow.keras.layers import Add |
|
|
from keras.callbacks import ModelCheckpoint |
|
|
|
|
|
from keras.applications.vgg16 import VGG16, preprocess_input |
|
|
# Build the VGG16 feature extractor used to encode photos: take the
# pretrained ImageNet network and re-wrap it so its output is the layer
# just before the final classification layer.
model = VGG16()

# NOTE(review): under tf.keras, `model.layers` returns a fresh list, so this
# pop() is likely a no-op; the Model(...) re-wrap below is what actually
# selects the feature layer. Kept for parity with the original environment.
model.layers.pop()

model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Fix: summary() prints its own report and returns None, so wrapping it in
# print() emitted a stray "None" line.
model.summary()
|
|
|
|
|
from os import listdir |
|
|
from pickle import dump |
|
|
from tensorflow.keras.preprocessing.image import img_to_array, load_img |
|
|
from keras.models import Model |
|
|
|
|
|
|
|
|
def extract_features(directory):
    """Encode every image in *directory* with the module-level VGG16 model.

    Returns a dict mapping image id (filename without extension) to the
    predicted feature vector. Relies on the globals `model`,
    `preprocess_input`, `load_img`, `img_to_array` and `listdir`.
    """
    features = dict()
    for name in listdir(directory):
        path = directory + '/' + name
        # Load and resize to VGG16's expected 224x224 input.
        img = load_img(path, target_size=(224, 224))
        pixels = img_to_array(img)
        # Add the leading batch dimension expected by predict().
        pixels = pixels.reshape((1,) + pixels.shape)
        pixels = preprocess_input(pixels)
        # Key features by the filename stem (drops ".jpg").
        features[name.split('.')[0]] = model.predict(pixels, verbose=0)
        print('>%s' % name)
    return features
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# IPython/Colab line magic: list the files in the current working directory.
!ls
|
|
|
|
|
import string |
|
|
from nltk.tokenize import word_tokenize |
|
|
|
|
|
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Parameters: filename -- path to the file to read.
    Returns: the file's full text.
    """
    # Fix: a with-block guarantees the handle is closed even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
|
|
|
|
|
def load_descriptions(doc):
    """Parse raw Flickr8k token-file text into {image_id: [captions]}.

    Each input line looks like "<image>.jpg#<n> <caption words...>".
    Lines that do not carry both an id and at least one caption word
    are skipped.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Fix: guard on the token count (was `len(line) < 2`), so an
        # id-only line cannot slip through and record an empty caption.
        if len(tokens) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        # Drop the ".jpg#n" suffix to get the bare photo id.
        image_id = image_id.split('.')[0]
        image_desc = ' '.join(image_desc)
        if image_id not in mapping:
            mapping[image_id] = list()
        mapping[image_id].append(image_desc)
    return mapping
|
|
|
|
|
"""## Preprocessing of Text |
|
|
|
|
|
1. Convert all words to lowercase. |
|
|
2. Remove all punctuation. |
|
|
3. Remove all words that are one character or less in length (e.g. ‘a’). |
|
|
4. Remove all words with numbers in them. |
|
|
""" |
|
|
|
|
|
def clean_descriptions(descriptions):
    """Normalise every caption in place.

    For each caption: lowercase all words, strip punctuation, drop
    one-character words, and drop any token that is not purely alphabetic
    (e.g. words containing digits).
    """
    punct_table = str.maketrans('', '', string.punctuation)
    for desc_list in descriptions.values():
        for i, caption in enumerate(desc_list):
            words = [w.lower().translate(punct_table) for w in caption.split()]
            kept = [w for w in words if len(w) > 1 and w.isalpha()]
            desc_list[i] = ' '.join(kept)
|
|
def to_vocabulary(descriptions):
    """Return the set of all distinct words used across every caption.

    Parameters: descriptions -- dict mapping image id to a list of captions.
    Returns: set of unique whitespace-separated words.
    """
    all_desc = set()
    # Fix: plain loop instead of a list comprehension run only for its
    # side effect (the comprehension built and discarded a throwaway list).
    for desc_list in descriptions.values():
        for desc in desc_list:
            all_desc.update(desc.split())
    return all_desc
|
|
|
|
|
def save_descriptions(descriptions, filename):
    """Write captions to *filename*, one "<image_id> <caption>" line each.

    Parameters:
        descriptions -- dict mapping image id to a list of captions.
        filename -- destination path (overwritten).
    """
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + " " + desc)
    # Fix: with-block closes the file even if the write fails.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))
|
|
|
|
|
# Download the NLTK 'punkt' tokenizer models (needed by word_tokenize).
import nltk

nltk.download('punkt')

# Parse the raw Flickr8k caption file into {image_id: [captions]}.
filename = "/content/drive/MyDrive/Image_Captioning_Project/Flickr8k.token.txt"

doc = load_doc(filename)

descriptions = load_descriptions(doc)

print("Loaded: %d" %len(descriptions))

# Normalise the captions in place, then build the vocabulary set.
clean_descriptions(descriptions)

vocab = to_vocabulary(descriptions)

print("Vocab size: %d" %len(vocab))
|
|
|
|
|
|
|
|
|
|
|
"""### Developing Deep Learning Model |
|
|
|
|
|
#### This section is divided into the following parts: |
|
|
|
|
|
Loading Data. |
|
|
Defining the Model. |
|
|
Fitting the Model. |
|
|
""" |
|
|
|
|
|
from pickle import dump |
|
|
|
|
|
|
|
|
def load_doc(filename):
    """Return the full text of *filename* as a single string."""
    # Fix: use a context manager so the handle is always closed, even if
    # read() raises.
    with open(filename, 'r') as file:
        return file.read()
|
|
|
|
|
|
|
|
def load_set(filename):
    """Read a Flickr image-list file and return the set of photo ids.

    Each non-empty line is a filename; the id is the part before the
    first '.' (e.g. "12345.jpg" -> "12345").
    """
    identifiers = set()
    for line in load_doc(filename).split("\n"):
        if not line:
            continue
        identifiers.add(line.split('.')[0])
    return identifiers
|
|
|
|
|
"""load_clean_descriptions() that loads the cleaned text descriptions from ‘descriptions.txt‘ for a given set of identifiers and returns a dictionary of identifiers to lists of text descriptions. |
|
|
|
|
|
The model we will develop will generate a caption given a photo, and the caption will be generated one word at a time. The sequence of previously generated words will be provided as input. Therefore, we will need a ‘first word’ to kick-off the generation process and a ‘last word‘ to signal the end of the caption. |
|
|
|
|
|
We will use the strings ‘startseq‘ and ‘endseq‘ for this purpose. |
|
|
""" |
|
|
|
|
|
def load_photo_features(features, dataset):
    """Load the pickled {image_id: feature} dict, filtered to *dataset* ids.

    Parameters:
        features -- path to the pickle file (parameter name kept for
                    backward compatibility with existing callers).
        dataset -- iterable of image ids to keep.
    """
    # Fix: the original `load(open(...))` leaked the file handle; a
    # with-block guarantees it is closed.
    with open(features, 'rb') as fh:
        all_features = load(fh)
    return {k: all_features[k] for k in dataset}
|
|
|
|
|
def load_clean_descriptions(filename, dataset):
    """Load cleaned captions for the ids in *dataset*.

    Each caption is wrapped in the 'startseq'/'endseq' sentinel tokens the
    decoder uses to start and stop generation.
    Returns: dict mapping image id -> list of wrapped captions.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Fix: a trailing blank line yields no tokens and would crash on
        # tokens[0]; skip empty lines.
        if not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = list()
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions
|
|
|
|
|
from pickle import load

# Ids of the training images.
filename = '/content/drive/MyDrive/Image_Captioning_Project/Flickr_8k.trainImages.txt'

train = load_set(filename)

print('Dataset: %d' % len(train))

# Cleaned captions (wrapped in startseq/endseq) for the training ids.
train_descriptions = load_clean_descriptions('/content/drive/MyDrive/Image_Captioning_Project/descriptions1.txt', train)

print('Descriptions: train=%d' % len(train_descriptions))

# Pre-extracted VGG16 feature vectors, filtered to the training ids.
train_features = load_photo_features('/content/drive/MyDrive/Image_Captioning_Project/features.pkl', train)

print('Photos: train=%d' % len(train_features))
|
|
|
|
|
def load_doc(filename):
    """Read *filename* and return its full text.

    Bug fix: the original said `file.close` (attribute access, never
    called), so the handle was never closed. A with-block now guarantees
    closure.
    """
    with open(filename, 'r') as file:
        return file.read()
|
|
|
|
|
def load_set(filename):
    """Return the set of photo identifiers listed in *filename*.

    Skips empty lines; the identifier is the filename stem before the
    first '.'.
    """
    entries = [ln for ln in load_doc(filename).split("\n") if ln]
    return {ln.split('.')[0] for ln in entries}
|
|
|
|
|
def load_clean_descriptions(filename, dataset):
    """Load cleaned captions for ids in *dataset*, adding sentinel tokens.

    Each caption becomes 'startseq <words> endseq' so the decoder has
    explicit begin/end markers.
    Returns: dict mapping image id -> list of wrapped captions.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Fix: an empty (e.g. trailing) line has no tokens and would raise
        # IndexError on tokens[0]; skip it.
        if not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = list()
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions
|
|
|
|
|
def load_photo_features(filename, dataset):
    """Load pickled photo features and keep only the ids in *dataset*.

    Parameters:
        filename -- path to the features pickle file.
        dataset -- iterable of image ids to keep.
    Returns: dict mapping image id -> feature vector.
    """
    # Fix: close the pickle file deterministically instead of leaking the
    # handle from `load(open(...))`.
    with open(filename, 'rb') as fh:
        all_features = load(fh)
    return {k: all_features[k] for k in dataset}
|
|
|
|
|
|
|
|
def to_lines(descriptions):
    """Flatten {image_id: [captions]} into a single list of captions."""
    all_desc = list()
    # Fix: use extend() instead of a list comprehension executed purely
    # for its append() side effect.
    for key in descriptions.keys():
        all_desc.extend(descriptions[key])
    return all_desc
|
|
|
|
|
def create_tokenizer(descriptions):
    """Fit a Keras Tokenizer on every caption in *descriptions*."""
    tok = Tokenizer()
    tok.fit_on_texts(to_lines(descriptions))
    return tok
|
|
|
|
|
|
|
|
def max_length(description):
    """Return the word count of the longest caption in *description*."""
    return max(len(caption.split()) for caption in to_lines(description))
|
|
|
|
|
|
|
|
def create_sequences(tokenizer, max_length, desc_list, photo):
    """Build (photo, partial caption, next word) training triples.

    For each caption, every prefix of length i predicts word i+1; prefixes
    are left-padded to *max_length* and the target word is one-hot encoded.
    NOTE(review): reads the module-level global `vocab_size` — confirm it
    is set before this is called.
    Returns three aligned numpy arrays (X1 photos, X2 sequences, y targets).
    """
    X1, X2, y = list(), list(), list()
    for desc in desc_list:
        encoded = tokenizer.texts_to_sequences([desc])[0]
        for split in range(1, len(encoded)):
            prefix = pad_sequences([encoded[:split]], maxlen=max_length)[0]
            target = to_categorical([encoded[split]], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(prefix)
            y.append(target)
    return array(X1), array(X2), array(y)
|
|
|
|
|
"""## Model building""" |
|
|
|
|
|
from tensorflow.keras.layers import add |
|
|
def define_model(vocab_size, max_length):
    """Build the merge-style captioning model (CNN features + LSTM text).

    Two input branches — a dense projection of the photo feature vector
    and an LSTM over the embedded partial caption — are summed and decoded
    into a softmax over the vocabulary.
    """
    # Photo-feature branch.
    # NOTE(review): expects 1000-dim photo features; confirm the vectors in
    # features.pkl really have length 1000 given the VGG16 layer used above.
    inputs1 = Input(shape=(1000,))
    fe = Dense(256, activation='relu')(Dropout(0.5)(inputs1))

    # Caption branch: embed, regularise, then summarise with an LSTM.
    inputs2 = Input(shape=(max_length,))
    embedded = Embedding(vocab_size, output_dim=256, mask_zero=True)(inputs2)
    se = LSTM(256)(Dropout(0.5)(embedded))

    # Merge branches and decode to a word distribution.
    merged = add([fe, se])
    hidden = Dense(256, activation='relu')(merged)
    outputs = Dense(vocab_size, activation='softmax')(hidden)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print(model.summary())
    return model
|
|
|
|
|
|
|
|
def data_generator(descriptions, photos, tokenizer, max_length):
    """Endlessly yield one photo's worth of training samples at a time.

    Cycles over the description dict forever (as model.fit's generator
    protocol expects), emitting [[photo_batch, seq_batch], target_batch].
    """
    while True:
        for key, captions in descriptions.items():
            feature = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, captions, feature)
            yield [[in_img, in_seq], out_word]
|
|
|
|
|
|
|
|
# ---- Training-data setup: splits, captions, features, tokenizer ----
import tensorflow as tf

# Ids of the training images.
filename = "/content/drive/MyDrive/Image_Captioning_Project/Flickr_8k.trainImages.txt"

train = load_set(filename)

print("Dataset: %d" %len(train))

# Cleaned captions (with startseq/endseq) for the training ids.
train_descriptions = load_clean_descriptions("/content/drive/MyDrive/Image_Captioning_Project/descriptions1.txt", train)

print("train_descriptions= %d" %len(train_descriptions))

# Pre-extracted VGG16 photo features for the training ids.
train_feature = load_photo_features("/content/drive/MyDrive/Image_Captioning_Project/features.pkl", train)

print("photos: train= %d" %len(train_feature))

tokenizer = create_tokenizer(train_descriptions)

# +1 so index 0 stays reserved (Keras tokenizer word indices start at 1).
vocab_size = len(tokenizer.word_index)+1

print("Vocab size: %d" %vocab_size)

# NOTE(review): this rebinds the name `max_length` from the helper function
# to an int, so max_length() cannot be called again after this line.
max_length = max_length(train_descriptions)

print('Description Length: %d' % max_length)
|
|
|
|
|
import pickle

# Persist the fitted tokenizer so inference can rebuild its word<->id maps.
with open('tokenizer1.pkl', 'wb') as f:

    pickle.dump(tokenizer, f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_doc(filename):
    """Return the whole contents of *filename* as one string."""
    # Fix: context manager ensures the handle closes even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
|
|
|
|
|
|
|
|
def load_set(filename):
    """Collect the photo ids (filename stem before the first '.') listed
    one per line in *filename*, skipping blank lines."""
    ids = set()
    for entry in load_doc(filename).split('\n'):
        if entry:
            ids.add(entry.split('.')[0])
    return ids
|
|
|
|
|
def load_photo_features(filename, dataset):
    """Load the pickled feature dict and restrict it to *dataset* ids."""
    # Fix: with-block closes the pickle file instead of leaking the handle.
    with open(filename, 'rb') as fh:
        all_features = load(fh)
    return {k: all_features[k] for k in dataset}
|
|
|
|
|
|
|
|
def to_lines(descriptions):
    """Return all captions from {image_id: [captions]} as one flat list."""
    all_desc = list()
    # Fix: replace the side-effect-only list comprehension with extend().
    for key in descriptions.keys():
        all_desc.extend(descriptions[key])
    return all_desc
|
|
|
|
|
|
|
|
def create_tokenizer(descriptions):
    """Create a Keras Tokenizer fitted on all captions in *descriptions*."""
    captions = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions)
    return tokenizer
|
|
|
|
|
|
|
|
def max_length(descriptions):
    """Length (in words) of the longest caption across all images."""
    lengths = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(lengths)
|
|
|
|
|
|
|
|
def word_for_id(integer, tokenizer):
    """Map a tokenizer integer id back to its word, or None if unknown."""
    return next(
        (word for word, index in tokenizer.word_index.items() if index == integer),
        None,
    )
|
|
|
|
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
import numpy as np |
|
|
def generate_desc(model, tokenizer, photo, max_length):
    """Greedily decode a caption for *photo*, one word per step.

    Starts from the 'startseq' sentinel, feeds the growing caption back
    into the model, and stops at 'endseq', an unknown word id, or after
    *max_length* steps. Returns the caption including the sentinels.
    """
    caption = 'startseq'
    for _ in range(max_length):
        seq = tokenizer.texts_to_sequences([caption])[0]
        seq = pad_sequences([seq], maxlen=max_length)
        probs = model.predict([photo, seq], verbose=0)
        # Greedy choice: most probable next word.
        word = word_for_id(np.argmax(probs), tokenizer)
        if word is None:
            break
        caption += ' ' + word
        if word == 'endseq':
            break
    return caption
|
|
|
|
|
|
|
|
from nltk.translate.bleu_score import corpus_bleu |
|
|
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    """Caption every photo in *descriptions* and report BLEU-1..4 scores.

    The reference captions for each photo are its ground-truth descriptions;
    the candidate is the model's greedy decode.
    """
    actual, predicted = [], []
    for key, desc_list in descriptions.items():
        candidate = generate_desc(model, tokenizer, photos[key], max_length)
        actual.append([ref.split() for ref in desc_list])
        predicted.append(candidate.split())
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
|
|
|
|
|
|
|
|
# ---- Re-load training data to rebuild the tokenizer for evaluation ----
import tensorflow as tf

filename = "/content/drive/MyDrive/Image_Captioning_Project/Flickr_8k.trainImages.txt"

train = load_set(filename)

print("Dataset: %d" %len(train))

# NOTE(review): this cell reads descriptions.txt while the training cell
# used descriptions1.txt — confirm the two files are consistent.
train_descriptions = load_clean_descriptions("/content/drive/MyDrive/Image_Captioning_Project/descriptions.txt", train)

print("train_descriptions= %d" %len(train_descriptions))

train_feature = load_photo_features("/content/drive/MyDrive/Image_Captioning_Project/features.pkl", train)

print("photos: train= %d" %len(train_feature))

tokenizer = create_tokenizer(train_descriptions)

# +1 keeps index 0 free (Keras tokenizer indices start at 1).
vocab_size = len(tokenizer.word_index)+1

print("Vocab size: %d" %vocab_size)

# NOTE(review): rebinds `max_length` from the helper function to an int;
# the helper cannot be called again afterwards.
max_length = max_length(train_descriptions)

print('Description Length: %d' % max_length)
|
|
|
|
|
# ---- Test split + trained model for evaluation ----
filename = "/content/drive/MyDrive/Image_Captioning_Project/Flickr_8k.testImages.txt"

test = load_set(filename)

print("Dataset: %d" %len(test))

# Ground-truth captions for the test ids.
test_description = load_clean_descriptions("/content/drive/MyDrive/Image_Captioning_Project/descriptions.txt", test)

print("Description= %d" %len(test_description))

# Pre-extracted photo features for the test ids.
test_features = load_photo_features("/content/drive/MyDrive/Image_Captioning_Project/features.pkl", test)

print("photos: test=%d" % len(test_features))

# Load the trained checkpoint (presumably saved after epoch 18 — confirm).
from keras.models import load_model

filename = "/content/drive/MyDrive/Image_Captioning_Project/model_18.h5"

model = load_model(filename)
|
|
|
|
|
|
|
|
|
|
|
from pickle import load |
|
|
from numpy import argmax |
|
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
from keras.applications.vgg16 import VGG16 |
|
|
from tensorflow.keras.preprocessing.image import load_img |
|
|
from tensorflow.keras.preprocessing.image import img_to_array |
|
|
from keras.applications.vgg16 import preprocess_input |
|
|
from keras.models import Model |
|
|
from keras.models import load_model |
|
|
|
|
|
|
|
|
def extract_features(filename):
    """Compute a VGG16 feature vector for a single image file.

    Rebuilds the VGG16 extractor on every call (costly but self-contained),
    then runs the standard load / resize / preprocess / predict pipeline.
    """
    vgg = VGG16()
    # NOTE(review): pop() on model.layers is likely a no-op under tf.keras;
    # the Model(...) re-wrap below selects the output layer. Kept as-is.
    vgg.layers.pop()
    vgg = Model(inputs=vgg.inputs, outputs=vgg.layers[-2].output)

    img = load_img(filename, target_size=(224, 224))
    pixels = img_to_array(img)
    # Prepend the batch dimension expected by predict().
    pixels = pixels.reshape((1,) + pixels.shape)
    pixels = preprocess_input(pixels)
    return vgg.predict(pixels, verbose=0)
|
|
|
|
|
from pickle import load

from tensorflow.keras.preprocessing.text import Tokenizer

# NOTE(review): this open() handle is never closed — consider a with-block.
tokenizer = load(open('/content/tokenizer1.pkl', 'rb'))

# Hard-coded caption length; confirm it matches the 'Description Length'
# printed by the training cell.
max_len = 34

model = load_model('/content/drive/MyDrive/Image_Captioning_Project/model_18.h5')

photo = extract_features("/content/drive/MyDrive/Image_Captioning_Project/Images/101654506_8eb26cfb60.jpg")

# HACK: resets the unpickled tokenizer's analyzer — presumably a
# pickle/TF-version compatibility workaround; verify it is still needed.
tokenizer.analyzer = None

description = generate_desc(model, tokenizer, photo, max_len)

print(description)
|
|
|
|
|
# Strip the startseq/endseq sentinel tokens from the generated caption
# before displaying it.
query = description

stopwords = ['startseq','endseq']

querywords = query.split()

resultwords = [word for word in querywords if word.lower() not in stopwords]

result = ' '.join(resultwords)

print(result)
|
|
|
|
|
|