import os
import re
import pickle
import numpy as np
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
# load vgg16 model
model = VGG16()
# restructure the model to output the 4096-d fc2 features
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# load the precomputed image features
with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

# load the captions file, skipping the header line
with open('captions.txt', 'r') as f:
    next(f)
    captions_doc = f.read()
# create mapping of image to captions
mapping = {}
# process lines
for line in captions_doc.split('\n'):
    # split the line by comma(,)
    tokens = line.split(',')
    # skip empty or malformed lines (no comma means no caption)
    if len(tokens) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # remove extension from image ID
    image_id = image_id.split('.')[0]
    # convert caption list back to string (captions may themselves contain commas)
    caption = " ".join(caption)
    # create list if needed
    if image_id not in mapping:
        mapping[image_id] = []
    # store the caption
    mapping[image_id].append(caption)
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # take one caption at a time
            caption = captions[i]
            # preprocessing steps
            # convert to lowercase
            caption = caption.lower()
            # delete digits, special chars, etc.
            # (str.replace does not take a regex, so use re.sub)
            caption = re.sub('[^a-z ]', '', caption)
            # collapse additional spaces
            caption = re.sub(r'\s+', ' ', caption)
            # add start and end tags, dropping single-character tokens
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word) > 1]) + ' endseq'
            captions[i] = caption

clean(mapping)
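# hedged sanity check (not in the original): inspect one image's captions
# after cleaning; next(iter(...)) just grabs an arbitrary key
sample_key = next(iter(mapping))
print(sample_key, mapping[sample_key][:2])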
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
# get maximum length of the caption available
max_length = max(len(caption.split()) for caption in all_captions)
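# the data generator below takes a list of image IDs; a minimal split sketch,
# assuming a 90/10 train/test split (the ratio is an assumption, not from the
# original, which loads pre-trained weights further down)
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.90)
train_keys = image_ids[:split]
test_keys = image_ids[split:]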
# create data generator to get data in batches (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    X1, X2, y = list(), list(), list()
    n = 0
    while True:
        # loop over images
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pairs
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output word
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield [X1, X2], y
                X1, X2, y = list(), list(), list()
                n = 0
# encoder model
# image feature layers
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)
# decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
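# optional training sketch, an assumption not in the original: the script
# loads pre-trained weights just below, so training is skipped by default;
# the epoch count and batch size here are illustrative values only
TRAIN_FROM_SCRATCH = False
if TRAIN_FROM_SCRATCH:
    epochs, batch_size = 20, 32
    steps = len(train_keys) // batch_size
    for _ in range(epochs):
        # recreate the generator each epoch so iteration restarts at the first key
        generator = data_generator(train_keys, mapping, features, tokenizer,
                                   max_length, vocab_size, batch_size)
        model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('best_model.h5')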
# load the pre-trained weights (use tensorflow.keras to match the imports above
# rather than mixing in the standalone keras package)
from tensorflow.keras.models import load_model
model = load_model("best_model.h5")
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
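# idx_to_word scans the whole vocabulary on every call; a hedged alternative
# (equivalent behavior, not in the original) builds the reverse index once:
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
# then index_to_word.get(integer) replaces the linear scan during generation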
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # get index with highest probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        # append word as input for generating the next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break
    return in_text
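# hedged evaluation sketch (not in the original): corpus BLEU on the held-out
# split; assumes nltk is installed and test_keys comes from the split above
from nltk.translate.bleu_score import corpus_bleu

actual, predicted = list(), list()
for key in test_keys:
    # tokenized reference captions for this image
    references = [caption.split() for caption in mapping[key]]
    # caption generated from the precomputed feature (shape (1, 4096))
    yhat = predict_caption(model, features[key], tokenizer, max_length)
    actual.append(references)
    predicted.append(yhat.split())
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))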
# feature extractor for new images
vgg_model = VGG16()
# restructure the model
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)
def generate_caption(image_path):
    # load image
    image = load_img(image_path, target_size=(224, 224))
    # convert image pixels to numpy array
    image = img_to_array(image)
    # reshape data for the model: add a batch dimension
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    feature = vgg_model.predict(image, verbose=0)
    # predict from the trained model; the slice strips the leading
    # 'startseq ' (9 chars) and trailing ' endseq' (7 chars) tags
    return predict_caption(model, feature, tokenizer, max_length)[9:-7]
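# example usage; 'sample.jpg' is a hypothetical placeholder path
print(generate_caption('sample.jpg'))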