# captionGPT/imageCaptionGPT.py
import pickle, os, re, gc
from transformers import AutoTokenizer, VisionEncoderDecoderModel, ViTImageProcessor
from keras.models import Model, load_model
from keras.applications.inception_v3 import InceptionV3, preprocess_input
import numpy as np
from keras.utils import pad_sequences
from PIL import Image
cwd = os.getcwd()
############################### Custom Model #############################
embedding_dim = 300        # word-embedding size used by the custom model
count = 0
max_caption_length = 80    # upper bound on generated caption length (custom model)
models = os.path.join(cwd, "model")       # custom-model artifacts
hf_model = os.path.join(cwd, "hf_model")  # Hugging Face cache directory
# Vocabulary lookups for the custom model (word -> index, index -> word).
with open(os.path.join(models, "word_index_Mapping.pkl"), "rb") as f:
    word_index_Mapping = pickle.load(f)
with open(os.path.join(models, "index_word_Mapping.pkl"), "rb") as f:
    index_word_Mapping = pickle.load(f)
vocab_size = len(word_index_Mapping) + 1
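# The pickled mappings are assumed to be inverse dicts, e.g.
# word_index_Mapping["dog"] == 7 and index_word_Mapping[7] == "dog";
# vocab_size adds 1 because index 0 is reserved for Keras padding.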
# InceptionV3 with its ImageNet classifier head removed: the penultimate
# layer output serves as a 2048-d image feature vector for the custom model.
incpmodel = InceptionV3(weights="imagenet")
inceptionModel = Model(incpmodel.input, incpmodel.layers[-2].output)
model_weights_save_path = os.path.join(models, "model.h5")
# predictionModel = load_model(model_weights_save_path)
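# Sketch: how the InceptionV3 bottleneck above encodes a PIL image into the
# 2048-d feature vector the custom decoder consumes. encodeImage is a helper
# added for illustration, not part of the original pipeline.
def encodeImage(img):
    img = img.resize((299, 299), Image.LANCZOS)
    arr = preprocess_input(np.expand_dims(np.asarray(img, dtype="float32"), axis=0))
    return inceptionModel.predict(arr, verbose=0)  # shape: (1, 2048)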
############################### Hugging Face Model #################################
# os.environ['TRANSFORMERS_CACHE'] = hf_model
# os.environ['HF_HOME'] = hf_model
# os.environ['HF_DATASETS_CACHE'] = hf_model
# os.environ['XDG_CACHE_HOME'] = hf_model
device = "cpu"
# The same checkpoint supplies the image processor, tokenizer, and model.
encoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
decoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
feature_extractor = ViTImageProcessor.from_pretrained(encoder_checkpoint, cache_dir=hf_model)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint, cache_dir=hf_model)
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint, cache_dir=hf_model)
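# Defensive, assumed-standard step (not strictly needed while device is
# "cpu"): keep the model's placement in sync with the pixel tensors moved
# to `device` in predictHFModel below, and switch to inference mode.
model.to(device)
model.eval()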
# Greedy word-by-word decoding with the custom Keras model; disabled along
# with the predictionModel load above.
# def predictCustomModel(img):
# img = img.resize((299, 299), Image.LANCZOS)
# img = np.expand_dims(img, axis=0)
# img = preprocess_input(img)
# vectorImg = inceptionModel.predict(img)
# in_text = "startSeq"
# for i in range(1, max_caption_length):
# seq = [
# word_index_Mapping[w] for w in in_text.split() if w in word_index_Mapping
# ]
# in_seq = pad_sequences([seq], maxlen=max_caption_length)
# inputs = [vectorImg, in_seq]
# yhat = predictionModel.predict(x=inputs, verbose=0)
# yhat = np.argmax(yhat)
# word = index_word_Mapping[yhat]
# in_text += " " + word
# if word == "endSeq":
# break
# final = in_text.split()
# final = final[1:-1]
# final = " ".join(final)
# predict = re.sub(r"\b(\w+)( \1\b)+", r"\1", final)  # collapse repeated words
# del img
# del vectorImg
# del final
# del in_text
# del seq
# del inputs
# gc.collect()
# return predict
def predictHFModel(image, max_length=224, num_beams=4):
    # Caption an image with the ViT-GPT2 model using beam search.
    image = image.convert("RGB")
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    clean_text = lambda x: x.replace("<|endoftext|>", "").split("\n")[0]
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    caption_text = clean_text(tokenizer.decode(caption_ids))
    # Collapse immediately repeated words, e.g. "a a dog" -> "a dog".
    prediction = re.sub(r"\b(\w+)( \1\b)+", r"\1", caption_text.strip())
    del pixel_values
    del clean_text
    del caption_ids
    del caption_text
    gc.collect()
    return prediction
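# Minimal usage sketch; "sample.jpg" is a hypothetical local test image.
if __name__ == "__main__":
    with Image.open("sample.jpg") as img:
        print(predictHFModel(img))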