import pickle

import streamlit as st
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Show the page header and the "please wait" notice *before* the slow
# model/artifact loading below, so the user sees feedback immediately.
st.header("Image Captioner")
st.markdown("Building the framework may take upto a minute. Please be patient. Thank you!")

# Load precomputed image features and the training captions.
# NOTE(review): pickle.load is only safe because these are trusted local
# artifacts produced by the training pipeline — never feed it untrusted data.
# Using context managers so the file handles are closed deterministically
# (the original `pickle.load(open(...))` leaked them).
with open("features.pkl", "rb") as fh:
    features = pickle.load(fh)
with open("all_captions.pkl", "rb") as fh:
    all_captions = pickle.load(fh)

# Rebuild the tokenizer from the captions so its vocabulary matches the
# one the model was trained with.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
# Longest caption (in words) — used as the decoding/padding length.
max_length = max(len(caption.split()) for caption in all_captions)

model = keras.models.load_model("best_model.h5")
def idx_to_word(integer, tokenizer):
    """Return the vocabulary word mapped to *integer*, or None if unknown.

    Uses the tokenizer's reverse lookup table (``index_word``), which Keras'
    ``Tokenizer`` maintains alongside ``word_index``, instead of linearly
    scanning ``word_index`` on every call — this runs once per decoded
    token, so O(1) lookup matters.

    Parameters
    ----------
    integer : int
        Predicted word index (e.g. from ``np.argmax`` over model output).
    tokenizer : keras Tokenizer
        Fitted tokenizer with an ``index_word`` mapping.
    """
    return tokenizer.index_word.get(integer)
| |
| import numpy as np |
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for a precomputed image feature vector.

    Starting from the 'startseq' sentinel, repeatedly asks the model for
    the most probable next word until 'endseq' is produced, an index
    cannot be mapped back to a word, or max_length words have been
    generated. Returns the raw caption, sentinels included (the caller
    strips them).
    """
    caption = 'startseq'
    for _ in range(max_length):
        # Encode the partial caption and pad to the model's input length.
        encoded = tokenizer.texts_to_sequences([caption])[0]
        encoded = pad_sequences([encoded], max_length)
        # Score the next word given the image feature and the prefix.
        probs = model.predict([image, encoded], verbose=0)
        next_word = idx_to_word(np.argmax(probs), tokenizer)
        if next_word is None:
            # Predicted index is outside the known vocabulary — stop.
            break
        caption = caption + " " + next_word
        if next_word == 'endseq':
            break
    return caption
from io import BytesIO

from gtts import gTTS
from PIL import Image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import img_to_array

# In-memory buffer that will receive the gTTS-generated speech audio.
sound_file = BytesIO()

# Build the feature extractor: take the full VGG16 network and expose the
# output of its second-to-last layer instead of the classification head.
_full_vgg = VGG16()
vgg_model = Model(inputs=_full_vgg.inputs, outputs=_full_vgg.layers[-2].output)
uploaded_image = st.file_uploader("Upload image to be captioned", type=["jpg", "png", "jpeg", "webp"])
image_path = "bushman.jpeg"  # NOTE(review): appears unused in the visible code — confirm before removing
if uploaded_image is not None:
    display_image = Image.open(uploaded_image)
    st.image(display_image)
    if st.button("Caption"):
        st.text("Please be patient...")
        # Force 3-channel RGB before resizing: PNG/WebP uploads may be
        # RGBA or grayscale, which would break VGG16's (224, 224, 3)
        # input expectation downstream.
        model_input = display_image.convert("RGB").resize((224, 224))
        image = img_to_array(model_input)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        # Extract the image feature vector, then greedily decode a caption.
        feature = vgg_model.predict(image, verbose=0)
        final = predict_caption(model, feature, tokenizer, max_length)
        # Strip the start/end sentinel tokens explicitly. The original
        # `split()[1:-1]` slicing unconditionally dropped the last word,
        # losing a real word whenever decoding stopped at max_length
        # without emitting 'endseq'.
        words = final.split()
        if words and words[0] == "startseq":
            words = words[1:]
        if words and words[-1] == "endseq":
            words = words[:-1]
        final_output = " ".join(words)
        st.text("Output:")
        st.markdown(final_output)
        if final_output:
            # gTTS raises on empty text, so only synthesize a real caption.
            tts = gTTS(final_output, lang='en')
            tts.write_to_fp(sound_file)
            sound_file.seek(0)  # rewind so playback starts from the beginning
            st.audio(sound_file)