Spaces:
Sleeping
Sleeping
Commit ·
51c45e2
0
Parent(s):
Duplicate from NicolasVana/image-captioning
Browse files
Co-authored-by: Nicolas Vana Santos <NicolasVana@users.noreply.huggingface.co>
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +19 -0
- Inception/PretrainedInceptionLSTM/Model/keras_metadata.pb +3 -0
- Inception/PretrainedInceptionLSTM/Model/saved_model.pb +3 -0
- Inception/PretrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 +3 -0
- Inception/PretrainedInceptionLSTM/Model/variables/variables.index +0 -0
- Inception/PretrainedInceptionLSTM/index2Word.npy +0 -0
- Inception/PretrainedInceptionLSTM/variable_params.npy +0 -0
- Inception/PretrainedInceptionLSTM/word2Index.npy +0 -0
- Inception/RetrainedInceptionFeatureExtraction/Model/keras_metadata.pb +3 -0
- Inception/RetrainedInceptionFeatureExtraction/Model/saved_model.pb +3 -0
- Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.data-00000-of-00001 +3 -0
- Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.index +0 -0
- Inception/RetrainedInceptionLSTM/Model/keras_metadata.pb +3 -0
- Inception/RetrainedInceptionLSTM/Model/saved_model.pb +3 -0
- Inception/RetrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 +3 -0
- Inception/RetrainedInceptionLSTM/Model/variables/variables.index +0 -0
- Inception/RetrainedInceptionLSTM/index2Word.npy +0 -0
- Inception/RetrainedInceptionLSTM/variable_params.npy +0 -0
- Inception/RetrainedInceptionLSTM/word2Index.npy +0 -0
- README.md +34 -0
- app.py +92 -0
- model.py +174 -0
- requirements.txt +10 -0
- samples/ROCO_00001.jpg +0 -0
- samples/ROCO_00006.jpg +0 -0
- samples/ROCO_00016.jpg +0 -0
- samples/ROCO_00025.jpg +0 -0
- samples/ROCO_00031.jpg +0 -0
- samples/ROCO_00036.jpg +0 -0
- samples/ROCO_00061.jpg +0 -0
- samples/ROCO_00084.jpg +0 -0
- samples/ROCO_00138.jpg +0 -0
- samples/ROCO_00153.jpg +0 -0
- samples/ROCO_00176.jpg +0 -0
- samples/ROCO_00185.jpg +0 -0
- samples/ROCO_00190.jpg +0 -0
- samples/ROCO_00206.jpg +0 -0
- samples/ROCO_00218.jpg +0 -0
- samples/ROCO_00251.jpg +0 -0
- samples/ROCO_00258.jpg +0 -0
- samples/ROCO_00261.jpg +0 -0
- samples/ROCO_00264.jpg +0 -0
- samples/ROCO_00271.jpg +0 -0
- samples/ROCO_00300.jpg +0 -0
- samples/ROCO_00302.jpg +0 -0
- samples/ROCO_00303.jpg +0 -0
- samples/ROCO_00307.jpg +0 -0
- samples/ROCO_00316.jpg +0 -0
- samples/ROCO_00319.jpg +0 -0
- samples/ROCO_00328.jpg +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
Inception/PretrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
Inception/RetrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
|
Inception/PretrainedInceptionLSTM/Model/keras_metadata.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90fe3518b5f0e26908c460bc876abaef2017a5252faea2854e19e6bbc80c1abb
|
| 3 |
+
size 19875
|
Inception/PretrainedInceptionLSTM/Model/saved_model.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ac9410ec5d75b446ba1913ce546556b276f4f7243c6b84692dfe71d04785eb1
|
| 3 |
+
size 2728089
|
Inception/PretrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:607eba2255866ff15c9be9dbc271e24c643b9c5650b5b36bd22c6f1ad461c443
|
| 3 |
+
size 23853510
|
Inception/PretrainedInceptionLSTM/Model/variables/variables.index
ADDED
|
Binary file (2.07 kB). View file
|
|
|
Inception/PretrainedInceptionLSTM/index2Word.npy
ADDED
|
Binary file (91.1 kB). View file
|
|
|
Inception/PretrainedInceptionLSTM/variable_params.npy
ADDED
|
Binary file (327 Bytes). View file
|
|
|
Inception/PretrainedInceptionLSTM/word2Index.npy
ADDED
|
Binary file (91.1 kB). View file
|
|
|
Inception/RetrainedInceptionFeatureExtraction/Model/keras_metadata.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b304413d09ac695dc11a96b0305ffb4e41f34f145b90a536ed4c929c11c7306
|
| 3 |
+
size 974015
|
Inception/RetrainedInceptionFeatureExtraction/Model/saved_model.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57d9834d47ee681be13d8ecf60b93770a30feb9d655dea12b78c0f0f7e1c845a
|
| 3 |
+
size 6312206
|
Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.data-00000-of-00001
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12cf113be83ae0bc7024191ae51b1e41c2c016d5543c3711e0bab928904eaeab
|
| 3 |
+
size 279976841
|
Inception/RetrainedInceptionFeatureExtraction/Model/variables/variables.index
ADDED
|
Binary file (50.2 kB). View file
|
|
|
Inception/RetrainedInceptionLSTM/Model/keras_metadata.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e29ab07736ef18245cac5040bf1dd2100d21e8084ed51db859064026a1a0fba4
|
| 3 |
+
size 19858
|
Inception/RetrainedInceptionLSTM/Model/saved_model.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e40821682b6a5e4b88848c9ec60bd8400cf2a37065137871f59112d77d027c65
|
| 3 |
+
size 2727709
|
Inception/RetrainedInceptionLSTM/Model/variables/variables.data-00000-of-00001
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ecad5c20713acfd90563bc562f048e9cc302936b162e2e196f37d38922a0dca
|
| 3 |
+
size 18577366
|
Inception/RetrainedInceptionLSTM/Model/variables/variables.index
ADDED
|
Binary file (2.07 kB). View file
|
|
|
Inception/RetrainedInceptionLSTM/index2Word.npy
ADDED
|
Binary file (91.1 kB). View file
|
|
|
Inception/RetrainedInceptionLSTM/variable_params.npy
ADDED
|
Binary file (327 Bytes). View file
|
|
|
Inception/RetrainedInceptionLSTM/word2Index.npy
ADDED
|
Binary file (91.1 kB). View file
|
|
|
README.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Medical Image Captioning
|
| 3 |
+
emoji: 🖼️
|
| 4 |
+
colorFrom: pink
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
app_file: app.py
|
| 8 |
+
pinned: false
|
| 9 |
+
duplicated_from: NicolasVana/image-captioning
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Configuration
|
| 13 |
+
|
| 14 |
+
`title`: _string_
|
| 15 |
+
Display title for the Space
|
| 16 |
+
|
| 17 |
+
`emoji`: _string_
|
| 18 |
+
Space emoji (emoji-only character allowed)
|
| 19 |
+
|
| 20 |
+
`colorFrom`: _string_
|
| 21 |
+
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
| 22 |
+
|
| 23 |
+
`colorTo`: _string_
|
| 24 |
+
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
| 25 |
+
|
| 26 |
+
`sdk`: _string_
|
| 27 |
+
Can be either `gradio` or `streamlit`
|
| 28 |
+
|
| 29 |
+
`app_file`: _string_
|
| 30 |
+
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
|
| 31 |
+
Path is relative to the root of the repository.
|
| 32 |
+
|
| 33 |
+
`pinned`: _boolean_
|
| 34 |
+
Whether the Space stays on top of your list.
|
app.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import requests
|
| 3 |
+
import io
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Streamlit front end: lets the user pick a bundled sample image, a random
# image id, or an uploaded file, then shows the caption generated for it.
st.title("Medical Image Captioning")

st.sidebar.markdown(
    """
This project features 3 different Medical image captioning models.
Two of the use the InceptionV3 architecture to do feature extraction and then generate the captions using an LSTM model.
The difference between these two is that the first one uses InceptionV3 trained on ImageNet data and outputs 2048 features.
The second one is based on a retrained version of InceptionV3 that uses the CUI data from the ROCO dataset to extract 745 features from the images.
The final model is transformer based on...
"""
)

with st.spinner('Loading objects ...'):
    # Imported here rather than at the top of the file so the spinner is shown
    # while the model module runs its import-time work. The star import also
    # provides `os`, `Image`, `sample_dir`, `sample_image_ids` and the helper
    # functions used below.
    from model import *

random_image_id = get_random_image_id()

st.sidebar.title("Select a sample image")
sample_image_id = st.sidebar.selectbox(
    "Please choose a sample image",
    sample_image_ids
)

st.sidebar.title("Select a model Type")
model_type = st.sidebar.selectbox(
    "Please choose a model",
    ['Pretrained Inception', 'Retrained Inception', 'Transformer']
)

# 'Transformer' is offered in the selector but model.py has no branch for it,
# so stop with a readable message instead of failing while unpacking None.
if model_type == 'Transformer':
    st.warning("The Transformer model is not available yet. Please choose another model.")
    st.stop()

inception, lstm = fetch_model(model_type)
word2Index, index2Word, variable_params = fetch_auxiliary_files(model_type)
max_len = variable_params['max_caption_len']

if st.sidebar.button("Random ROCO (test) images"):
    random_image_id = get_random_image_id()
    # Overrides whatever the sample selector currently holds.
    sample_image_id = "None"

bytes_data = None
with st.sidebar.form("file-uploader-form", clear_on_submit=True):
    uploaded_file = st.file_uploader("Choose a file")
    submitted = st.form_submit_button("Upload")
    if submitted and uploaded_file is not None:
        bytes_data = io.BytesIO(uploaded_file.getvalue())

if (bytes_data is None) and submitted:

    st.write("No file is selected to upload")

else:

    image_id = random_image_id
    if sample_image_id != "None":
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not isinstance(sample_image_id, int):
            raise TypeError(f"sample image id must be an int, got {sample_image_id!r}")
        image_id = sample_image_id

    sample_name = f"ROCO_{str(image_id).zfill(5)}.jpg"
    sample_path = os.path.join(sample_dir, sample_name)

    if bytes_data is not None:
        image = Image.open(bytes_data)
    elif os.path.isfile(sample_path):
        image = Image.open(sample_path)
    else:
        # Previously `image` stayed unbound here and the script died with a
        # NameError on `image.resize`; report the missing file instead.
        st.error(f"Image {sample_name} was not found in {sample_dir}")
        st.stop()

    try:
        # Fixed input size; preprocess_image_inception reshapes to (1, 299, 299, 3).
        width, height = 299, 299
        resized = image.resize(size=(width, height))

        if bytes_data is None:
            st.markdown(f"ROCO_{str(image_id).zfill(5)}.jpg")
        show = st.image(resized)
        show.image(resized, '\n\nSelected Image')

        # For newline
        st.sidebar.write('\n')

        with st.spinner('Generating image caption ...'):
            st.header('Predicted caption:\n\n')

            preprocessed_img = preprocess_image_inception(resized)
            features = extract_features(inception, preprocessed_img)
            caption = generate_caption(lstm, features, max_len, word2Index, index2Word)
            st.subheader(caption)

        st.sidebar.header("Model predicts: ")
        st.sidebar.write(f"{caption}")
    finally:
        # Release the image even when captioning raises.
        image.close()
|
model.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os, shutil
|
| 3 |
+
import random
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
from PIL import Image
|
| 10 |
+
import tensorflow as tf
|
| 11 |
+
from tensorflow.keras.applications.inception_v3 import preprocess_input
|
| 12 |
+
from tensorflow.keras.preprocessing import image
|
| 13 |
+
from tensorflow.keras.applications.inception_v3 import InceptionV3
|
| 14 |
+
from tensorflow.keras.models import Model
|
| 15 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# All model artefacts are resolved relative to the process working directory.
root = Path(os.getcwd())
_inception_dir = root / 'Inception'

# Directories holding the vocabulary / parameter .npy files of each LSTM.
aux_pre = _inception_dir / 'PretrainedInceptionLSTM'
aux_re = _inception_dir / 'RetrainedInceptionLSTM'

# SavedModel directories.
model_re_path = _inception_dir / 'RetrainedInceptionLSTM' / 'Model'
model_inception_path = _inception_dir / 'RetrainedInceptionFeatureExtraction' / 'Model'
model_pre_path = _inception_dir / 'PretrainedInceptionLSTM' / 'Model'
|
| 26 |
+
|
| 27 |
+
# Must create
|
| 28 |
+
|
| 29 |
+
def get_pretrained_inceptionV3():
    """Build an ImageNet-weighted InceptionV3 truncated at its second-to-last layer.

    Returns:
        A Keras ``Model`` that maps the InceptionV3 input to the output of
        ``layers[-2]``, i.e. the network without its final layer.
    """
    base = InceptionV3(weights='imagenet')
    penultimate_output = base.layers[-2].output
    return Model(base.input, penultimate_output)
|
| 33 |
+
|
| 34 |
+
def fetch_auxiliary_files(type):
    """Load the vocabulary mappings and parameters saved next to an LSTM model.

    Args:
        type: Model name, either ``'Pretrained Inception'`` or
            ``'Retrained Inception'``. (The name shadows the builtin but is
            kept because it is part of the existing signature.)

    Returns:
        A ``(word2Index, index2Word, variable_params)`` tuple, each element
        being the object stored in the corresponding ``.npy`` file.

    Raises:
        ValueError: If ``type`` is not one of the supported model names.
            The function used to fall through and return ``None``, which
            surfaced later as an unpacking error in the caller.
    """
    if type == 'Pretrained Inception':
        aux_dir = aux_pre
    elif type == 'Retrained Inception':
        aux_dir = aux_re
    else:
        raise ValueError(f"Unsupported model type: {type!r}")

    def _load_object(file_name):
        # The files hold pickled Python objects wrapped in a 0-d array, hence
        # allow_pickle=True and .item().
        # NOTE(security): unpickling executes arbitrary code; only load files
        # shipped with this repository.
        return np.load(aux_dir / file_name, allow_pickle=True).item()

    word2Index = _load_object("word2Index.npy")
    index2Word = _load_object("index2Word.npy")
    variable_params = _load_object("variable_params.npy")
    return word2Index, index2Word, variable_params
|
| 45 |
+
|
| 46 |
+
@st.cache(allow_output_mutation=True, show_spinner=False)
def fetch_model(type):
    """Load the feature extractor and caption LSTM for the requested model.

    The result is cached by Streamlit, so each model type is loaded once.

    Args:
        type: Model name, either ``'Pretrained Inception'`` or
            ``'Retrained Inception'``. (Shadows the builtin; kept because it
            is part of the existing signature.)

    Returns:
        A ``(feature_extractor, lstm)`` tuple of Keras models.

    Raises:
        ValueError: If ``type`` is not a supported model name. The function
            used to return ``None`` implicitly, which made the caller fail
            while unpacking the result.
    """
    with st.spinner(text="Fetching Model"):
        if type == 'Pretrained Inception':
            model_pre = tf.keras.models.load_model(model_pre_path)
            # Stock ImageNet InceptionV3 without its final layer.
            model_inc = get_pretrained_inceptionV3()
            return model_inc, model_pre
        if type == 'Retrained Inception':
            model_re = tf.keras.models.load_model(model_re_path)
            # Retrained extractor is stored as its own SavedModel.
            model_inc = tf.keras.models.load_model(model_inception_path)
            return model_inc, model_re
        raise ValueError(f"Unsupported model type: {type!r}")
|
| 57 |
+
|
| 58 |
+
def preprocess_image_inception(image):
    """Turn a PIL image into a single-item batch for InceptionV3.

    The image is converted to RGB when needed, passed through the
    InceptionV3 ``preprocess_input`` function and reshaped to
    ``(1, 299, 299, 3)``, so the caller must supply a 299x299 image.
    """
    rgb_image = image if image.mode == "RGB" else image.convert(mode="RGB")

    batch = np.expand_dims(np.array(rgb_image), axis=0)
    batch = preprocess_input(batch)

    return batch.reshape(1, 299, 299, 3)
|
| 68 |
+
|
| 69 |
+
def extract_features(model, image):
    """Return ``model.predict(image)`` with progress output silenced."""
    return model.predict(image, verbose=0)
|
| 72 |
+
|
| 73 |
+
def generate_caption(model, features, max_len, word2Index, index2Word, beam_index = 3):
    """Generate a caption for ``features`` by delegating to ``beam_search``.

    ``beam_index`` is the beam width and defaults to 3; all other arguments
    are forwarded unchanged.
    """
    return beam_search(model, features, max_len, word2Index, index2Word, beam_index)
|
| 76 |
+
|
| 77 |
+
def beam_search(model, features, max_len, word2Index, index2Word, beam_index):
    """Decode a caption with beam search and return it as a space-joined string.

    Args:
        model: Object whose ``predict([features, sequences], verbose=0)``
            returns one score array per input sequence, indexed by token id.
        features: Image features; tiled ``beam_index`` times along axis 0 so
            that one row is available per live beam (presumably shape
            ``(1, n_features)`` -- confirm against the feature extractor).
        max_len: Length every sequence is padded to; a beam that reaches
            this many tokens is treated as finished.
        word2Index: Mapping from word to token id; must contain ``"startseq"``.
        index2Word: Mapping from token id to word; ``'endseq'`` ends a beam.
        beam_index: Beam width, i.e. how many candidates are kept alive.

    Returns:
        The words of the most probable finished sequence, without the
        leading ``startseq`` token and without a trailing ``endseq``.

    Raises:
        ValueError: If ``beam_index`` is smaller than 1.
    """
    if beam_index < 1:
        raise ValueError("beam_index must be at least 1")

    # Every beam is a [token_ids, probability] pair.
    beams = [[[word2Index["startseq"]], 1]]
    final_preds = []
    live_seqs = beam_index
    features = np.tile(features, (beam_index, 1))

    while beams:
        # Pad every live sequence so they can be scored in a single batch.
        padded_seqs = np.array(
            [pad_sequences([tokens], maxlen=max_len, padding='post') for tokens, _ in beams]
        ).reshape(len(beams), max_len)

        preds = model.predict([features[:len(beams)], padded_seqs], verbose=0)

        # Extend each beam with its `live_seqs` highest-scoring next tokens.
        candidates = []
        for (tokens, prob), pred in zip(beams, preds):
            for w in np.argsort(pred)[-live_seqs:]:
                candidates.append([tokens + [w], prob * pred[w]])

        # Keep only the overall best `live_seqs` candidates.
        candidates.sort(key=lambda cand: cand[1])
        candidates = candidates[-live_seqs:]

        # Retire exactly the candidates that finished. The previous code
        # dropped the *last* beam whenever any beam finished, so finished
        # sequences stayed alive and the best unfinished one could be lost;
        # it also recorded a beam twice when it hit `endseq` at `max_len`.
        beams = []
        for tokens, prob in candidates:
            if index2Word[tokens[-1]] == 'endseq':
                final_preds.append([tokens[:-1], prob])
                live_seqs -= 1
            elif len(tokens) >= max_len:
                final_preds.append([tokens, prob])
                live_seqs -= 1
            else:
                beams.append([tokens, prob])

    # max() returns the first maximal entry and still works when every
    # probability has underflowed to 0 (the old loop left `best_index` unset).
    best_tokens, _ = max(final_preds, key=lambda finished: finished[1])

    # Skip the leading `startseq` token when rendering the text.
    return ' '.join(index2Word[i] for i in best_tokens[1:])
|
| 137 |
+
|
| 138 |
+
# # create target model directory
|
| 139 |
+
# model_dir = './models/'
|
| 140 |
+
# os.makedirs(model_dir, exist_ok=True)
|
| 141 |
+
#
|
| 142 |
+
# files_to_download = [
|
| 143 |
+
# "config.json",
|
| 144 |
+
# "flax_model.msgpack",
|
| 145 |
+
# "merges.txt",
|
| 146 |
+
# "special_tokens_map.json",
|
| 147 |
+
# "tokenizer.json",
|
| 148 |
+
# "tokenizer_config.json",
|
| 149 |
+
# "vocab.json",
|
| 150 |
+
# "preprocessor_config.json",
|
| 151 |
+
# ]
|
| 152 |
+
|
| 153 |
+
def _compile():
    """Open one sample image and close it again.

    The inference call that used to sit between the two steps is disabled,
    so this currently only exercises opening the file.
    """
    warmup_image = Image.open('samples/ROCO_00929.jpg')
    # predict(warmup_image) is intentionally disabled.
    warmup_image.close()
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# Runs at import time.
_compile()


sample_dir = './samples/'

# "None" is the first option, followed by the numeric id of every file in
# the sample directory whose name starts with 'ROCO_'.
sample_image_ids = ("None",) + tuple(
    int(name.replace('ROCO_', '').replace('.jpg', ''))
    for name in os.listdir(sample_dir)
    if name.startswith('ROCO_')
)

# Pool of image ids that get_random_image_id() draws from.
with open(os.path.join(sample_dir, "Roco-img-ids.json"), "r", encoding="UTF-8") as fp:
    roco_image_ids = json.load(fp)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def get_random_image_id():
    """Return one entry drawn at random from ``roco_image_ids``."""
    (chosen_id,) = random.sample(roco_image_ids, k=1)
    return chosen_id
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tensorflow
|
| 2 |
+
pathlib
|
| 3 |
+
streamlit==0.84.1
|
| 4 |
+
Pillow
|
| 5 |
+
jax[cpu]
|
| 6 |
+
flax
|
| 7 |
+
transformers
|
| 8 |
+
huggingface_hub
|
| 9 |
+
googletrans==4.0.0-rc1
|
| 10 |
+
protobuf==3.20
|
samples/ROCO_00001.jpg
ADDED
|
samples/ROCO_00006.jpg
ADDED
|
samples/ROCO_00016.jpg
ADDED
|
samples/ROCO_00025.jpg
ADDED
|
samples/ROCO_00031.jpg
ADDED
|
samples/ROCO_00036.jpg
ADDED
|
samples/ROCO_00061.jpg
ADDED
|
samples/ROCO_00084.jpg
ADDED
|
samples/ROCO_00138.jpg
ADDED
|
samples/ROCO_00153.jpg
ADDED
|
samples/ROCO_00176.jpg
ADDED
|
samples/ROCO_00185.jpg
ADDED
|
samples/ROCO_00190.jpg
ADDED
|
samples/ROCO_00206.jpg
ADDED
|
samples/ROCO_00218.jpg
ADDED
|
samples/ROCO_00251.jpg
ADDED
|
samples/ROCO_00258.jpg
ADDED
|
samples/ROCO_00261.jpg
ADDED
|
samples/ROCO_00264.jpg
ADDED
|
samples/ROCO_00271.jpg
ADDED
|
samples/ROCO_00300.jpg
ADDED
|
samples/ROCO_00302.jpg
ADDED
|
samples/ROCO_00303.jpg
ADDED
|
samples/ROCO_00307.jpg
ADDED
|
samples/ROCO_00316.jpg
ADDED
|
samples/ROCO_00319.jpg
ADDED
|
samples/ROCO_00328.jpg
ADDED
|