Spaces:
Build error
Build error
Add most relevant part
Browse files- app.py +2 -1
- most_relevant_part.py +80 -0
- text2image.py +1 -1
app.py
CHANGED
|
@@ -2,8 +2,9 @@ import streamlit as st
|
|
| 2 |
|
| 3 |
import image2text
|
| 4 |
import text2image
|
|
|
|
| 5 |
|
| 6 |
-
PAGES = {"Text to Image": text2image, "Image to Text": image2text}
|
| 7 |
|
| 8 |
st.sidebar.title("Navigation")
|
| 9 |
model = st.sidebar.selectbox("Choose a model", ["koclip-base", "koclip-large"])
|
|
|
|
| 2 |
|
| 3 |
import image2text
|
| 4 |
import text2image
|
| 5 |
+
import most_relevant_part
|
| 6 |
|
| 7 |
+
PAGES = {"Text to Image": text2image, "Image to Text": image2text, "Most Relevant Part of Image": most_relevant_part}
|
| 8 |
|
| 9 |
st.sidebar.title("Navigation")
|
| 10 |
model = st.sidebar.selectbox("Choose a model", ["koclip-base", "koclip-large"])
|
most_relevant_part.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from PIL import Image
|
| 5 |
+
|
| 6 |
+
import jax
|
| 7 |
+
import jax.numpy as jnp
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from utils import load_model
|
| 11 |
+
|
| 12 |
+
def split_image(im, grid=3):
    """Split an image into a ``grid`` x ``grid`` list of tiles.

    Tiles are returned in row-major order (left to right, then top to
    bottom). Every pixel belongs to exactly one tile: when a dimension is
    not divisible by ``grid`` the leftover pixels are spread over the later
    tiles, so tile sizes along an axis differ by at most one pixel. For
    evenly divisible images the result matches a plain equal-sized split.

    Args:
        im: Anything ``np.array`` can convert to an array with at least two
            dimensions (height, width, ...), e.g. a PIL image.
        grid: Number of tiles along each axis. Defaults to 3.

    Returns:
        A list of ``grid * grid`` numpy arrays, each a slice of the
        converted image.

    Raises:
        ValueError: If ``grid`` is smaller than 1 or the image has fewer
            than ``grid`` pixels along its height or width.
    """
    pixels = np.array(im)
    height, width = pixels.shape[:2]
    if grid < 1 or height < grid or width < grid:
        raise ValueError(
            f"cannot split a {height}x{width} image into a {grid}x{grid} grid"
        )

    # Explicit edges instead of range(0, dim, dim // grid): stepping by the
    # floored tile size produces an extra sliver tile whenever the dimension
    # is not an exact multiple of ``grid``.
    row_edges = [i * height // grid for i in range(grid + 1)]
    col_edges = [j * width // grid for j in range(grid + 1)]

    return [
        pixels[top:bottom, left:right]
        for top, bottom in zip(row_edges, row_edges[1:])
        for left, right in zip(col_edges, col_edges[1:])
    ]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# def split_image(X):
|
| 25 |
+
# num_rows = X.shape[0] // 224
|
| 26 |
+
# num_cols = X.shape[1] // 224
|
| 27 |
+
# Xc = X[0:num_rows * 224, 0:num_cols * 224, :]
|
| 28 |
+
# patches = []
|
| 29 |
+
# for j in range(num_rows):
|
| 30 |
+
# for i in range(num_cols):
|
| 31 |
+
# patches.append(Xc[j * 224:(j + 1) * 224, i * 224:(i + 1) * 224, :])
|
| 32 |
+
# return patches
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _load_query_image(url, uploaded_file):
    """Open the query image, preferring an uploaded file over the URL."""
    if uploaded_file is not None:
        return Image.open(uploaded_file)
    # A timeout keeps the page from hanging on an unresponsive host, and
    # raise_for_status stops an HTTP error page from being fed to PIL.
    response = requests.get(url, stream=True, timeout=10)
    response.raise_for_status()
    return Image.open(response.raw)


def app(model_name):
    """Render the "Most Relevant Part of Image" Streamlit page.

    The user supplies an image (by URL or upload) and a text query. The
    image is cut into tiles with ``split_image``, every tile is scored
    against the query by the model, and the tiles are shown from highest
    to lowest score.

    Args:
        model_name: Model identifier appended to ``"koclip/"`` and passed to
            ``load_model``, which is expected to return a
            ``(model, processor)`` pair.
    """
    model, processor = load_model(f"koclip/{model_name}")

    st.title("Most Relevant Part of Image")
    st.markdown("""
    Given a piece of text, the CLIP model finds the part of an image that best explains the text.
    To try it out, you can
    1) Upload an image
    2) Explain a part of the image in text
    Which will yield the most relevant image tile from a 3x3 grid of the image
    """)

    query1 = st.text_input(
        "Enter a URL to an image...",
        value="https://img.sbs.co.kr/newimg/news/20200823/201463830_1280.jpg")
    query2 = st.file_uploader("or upload an image...",
                              type=["jpg", "jpeg", "png"])
    captions = st.text_input(
        "Enter query to find most relevant part of image ",
        value="이건 서울의 경복궁 사진이다.",
    )

    if not st.button("질문 (Query)"):
        return

    if not any([query1, query2]):
        st.error("Please upload an image or paste an image URL.")
        return

    try:
        image = _load_query_image(query1, query2)
    except (requests.RequestException, OSError) as err:
        # Covers network/HTTP failures and files PIL cannot decode; show a
        # message on the page instead of an unhandled traceback.
        st.error(f"Could not load the image: {err}")
        return
    st.image(image)

    # Tile an RGB copy so RGBA, palette or grayscale inputs reach the
    # processor as three-channel arrays; the displayed image is untouched.
    images = split_image(image.convert("RGB"))

    inputs = processor(text=captions,
                       images=images,
                       return_tensors="jax",
                       padding=True)
    # Move the channel axis last (NCHW -> NHWC) before calling the model.
    inputs["pixel_values"] = jnp.transpose(inputs["pixel_values"],
                                           axes=[0, 2, 3, 1])
    outputs = model(**inputs)

    # Softmax over axis 0 normalises across tiles, so the scores of all
    # tiles for the query sum to one.
    # NOTE(review): assumes logits_per_image has shape (num_tiles, 1) because
    # a single text query is passed - confirm against the model output.
    probs = jax.nn.softmax(outputs.logits_per_image, axis=0)
    ranked = sorted(enumerate(probs),
                    key=lambda pair: float(pair[1][0]),
                    reverse=True)
    for idx, prob in ranked:
        st.text(f"Score: {prob[0]:.3f}")
        st.image(images[idx])
|
text2image.py
CHANGED
|
@@ -40,7 +40,7 @@ def app(model_name):
|
|
| 40 |
result_imgs, result_captions = [], []
|
| 41 |
for file, dist in zip(result_files, dists):
|
| 42 |
result_imgs.append(plt.imread(os.path.join(images_directory, file)))
|
| 43 |
-
result_captions.append("
|
| 44 |
|
| 45 |
st.image(result_imgs[:3], caption=result_captions[:3], width=200)
|
| 46 |
st.image(result_imgs[3:6], caption=result_captions[3:6], width=200)
|
|
|
|
| 40 |
result_imgs, result_captions = [], []
|
| 41 |
for file, dist in zip(result_files, dists):
|
| 42 |
result_imgs.append(plt.imread(os.path.join(images_directory, file)))
|
| 43 |
+
result_captions.append("Score: {:.3f}".format(1.0 - dist))
|
| 44 |
|
| 45 |
st.image(result_imgs[:3], caption=result_captions[:3], width=200)
|
| 46 |
st.image(result_imgs[3:6], caption=result_captions[3:6], width=200)
|