Spaces commit a67c2b0
Parent: Duplicate from DrishtiSharma/Image-search-using-CLIP
Co-authored-by: Drishti Sharma <DrishtiSharma@users.noreply.huggingface.co>

Files changed:
- .gitattributes +28 -0
- README.md +38 -0
- app.py +102 -0
- features.npy +3 -0
- photo_ids.csv +0 -0
- photos.tsv000 +0 -0
- requirements.txt +5 -0
.gitattributes
ADDED
@@ -0,0 +1,28 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+features.npy filter=lfs diff=lfs merge=lfs -text
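These patterns route large binaries (including this Space's features.npy) through Git LFS, so a checkout without LFS support contains small pointer files instead of the real content. A minimal sketch of how one might detect that situation before loading data, assuming the standard pointer layout shown for features.npy further below:

```python
# Minimal sketch: check whether a checked-out file is still a Git LFS
# pointer (content not yet fetched) rather than the real binary.
def is_lfs_pointer(path: str) -> bool:
    try:
        with open(path, "rb") as f:
            first_line = f.readline(200)
    except OSError:
        return False
    return first_line.startswith(b"version https://git-lfs.github.com/spec/")

# Example: warn before np.load() fails on a pointer file.
if is_lfs_pointer("features.npy"):
    print("features.npy is an LFS pointer; run `git lfs pull` first.")
```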
README.md
ADDED
@@ -0,0 +1,38 @@
+---
+title: Image Search Using CLIP
+emoji: 🏢
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+app_file: app.py
+pinned: false
+duplicated_from: DrishtiSharma/Image-search-using-CLIP
+---
+
+# Configuration
+
+`title`: _string_
+Display title for the Space
+
+`emoji`: _string_
+Space emoji (emoji-only character allowed)
+
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`sdk`: _string_
+Can be either `gradio` or `streamlit`
+
+`sdk_version`: _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+Path is relative to the root of the repository.
+
+`pinned`: _boolean_
+Whether the Space stays on top of your list.
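The configuration keys documented above live in the `---`-fenced YAML front matter at the top of README.md. A hedged sketch of reading them programmatically; PyYAML is an assumed extra dependency, not part of this Space's requirements:

```python
# Sketch: parse the Space configuration out of README.md's front matter.
import yaml  # assumed dependency (PyYAML)

def read_space_config(path="README.md"):
    text = open(path, encoding="utf-8").read()
    # The front matter sits between the first two "---" delimiters.
    _, front_matter, _ = text.split("---", 2)
    return yaml.safe_load(front_matter)

config = read_space_config()
print(config["sdk"], config["app_file"])  # gradio app.py
```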
app.py
ADDED
@@ -0,0 +1,102 @@
+# Acknowledgments:
+# This project is inspired by:
+# 1. https://github.com/haltakov/natural-language-image-search by Vladimir Haltakov
+# 2. OpenAI's CLIP
+
+
+# Import the necessary libraries
+import torch
+import requests
+import numpy as np
+import pandas as pd
+import gradio as gr
+from io import BytesIO
+from PIL import Image as PILIMAGE
+from transformers import CLIPProcessor, CLIPModel
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Define the model
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+# Load the photo metadata, the precomputed photo features, and the photo IDs
+photos = pd.read_csv("./photos.tsv000", sep='\t', header=0)
+photo_features = np.load("./features.npy")
+photo_ids = pd.read_csv("./photo_ids.csv")
+photo_ids = list(photo_ids['photo_id'])
+
+
+def encode_text(text):
+    with torch.no_grad():
+        # Encode the description using CLIP's text encoder
+        inputs = processor(text=[text], images=None, return_tensors="pt", padding=True)
+        text_encoded = model.get_text_features(**inputs.to(device)).detach().cpu().numpy()
+    return text_encoded
+
+
+def encode_image(image):
+    image = PILIMAGE.fromarray(image.astype('uint8'), 'RGB')
+    with torch.no_grad():
+        # Preprocess the image and encode it with CLIP's image encoder
+        photo_preprocessed = processor(text=None, images=image, return_tensors="pt", padding=True)["pixel_values"]
+        search_photo_feature = model.get_image_features(photo_preprocessed.to(device))
+        # Normalize so the dot product against the photo features is a cosine similarity
+        search_photo_feature /= search_photo_feature.norm(dim=-1, keepdim=True)
+        image_encoded = search_photo_feature.cpu().numpy()
+    return image_encoded
+
+
+T2I = "Text2Image"
+I2I = "Image2Image"
+
+
+def similarity(feature, photo_features):
+    # Dot product between the query feature and every precomputed photo feature;
+    # scaling the single query vector preserves the ranking, so the query need
+    # not be normalized
+    similarities = list((feature @ photo_features.T).squeeze(0))
+    return similarities
+
+
+def find_best_matches(image, mode, text):
+    # Compute the similarity between the description and each photo
+    print("Mode:", mode)
+
+    if mode == "Text2Image":
+        # Encode the text input
+        text_features = encode_text(text)
+        similarities = similarity(text_features, photo_features)
+    else:
+        # Encode the image input
+        image_features = encode_image(image)
+        similarities = similarity(image_features, photo_features)
+
+    # Sort the photos by their similarity score
+    best_photos = sorted(zip(similarities, range(photo_features.shape[0])), key=lambda x: x[0], reverse=True)
+
+    matched_images = []
+    for i in range(3):
+        # Retrieve the photo ID of the i-th best match
+        idx = best_photos[i][1]
+        photo_id = photo_ids[idx]
+
+        # Get all metadata for this photo
+        photo_data = photos[photos["photo_id"] == photo_id].iloc[0]
+
+        # Download the matched image from Unsplash
+        response = requests.get(photo_data["photo_image_url"] + "?w=640")
+        img = PILIMAGE.open(BytesIO(response.content))
+        matched_images.append(img)
+    return matched_images
+
+
+gr.Interface(fn=find_best_matches,
+             inputs=[
+                 gr.inputs.Image(label="Image to search", optional=True),
+                 gr.inputs.Radio([T2I, I2I]),
+                 gr.inputs.Textbox(lines=1, label="Text query", placeholder="Introduce the search text...")],
+             theme="grass",
+             outputs=gr.outputs.Carousel([gr.outputs.Image(type="pil")]), enable_queue=True, title="CLIP Image Search",
+             description="This application displays the TOP THREE images from the Unsplash dataset that best match the user's search query. The query can be provided in either of two modes: as text or as an image.").launch()
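app.py assumes features.npy holds one L2-normalized CLIP image feature per row, aligned row-for-row with photo_ids.csv. The script that produced those files is not part of this commit; below is a hypothetical reconstruction of how they could be precomputed with the same model, where `photo_collection` (a list of (photo_id, PIL.Image) pairs) is an assumed input:

```python
# Hedged sketch of precomputing features.npy and photo_ids.csv;
# not the actual script used for this Space.
import numpy as np
import pandas as pd
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def precompute_features(photo_collection):
    ids, feats = [], []
    with torch.no_grad():
        for photo_id, image in photo_collection:
            pixel_values = processor(images=image, return_tensors="pt")["pixel_values"]
            feature = model.get_image_features(pixel_values)
            # Normalize each row, matching what app.py's dot product assumes
            feature /= feature.norm(dim=-1, keepdim=True)
            ids.append(photo_id)
            feats.append(feature.squeeze(0).numpy())
    np.save("features.npy", np.stack(feats))
    pd.DataFrame({"photo_id": ids}).to_csv("photo_ids.csv", index=False)
```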
features.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31ac381e52fa007821a642b5808ac9a6eaf7163322ab340d36bcc3c2a94a38c8
+size 25596032
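As a rough consistency check (assuming float32 vectors of dimension 512, CLIP ViT-B/32's projection size, under a typical 128-byte .npy header), the pointer's size works out to about 12,498 photo vectors:

```python
# Sanity check on the LFS pointer size above; the 512-dim float32 layout
# and 128-byte .npy header are assumptions, not read from the file.
n_vectors = (25596032 - 128) // (512 * 4)
print(n_vectors)  # 12498
```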
photo_ids.csv
ADDED
The diff for this file is too large to render. See raw diff.
photos.tsv000
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+sentence-transformers==2.1.0
+transformers
+torch
+numpy
+ftfy