fkunn1326 and DrishtiSharma committed
Commit a67c2b0 · 0 Parent(s)

Duplicate from DrishtiSharma/Image-search-using-CLIP

Co-authored-by: Drishti Sharma <DrishtiSharma@users.noreply.huggingface.co>

Files changed (7):
  1. .gitattributes +28 -0
  2. README.md +38 -0
  3. app.py +110 -0
  4. features.npy +3 -0
  5. photo_ids.csv +0 -0
  6. photos.tsv000 +0 -0
  7. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,28 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ features.npy filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,38 @@
+ ---
+ title: Image Search Using CLIP
+ emoji: 🏢
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ app_file: app.py
+ pinned: false
+ duplicated_from: DrishtiSharma/Image-search-using-CLIP
+ ---
+
+ # Configuration
+
+ `title`: _string_
+ Display title for the Space
+
+ `emoji`: _string_
+ Space emoji (emoji-only character allowed)
+
+ `colorFrom`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `colorTo`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `sdk`: _string_
+ Can be either `gradio` or `streamlit`
+
+ `sdk_version`: _string_
+ Only applicable for `streamlit` SDK.
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+ `app_file`: _string_
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+ Path is relative to the root of the repository.
+
+ `pinned`: _boolean_
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,110 @@
+ # Acknowledgments:
+ # This project is inspired by:
+ # 1. https://github.com/haltakov/natural-language-image-search by Vladimir Haltakov
+ # 2. OpenAI's CLIP
+
+
+
+ # Importing all the necessary libraries
+ import torch
+ import requests
+ import numpy as np
+ import pandas as pd
+ import gradio as gr
+ from io import BytesIO
+ from PIL import Image as PILIMAGE
+ from IPython.display import Image
+ from IPython.core.display import HTML
+ from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
+ from sentence_transformers import SentenceTransformer, util
+
+
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Define model
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ # Load data
+ photos = pd.read_csv("./photos.tsv000", sep='\t', header=0)
+ photo_features = np.load("./features.npy")
+ photo_ids = pd.read_csv("./photo_ids.csv")
+ photo_ids = list(photo_ids['photo_id'])
+
+
+
+ def encode_text(text):
+     with torch.no_grad():
+         # Encode and normalize the description using CLIP
+         inputs = processor(text=[text], images=None, return_tensors="pt", padding=True).to(device)
+         text_encoded = model.get_text_features(**inputs)
+         text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
+     return text_encoded.cpu().numpy()
+
+ def encode_image(image):
+     image = PILIMAGE.fromarray(image.astype('uint8'), 'RGB')
+     with torch.no_grad():
+         photo_preprocessed = processor(text=None, images=image, return_tensors="pt", padding=True)["pixel_values"]
+         search_photo_feature = model.get_image_features(photo_preprocessed.to(device))
+         search_photo_feature /= search_photo_feature.norm(dim=-1, keepdim=True)
+         image_encoded = search_photo_feature.cpu().numpy()
+     return image_encoded
+
+ T2I = "Text2Image"
+ I2I = "Image2Image"
+
+ def similarity(feature, photo_features):
+     # Dot product of normalized features = cosine similarity
+     similarities = list((feature @ photo_features.T).squeeze(0))
+     return similarities
+
+ def find_best_matches(image, mode, text):
+     # Compute the similarity between the description and each photo using cosine similarity
+     print("Mode now", mode)
+
+     if mode == "Text2Image":
+         # Encode the text input
+         text_features = encode_text(text)
+         similarities = similarity(text_features, photo_features)
+
+
+     else:
+         # Encode the image input
+         image_features = encode_image(image)
+         similarities = similarity(image_features, photo_features)
+
+     # Sort the photos by their similarity score
+     best_photos = sorted(zip(similarities, range(photo_features.shape[0])), key=lambda x: x[0], reverse=True)
+
+     matched_images = []
+     for i in range(3):
+         # Retrieve the photo ID
+         idx = best_photos[i][1]
+         photo_id = photo_ids[idx]
+
+         # Get all metadata for this photo
+         photo_data = photos[photos["photo_id"] == photo_id].iloc[0]
+
+         # Download the image at 640px width and collect it for the output carousel
+         # display(Image(url=photo_data["photo_image_url"] + "?w=640"))
+         response = requests.get(photo_data["photo_image_url"] + "?w=640")
+         img = PILIMAGE.open(BytesIO(response.content))
+         matched_images.append(img)
+     return matched_images
+
+
+
+
+ gr.Interface(fn=find_best_matches,
+              inputs=[
+                  gr.inputs.Image(label="Image to search", optional=True),
+                  gr.inputs.Radio([T2I, I2I]),
+                  gr.inputs.Textbox(lines=1, label="Text query", placeholder="Enter the search text...",
+                  )],
+              theme="grass",
+              outputs=gr.outputs.Carousel([gr.outputs.Image(type="pil")]), enable_queue=True, title="CLIP Image Search",
+              description="This application displays the TOP THREE images from the Unsplash dataset that best match the search query provided by the user. The query can be given in two modes, i.e. as text or as an image.").launch()
+
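
A note on the ranking step above: the query feature (text or image) is L2-normalized before use, and the precomputed photo features in features.npy are expected to be unit-length as well, so the plain dot product in `similarity` is effectively the cosine similarity. A minimal numeric illustration of that idea, using synthetic 2-D vectors rather than the real 512-D CLIP features:

```python
import numpy as np

# Two synthetic, L2-normalized "photo" features and one query feature.
photo_feats = np.array([[1.0, 0.0],
                        [0.6, 0.8]])
query = np.array([[0.8, 0.6]])
query /= np.linalg.norm(query, axis=-1, keepdims=True)

# Dot product of unit vectors == cosine similarity; sort best-first as in find_best_matches.
sims = (query @ photo_feats.T).squeeze(0)
ranking = np.argsort(-sims)
print(sims, ranking)  # [0.8 0.96] -> ranking [1 0]
```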
features.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31ac381e52fa007821a642b5808ac9a6eaf7163322ab340d36bcc3c2a94a38c8
+ size 25596032
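
For context, features.npy holds the precomputed CLIP image embeddings for the Unsplash photos (one row per entry in photo_ids.csv) and is shipped via Git LFS rather than computed at Space startup. The script that produced the committed file is not part of this commit; the following is a rough sketch of how such a file could be regenerated, assuming the photo URLs in photos.tsv000 are reachable and the same CLIP checkpoint as app.py is used (the `photo_id` and `photo_image_url` column names are taken from app.py's usage):

```python
# Hypothetical offline script to rebuild features.npy and photo_ids.csv from photos.tsv000.
import numpy as np, pandas as pd, requests, torch
from io import BytesIO
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

photos = pd.read_csv("./photos.tsv000", sep="\t", header=0)
features, ids = [], []
for _, row in photos.iterrows():
    # Download a 640px-wide version of the photo, as app.py does at query time
    img = Image.open(BytesIO(requests.get(row["photo_image_url"] + "?w=640").content)).convert("RGB")
    with torch.no_grad():
        pixel_values = processor(images=img, return_tensors="pt")["pixel_values"].to(device)
        feat = model.get_image_features(pixel_values=pixel_values)
        feat /= feat.norm(dim=-1, keepdim=True)  # normalize so dot products are cosine similarities
    features.append(feat.cpu().numpy())
    ids.append(row["photo_id"])

np.save("features.npy", np.concatenate(features, axis=0))
pd.DataFrame({"photo_id": ids}).to_csv("photo_ids.csv", index=False)
```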
photo_ids.csv ADDED
The diff for this file is too large to render. See raw diff
 
photos.tsv000 ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ sentence-transformers==2.1.0
+ transformers
+ torch
+ numpy
+ ftfy
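
To run the Space locally, it should be enough to install these pinned dependencies plus gradio (which Hugging Face Spaces provide automatically, so it is not listed here; app.py also imports IPython, which is likewise not pinned), e.g. `pip install -r requirements.txt gradio`, fetch the LFS blob with `git lfs pull` so features.npy is the real array rather than a pointer file, and then start the app with `python app.py`.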