ismot vivien committed on
Commit
e47827c
·
0 Parent(s):

Duplicate from vivien/clip

Browse files

Co-authored-by: Vivien Tran-Thien <vivien@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.npy filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .vscode/
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Clip Demo
3
+ emoji: 👁
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.2.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: vivien/clip
11
+ ---
12
+
13
+ # Configuration
14
+
15
+ `title`: _string_
16
+ Display title for the Space
17
+
18
+ `emoji`: _string_
19
+ Space emoji (emoji-only character allowed)
20
+
21
+ `colorFrom`: _string_
22
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
23
+
24
+ `colorTo`: _string_
25
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
26
+
27
+ `sdk`: _string_
28
+ Can be either `gradio` or `streamlit`
29
+
30
+ `sdk_version` : _string_
31
+ Only applicable for `streamlit` SDK.
32
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
33
+
34
+ `app_file`: _string_
35
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
36
+ Path is relative to the root of the repository.
37
+
38
+ `pinned`: _boolean_
39
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from html import escape
2
+ import re
3
+ import streamlit as st
4
+ import pandas as pd, numpy as np
5
+ from transformers import CLIPProcessor, CLIPModel
6
+ from st_clickable_images import clickable_images
7
+
8
# CLIP vision-transformer variants this app can serve. Only the
# highest-quality checkpoint is enabled; the commented entries can be
# re-enabled to offer faster, cheaper alternatives.
MODEL_NAMES = [
    # "base-patch32",
    # "base-patch16",
    # "large-patch14",
    "large-patch14-336",
]
+
15
+
16
@st.cache(allow_output_mutation=True)
def load():
    """Load all heavy resources once (cached by Streamlit).

    Reads the two corpus metadata CSVs, downloads each configured CLIP
    model/processor pair, and loads the precomputed image embeddings,
    L2-normalising every embedding row so that dot products later equal
    cosine similarities.

    Returns:
        (models, processors, df, embeddings) where df maps corpus id
        (0 = Unsplash, 1 = Movies) to a DataFrame, and
        embeddings[name][k] is a unit-normalised (n_images, dim) array.
    """
    df = {0: pd.read_csv("data.csv"), 1: pd.read_csv("data2.csv")}
    models, processors, embeddings = {}, {}, {}
    for name in MODEL_NAMES:
        models[name] = CLIPModel.from_pretrained(f"openai/clip-vit-{name}")
        processors[name] = CLIPProcessor.from_pretrained(f"openai/clip-vit-{name}")
        raw = {
            0: np.load(f"embeddings-vit-{name}.npy"),
            1: np.load(f"embeddings2-vit-{name}.npy"),
        }
        # Normalise each row to unit length so similarity == dot product.
        embeddings[name] = {
            k: v / np.linalg.norm(v, axis=1, keepdims=True) for k, v in raw.items()
        }
    return models, processors, df, embeddings
34
+
35
+
36
+ models, processors, df, embeddings = load()
37
+ source = {0: "\nSource: Unsplash", 1: "\nSource: The Movie Database (TMDB)"}
38
+
39
+
40
def compute_text_embeddings(list_of_strings, name):
    """Embed text queries with the CLIP variant `name`.

    Returns an (n_queries, dim) array whose rows are L2-normalised, so a
    dot product against the (also normalised) image embeddings yields a
    cosine similarity.
    """
    tokens = processors[name](text=list_of_strings, return_tensors="pt", padding=True)
    features = models[name].get_text_features(**tokens).detach().numpy()
    norms = np.linalg.norm(features, axis=1, keepdims=True)
    return features / norms
44
+
45
+
46
def image_search(query, corpus, name, n_results=24):
    """Rank images in `corpus` ("Unsplash" or "Movies") against `query`.

    The query grammar (see the sidebar "Advanced use" help):
      - several positive sub-queries separated by ";"
      - a sub-query of the form "[Movies:123]text" (or "[Unsplash:123]text")
        uses image 123's embedding, optionally combined with extra text
      - everything after "EXCLUDING " is treated as negative sub-queries

    Returns a list of up to `n_results` tuples (path, tooltip, index),
    best match first.
    """

    def _stack(acc, rows):
        # Accumulate embedding rows; None means "nothing collected yet".
        return rows if acc is None else np.concatenate((acc, rows), axis=0)

    corpus_id = 0 if corpus == "Unsplash" else 1
    parts = query.split("EXCLUDING ")
    scores = 0

    if len(parts[0]) > 0:
        positives = None
        for sub_query in parts[0].split(";"):
            match = re.match(r"\[(Movies|Unsplash):(\d{1,5})\](.*)", sub_query)
            if match:
                ref_corpus, ref_idx, text = match.groups()
                ref_idx, text = int(ref_idx), text.strip()
                ref_k = 0 if ref_corpus == "Unsplash" else 1
                # Image-based sub-query: reuse the referenced image's embedding.
                positives = _stack(
                    positives, embeddings[name][ref_k][ref_idx : ref_idx + 1, :]
                )
                if len(text) > 0:
                    positives = _stack(
                        positives, compute_text_embeddings([text], name)
                    )
            else:
                positives = _stack(
                    positives, compute_text_embeddings([sub_query], name)
                )
        # Centre (median) and scale (max) each positive column, then keep
        # the weakest positive score per image: an image must match ALL
        # positive sub-queries to rank highly.
        sims = embeddings[name][corpus_id] @ positives.T
        sims = sims - np.median(sims, axis=0)
        sims = sims / np.max(sims, axis=0, keepdims=True)
        scores = np.min(sims, axis=1)

    if len(parts) > 1:
        negative_texts = (" ".join(parts[1:])).split(";")
        neg = compute_text_embeddings(negative_texts, name)
        neg_sims = embeddings[name][corpus_id] @ neg.T
        neg_sims = neg_sims - np.median(neg_sims, axis=0)
        neg_sims = neg_sims / np.max(neg_sims, axis=0, keepdims=True)
        # Penalise by the strongest (clamped-positive) negative match.
        scores -= np.max(np.maximum(neg_sims, 0), axis=1)

    best = np.argsort(scores)[-1 : -n_results - 1 : -1]
    return [
        (
            df[corpus_id].iloc[i]["path"],
            df[corpus_id].iloc[i]["tooltip"] + source[corpus_id],
            i,
        )
        for i in best
    ]
99
+
100
+
101
# Markdown shown at the top of the sidebar.
description = """
# Semantic image search

**Enter your query and hit enter**

*Built with OpenAI's [CLIP](https://openai.com/blog/clip/) model, 🤗 Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), 25k images from [Unsplash](https://unsplash.com/) and 8k images from [The Movie Database (TMDB)](https://www.themoviedb.org/)*

*Inspired by [Unsplash Image Search](https://github.com/haltakov/natural-language-image-search) from Vladimir Haltakov and [Alph, The Sacred River](https://github.com/thoppe/alph-the-sacred-river) from Travis Hoppe*
"""

# Markdown shown inside the sidebar's "Advanced use" expander.
howto = """
- Click on an image to use it as a query and find similar images
- Several queries, including one based on an image, can be combined (use "**;**" as a separator)
- If the input includes "**EXCLUDING**", the part right of it will be used as a negative query
"""

# Inline CSS applied to the clickable-images result grid container.
div_style = {
    "display": "flex",
    "justify-content": "center",
    "flex-wrap": "wrap",
}
122
+
123
+
124
def main():
    """Render the semantic image search page.

    Injects custom CSS, shows the sidebar help, reads the text query and
    corpus choice, displays a grid of clickable result images, and — when
    an image is clicked — rewrites the query to reference that image and
    reruns the app.

    Cleanup vs. the previous revision: the `if False:` two-model
    comparison branches, the disabled `mode` selector and the unused
    `models_dict` local were unreachable dead code and have been removed;
    runtime behavior is unchanged.
    """
    st.markdown(
        """
        <style>
        .block-container{
            max-width: 1200px;
        }
        div.row-widget.stRadio > div{
            flex-direction:row;
            display: flex;
            justify-content: center;
        }
        div.row-widget.stRadio > div > label{
            margin-left: 5px;
            margin-right: 5px;
        }
        .row-widget {
            margin-top: -25px;
        }
        section>div:first-child {
            padding-top: 30px;
        }
        div.reportview-container > section:first-child{
            max-width: 320px;
        }
        #MainMenu {
            visibility: hidden;
        }
        footer {
            visibility: hidden;
        }
        </style>""",
        unsafe_allow_html=True,
    )
    st.sidebar.markdown(description)
    with st.sidebar.expander("Advanced use"):
        st.markdown(howto)

    # Centre the query box; restore the last query (set by a click) if any.
    _, center, _ = st.columns((1, 3, 1))
    if "query" in st.session_state:
        query = center.text_input("", value=st.session_state["query"])
    else:
        query = center.text_input("", value="clouds at sunset")
    corpus = st.radio("", ["Unsplash", "Movies"])

    # Single-model mode: always use the best (last-listed) CLIP variant.
    model_name = MODEL_NAMES[-1]

    if len(query) > 0:
        results = image_search(query, corpus, model_name)
        clicked = clickable_images(
            [result[0] for result in results],
            titles=[result[1] for result in results],
            div_style=div_style,
            img_style={"margin": "2px", "height": "200px"},
            key=query + corpus + model_name + "1",
        )

        if clicked >= 0:
            # Only rerun when this click differs from the last recorded one.
            # NOTE(review): "last_clicked" is never written to session state
            # anywhere in this file, so this guard currently always passes —
            # kept as-is to preserve behavior.
            change_query = (
                "last_clicked" not in st.session_state
                or clicked != st.session_state["last_clicked"]
            )
            if change_query:
                # Turn the clicked image into an image-based query.
                st.session_state["query"] = f"[{corpus}:{results[clicked][2]}]"
                st.experimental_rerun()
231
+
232
+
233
# Standard script entry point; Streamlit also imports this module directly.
if __name__ == "__main__":
    main()
data.csv ADDED
The diff for this file is too large to render. See raw diff
 
data2.csv ADDED
The diff for this file is too large to render. See raw diff
 
embeddings-vit-base-patch16.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:125430e11a4a415ec0c0fc5339f97544f0447e4b0a24c20f2e59f8852e706afc
3
+ size 51200128
embeddings-vit-base-patch32.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f7ebdff24079665faf58d07045056a63b5499753e3ffbda479691d53de3ab38
3
+ size 51200128
embeddings-vit-large-patch14-336.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f79f10ebe267b4ee7acd553dfe0ee31df846123630058a6d58c04bf22e0ad068
3
+ size 76800128
embeddings-vit-large-patch14.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64515f7d3d71137e2944f2c3d72c8df3e684b5d6a6ff7dcebb92370f7326ccfd
3
+ size 76800128
embeddings2-vit-base-patch16.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:153cf3fae2385d51fe8729d3a1c059f611ca47a3fc501049708114d1bbf79049
3
+ size 16732288
embeddings2-vit-base-patch32.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7d545bed86121dac1cedcc1de61ea5295f5840c1eb751637e6628ac54faef81
3
+ size 16732288
embeddings2-vit-large-patch14-336.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e66eb377465fbfaa56cec079aa3e214533ceac43646f2ca78028ae4d8ad6d03
3
+ size 25098368
embeddings2-vit-large-patch14.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d730b33e758c2648419a96ac86d39516c59795e613c35700d3a64079e5a9a27
3
+ size 25098368
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ ftfy
4
+ numpy
5
+ pandas
6
+ st-clickable-images