Spaces:

danielvarga
/

se

Sleeping

App Files Files Community

Daniel Varga committed on May 5, 2023

Commit

51b0e53

1 Parent(s): 9cdc9a1

switching to annoy

Browse files

Files changed (2) hide show

app.py +17 -15
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import gradio as gr
 import numpy as np
 import torch
 import clip
 CONFIG_PATH = "app.ini"
@@ -49,12 +50,18 @@ data = pickle.load(open(pickle_filename, "rb"))
 # but we use float32 in-memory to avoid numerical issues.
 # tbh i'm not sure there are any such issues.
 embeddings = data["embeddings"].astype(np.float32)
-image_features = torch.Tensor(embeddings)
-image_features /= image_features.norm(dim=-1, keepdim=True)
 n, d = embeddings.shape
 filenames = data["filenames"]
 urls = [base_url + filename for filename in filenames]
@@ -67,29 +74,24 @@ def embed_text(text):
     with torch.no_grad():
         text_features = model.encode_text(tokens)
     assert text_features.shape == (1, d)
     return text_features
-def similarities(text_features, topk=20):
-    text_features /= text_features.norm(dim=-1, keepdim=True)
-    # the softmax rounds up everything to 1, so does not distinguish between good fits.
-    similarity = (100.0 * image_features @ text_features.T) # .softmax(dim=-1)
-    values, indices = similarity[:, 0].topk(topk)
-    return values, indices
 def image_retrieval_from_text(text):
-    values, indices = similarities(embed_text(text), topk=20)
     top_urls = np.array(urls)[indices]
-    return top_urls.tolist(), indices.numpy().tolist()
 def image_retrieval_from_image(state, selected_locally):
     selected = state[int(selected_locally)]
     image_vector = image_features[selected][None, :]
-    values, indices = similarities(image_vector, topk=20)
     top_urls = np.array(urls)[indices]
-    return top_urls.tolist(), indices.numpy().tolist()
 with gr.Blocks(css="footer {visibility: hidden}") as demo:

 import numpy as np
 import torch
 import clip
+import annoy
 CONFIG_PATH = "app.ini"
 # but we use float32 in-memory to avoid numerical issues.
 # tbh i'm not sure there are any such issues.
 embeddings = data["embeddings"].astype(np.float32)
+# Scale every row of `embeddings` to unit L2 norm, in place.
+embeddings /= np.linalg.norm(embeddings, axis=-1)[:, None]
 n, d = embeddings.shape
+# Build the Annoy index once at module load. Item id i is row i of
+# `embeddings`; the retrieval functions use the ids returned by the index
+# to index `urls`.
+print("annoy indexing")
+annoy_index = annoy.AnnoyIndex(d, 'angular')
+for i, vec in enumerate(embeddings):
+    annoy_index.add_item(i, vec)
+# The argument to build() is the number of trees in the index.
+annoy_index.build(10)
+print("done")
 filenames = data["filenames"]
 urls = [base_url + filename for filename in filenames]
     with torch.no_grad():
         text_features = model.encode_text(tokens)
     assert text_features.shape == (1, d)
+    text_features = text_features.numpy()[0]
+    text_features /= np.linalg.norm(text_features)
     return text_features
 def image_retrieval_from_text(text):
+    text_features = embed_text(text)
+    indices = annoy_index.get_nns_by_vector(text_features, n=20)
     top_urls = np.array(urls)[indices]
+    return top_urls.tolist(), indices
 def image_retrieval_from_image(state, selected_locally):
     selected = state[int(selected_locally)]
     image_vector = image_features[selected][None, :]
+    indices = annoy_index.get_nns_by_item(selected, n=20)
     top_urls = np.array(urls)[indices]
+    return top_urls.tolist(), indices
 with gr.Blocks(css="footer {visibility: hidden}") as demo:

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	git+https://github.com/openai/CLIP.git


1	git+https://github.com/openai/CLIP.git
2	+ annoy==1.17.2