Navyabhat committed on
Commit
02419e1
·
1 Parent(s): e7fc315

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +111 -0
  2. features.npy +3 -0
  3. photo_ids.csv +0 -0
  4. photos.tsv000 +0 -0
  5. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Acknowledgments:
2
+ #This project is inspired by:
3
+ #1. https://github.com/haltakov/natural-language-image-search by Vladimir Haltakov
4
+ #2. OpenAI's CLIP
5
+
6
+
7
+
8
+ #Importing all the necessary libraries
9
+ import torch
10
+ import requests
11
+ import numpy as np
12
+ import pandas as pd
13
+ import gradio as gr
14
+ from io import BytesIO
15
+ from PIL import Image as PILIMAGE
16
+
17
+ from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
18
+ from sentence_transformers import SentenceTransformer, util
19
+
20
+
21
+
22
# Run on GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define model: CLIP ViT-B/32 plus its paired image processor and text tokenizer.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# Load data:
#  - photos: Unsplash metadata table (includes "photo_id" and "photo_image_url" columns, read below)
#  - photo_features: precomputed CLIP image embeddings, one row per photo
#  - photo_ids: row-aligned photo ids, converted to a plain list for positional indexing
# NOTE(review): photo_features rows are assumed to align 1:1 with photo_ids rows — verify
# against whatever script produced features.npy / photo_ids.csv.
photos = pd.read_csv("./photos.tsv000", sep='\t', header=0)
photo_features = np.load("./features.npy")
photo_ids = pd.read_csv("./photo_ids.csv")
photo_ids = list(photo_ids['photo_id'])
34
+
35
+
36
+
37
def encode_text(text):
    """Encode a text description into a CLIP text feature vector.

    Args:
        text: the search string entered by the user.

    Returns:
        A 2-D numpy array (batch of one) holding the CLIP text embedding.
    """
    with torch.no_grad():
        # Encode the description using the CLIP processor.
        # (A separate `tokenizer([text], ...)` call used to precede this line,
        # but its result was immediately overwritten — dead code, removed.)
        inputs = processor(text=[text], images=None, return_tensors="pt", padding=True)
        # Move inputs to the model's device so this also works under CUDA,
        # then bring the result back to CPU before converting to numpy.
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
        text_encoded = model.get_text_features(**inputs).detach().cpu().numpy()
    return text_encoded
44
+
45
def encode_image(image):
    """Encode an uploaded image into a normalized CLIP image feature vector.

    Args:
        image: numpy array from the Gradio image widget.

    Returns:
        A 2-D numpy array (batch of one) with the L2-normalized CLIP embedding.
    """
    pil_image = PILIMAGE.fromarray(image.astype('uint8'), 'RGB')
    with torch.no_grad():
        # Preprocess exactly as CLIP expects, then embed on the model's device.
        pixel_values = processor(text=None, images=pil_image, return_tensors="pt", padding=True)["pixel_values"]
        features = model.get_image_features(pixel_values.to(device))
        # Normalize so a plain dot product behaves like cosine similarity.
        features /= features.norm(dim=-1, keepdim=True)
        return features.cpu().numpy()
53
+
54
# Labels for the two search modes offered in the UI.
T2I = "Text2Image"
I2I = "Image2Image"


def similarity(feature, photo_features):
    """Score one query feature against every photo feature.

    Args:
        feature: 2-D array holding a single query embedding (shape (1, d)).
        photo_features: 2-D array of photo embeddings (shape (n, d)).

    Returns:
        A list of n per-photo similarity scores (dot products).
    """
    scores = feature @ photo_features.T
    return list(scores.squeeze(0))
60
+
61
def find_best_matches(image, mode, text):
    """Return the dataset photos that best match the query.

    Args:
        image: numpy image from the Gradio image input (used when mode is
            Image2Image; may be None in Text2Image mode).
        mode: one of the T2I / I2I radio labels; anything other than T2I
            falls through to image search, matching the original behavior.
        text: the text query (used when mode is Text2Image).

    Returns:
        A list of up to three PIL images downloaded from Unsplash, ordered
        by decreasing similarity to the query.
    """
    # Encode the query with the encoder that matches the selected mode.
    # (Removed: a debug print of the mode and `feature = ...` assignments
    # that were never read.)
    if mode == T2I:
        query_features = encode_text(text)
    else:
        query_features = encode_image(image)

    # Compute the similarity between the description and each photo.
    similarities = similarity(query_features, photo_features)

    # Sort the photos by their similarity score, best first.
    best_photos = sorted(zip(similarities, range(photo_features.shape[0])),
                         key=lambda pair: pair[0], reverse=True)

    matched_images = []
    # Slicing (rather than indexing 0..2) also copes with datasets of fewer
    # than three photos instead of raising IndexError.
    for _score, idx in best_photos[:3]:
        # Retrieve the photo ID, then all metadata for this photo.
        photo_id = photo_ids[idx]
        photo_data = photos[photos["photo_id"] == photo_id].iloc[0]

        # Download a 640px-wide rendition for display; bounded timeout so a
        # stalled CDN request cannot hang the UI indefinitely.
        response = requests.get(photo_data["photo_image_url"] + "?w=640", timeout=10)
        img = PILIMAGE.open(BytesIO(response.content))
        matched_images.append(img)
    return matched_images
96
+
97
+
98
+
99
+
100
# Build and launch the Gradio UI: an optional image input, a mode selector,
# and a text box; the output is a gallery showing the top-three matches.
# NOTE(review): `gr.Image(optional=True)`, `Gallery(...).style(...)` and the
# `enable_queue=` argument belong to the Gradio 3.x API and were removed in
# Gradio 4.x — pin the gradio version in requirements.txt or migrate. Also
# presumably requirements.txt should list gradio/pandas/requests/Pillow,
# which this app imports but the diff above does not include — verify.
gr.Interface(fn=find_best_matches,
             inputs=[
                 gr.Image(label="Image to search", optional=True),
                 gr.Radio([T2I, I2I]),
                 gr.Textbox(lines=1, label="Text query", placeholder="Introduce the search text...",
                            )],
             theme="grass",
             outputs=[gr.Gallery(
                 label="Generated images", show_label=False, elem_id="gallery"
             ).style(grid=[2], height="auto")], enable_queue=True, title="CLIP Image Search",
             description="This application displays TOP THREE images from Unsplash dataset that best match the search query provided by the user. Moreover, the input can be provided via two modes ie text or image form.").launch()
111
+
features.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31ac381e52fa007821a642b5808ac9a6eaf7163322ab340d36bcc3c2a94a38c8
3
+ size 25596032
photo_ids.csv ADDED
The diff for this file is too large to render. See raw diff
 
photos.tsv000 ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sentence-transformers
2
+ transformers
3
+ torch
4
+ numpy
5
+ ftfy