Update app.py
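This commit swaps the text encoder in app.py from the original OpenAI `clip` package over to the Hugging Face `transformers` CLIP implementation (keeping the original model loaded and the old encoding path as comments), and rewires the Gradio classification callbacks to pass `predict` with explicit `inputs`/`outputs`.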
app.py CHANGED
@@ -12,8 +12,8 @@ import torch
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # # Load the pre-trained model and processor
-
-
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
 
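The two new `from_pretrained` calls assume the Hugging Face classes are importable alongside the original `clip` package. A minimal sketch of the import block this loading path needs (the actual import list at the top of app.py is not shown in this diff; only `import torch` is visible in the hunk header):

```python
# Sketch of the assumed imports; not visible in this diff.
import torch
import clip  # original OpenAI CLIP package, still used for orig_clip_model
from transformers import CLIPModel, CLIPProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face CLIP: model weights plus the matching text/image preprocessor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
```

Both loading paths target the same ViT-B/32 architecture; "openai/clip-vit-base-patch32" is the Hub checkpoint corresponding to the original "ViT-B/32" weights.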
@@ -64,10 +64,19 @@ with open(emb_filename, 'rb') as emb:
 
 def search(search_query):
 
+
+
+
+
     with torch.no_grad():
-
-
-
+
+        # Encode and normalize the description using CLIP (HF CLIP)
+        inputs = processor(text=[text], images=None, return_tensors="pt", padding=True)
+        text_encoded = model.get_text_features(**inputs)
+
+        # # Encode and normalize the description using CLIP (original CLIP)
+        # text_encoded = orig_clip_model.encode_text(clip.tokenize(search_query))
+        # text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
 
 
     # Retrieve the description vector
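Two details of the new encoding path are worth flagging. The four added lines above `with torch.no_grad():` are empty in this view, so the `text` variable passed to the processor is presumably bound from `search_query` there; and the HF path as shown does not re-apply the L2 normalization that the commented-out original-CLIP path performed. A self-contained sketch of the step under those assumptions, reusing `model` and `processor` from the loading sketch above (the `text = search_query` binding and the normalization line are assumptions, not visible code):

```python
def encode_query(search_query):
    # Assumption: the elided added lines bind the query string to `text`
    text = search_query
    with torch.no_grad():
        # Encode the description with HF CLIP
        inputs = processor(text=[text], images=None, return_tensors="pt", padding=True)
        text_encoded = model.get_text_features(**inputs)
        # Assumption: keep the unit-normalization the original-CLIP path did,
        # so dot products against the precomputed image embeddings remain
        # cosine similarities
        text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
    return text_encoded
```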
@@ -136,8 +145,8 @@ with gr.Blocks(css=".caption-text {font-size: 40px !important;}") as demo:
         label_text.blur(fn=set_labels, inputs=label_text, outputs=labels)  # parse list if focus is moved elsewhere; ensures that list is fully parsed before classification
         label_text.submit(fn=set_labels, inputs=label_text, outputs=labels)  # parse list if user hits enter; ensures that list is fully parsed before classification
         get_btn.click(fn=rand_image, outputs=im)
-        im.change(
-        reclass_btn.click(
+        im.change(predict, inputs=[im, labels], outputs=cf)
+        reclass_btn.click(predict, inputs=[im, labels], outputs=cf)
 
     with gr.Tab("Image Captioning"):
         with gr.Row():
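`im.change` and `reclass_btn.click` now pass `predict` positionally with explicit `inputs`/`outputs`, so the same classifier runs both when a new image loads and on explicit request. A hypothetical sketch of how these components could be declared and wired; the component constructors, the tab label, and the stub bodies are assumptions (only the event wiring and the names `im`, `labels`, `cf`, `label_text`, `get_btn`, `reclass_btn` appear in the diff; the real `predict`, `rand_image`, and `set_labels` are defined elsewhere in app.py):

```python
import gradio as gr

# Stubs standing in for functions defined elsewhere in app.py
def set_labels(text):
    return [l.strip() for l in text.split(",") if l.strip()]

def rand_image():
    ...  # returns a PIL image in the real app

def predict(img, labels):
    ...  # returns {label: score} for gr.Label in the real app

with gr.Blocks(css=".caption-text {font-size: 40px !important;}") as demo:
    with gr.Tab("Image Classification"):  # tab label assumed
        im = gr.Image(type="pil")                      # image under test
        label_text = gr.Textbox(label="Candidate labels, comma-separated")
        labels = gr.State([])                          # parsed label list
        cf = gr.Label()                                # classification scores
        get_btn = gr.Button("Random image")
        reclass_btn = gr.Button("Classify")

        # Parse the label list when focus leaves the box or Enter is pressed
        label_text.blur(fn=set_labels, inputs=label_text, outputs=labels)
        label_text.submit(fn=set_labels, inputs=label_text, outputs=labels)
        get_btn.click(fn=rand_image, outputs=im)
        # Classify on image change and on explicit request
        im.change(predict, inputs=[im, labels], outputs=cf)
        reclass_btn.click(predict, inputs=[im, labels], outputs=cf)
```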