Update app.py
app.py CHANGED
@@ -9,12 +9,14 @@ import pickle
 import requests
 import torch
 
-
-
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# # Load the pre-trained model and processor
+# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
 
-# Load the pre-trained model and processor
-model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 # Load the Unsplash dataset
 dataset = load_dataset("jamescalam/unsplash-25k-photos", split="train") # all 25K images are in train split
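
Both loading paths yield an equivalent zero-shot classifier; this hunk swaps the Hugging Face `transformers` wrappers for the original OpenAI `clip` package, presumably so the model matches the implementation used to precompute the Unsplash embeddings loaded later. For reference, a minimal sketch of the `transformers` route being commented out (the image path and labels are illustrative, not from the app):

```python
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("photo.jpg")  # illustrative input
labels = ["a photo of a dog", "a photo of a cat"]

# Tokenize the labels and preprocess the image in one call
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
logits_per_image = model(**inputs).logits_per_image  # shape (1, len(labels))
probs = logits_per_image.softmax(dim=1)
print({k: float(v) for k, v in zip(labels, probs[0])})
```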
@@ -28,6 +30,17 @@ def predict(image, labels):
     probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
     return {k: float(v) for k, v in zip(labels, probs[0])}
 
+
+def predict2(image, labels):
+    image = orig_clip_processor(image).unsqueeze(0).to(device)
+    text = clip.tokenize(labels).to(device)
+    with torch.no_grad():
+        image_features = orig_clip_model.encode_image(image)
+        text_features = orig_clip_model.encode_text(text)
+        logits_per_image, logits_per_text = orig_clip_model(image, text)
+    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
+    return {k: float(v) for k, v in zip(labels, probs[0])}
+
 def rand_image():
     n = dataset.num_rows
     r = random.randrange(0,n)
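
Note that `predict2` computes `image_features` and `text_features` but never uses them: the forward call `orig_clip_model(image, text)` re-encodes both inputs internally. Since the `transformers`-based `model`/`processor` are now commented out, `predict2` is the only working classifier after this commit. If the intermediate encodings are wanted anyway (for example, to cache them), the same probabilities can be derived from the features directly. A sketch, assuming the standard OpenAI CLIP scoring of logit-scaled cosine similarity between unit-normalized embeddings:

```python
import torch

def predict2_from_features(image, labels):
    # hypothetical variant of predict2, reusing the encodings it already computes
    image_input = orig_clip_processor(image).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(labels).to(device)
    with torch.no_grad():
        image_features = orig_clip_model.encode_image(image_input)
        text_features = orig_clip_model.encode_text(text_tokens)
    # CLIP logits are scaled cosine similarities between unit-norm embeddings
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    logits = orig_clip_model.logit_scale.exp() * image_features @ text_features.t()
    probs = logits.softmax(dim=-1).cpu().numpy()
    return {k: float(v) for k, v in zip(labels, probs[0])}
```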

@@ -48,7 +61,6 @@ emb_filename = 'unsplash-25k-photos-embeddings-indexes.pkl'
 with open(emb_filename, 'rb') as emb:
     id2url, img_names, img_emb = pickle.load(emb)
 
-orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
 
 def search(search_query):
 
@@ -124,8 +136,8 @@ with gr.Blocks() as demo:
         label_text.blur(fn=set_labels, inputs=label_text, outputs=labels) # parse list if focus is moved elsewhere; ensures that list is fully parsed before classification
         label_text.submit(fn=set_labels, inputs=label_text, outputs=labels) # parse list if user hits enter; ensures that list is fully parsed before classification
         get_btn.click(fn=rand_image, outputs=im)
-        im.change(predict, inputs=[im, labels], outputs=cf)
-        reclass_btn.click(predict, inputs=[im, labels], outputs=cf)
+        im.change(predict2, inputs=[im, labels], outputs=cf)
+        reclass_btn.click(predict2, inputs=[im, labels], outputs=cf)
 
     with gr.Tab("Image Captioning"):
         with gr.Row():
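
The rebinding routes classification through `predict2` both when the image changes (e.g. after `get_btn` loads a random photo) and when the user explicitly reclassifies. A self-contained sketch of this wiring pattern, with component names mirroring the diff and a stub standing in for `predict2`:

```python
import gradio as gr

def classify(image, labels):
    # stand-in for predict2; returns a {label: probability} mapping for gr.Label
    return {label: 1.0 / len(labels) for label in labels}

with gr.Blocks() as demo:
    im = gr.Image(type="pil")
    labels = gr.State(["a photo of a dog", "a photo of a cat"])  # illustrative default
    cf = gr.Label(num_top_classes=2)
    reclass_btn = gr.Button("Reclassify")

    im.change(classify, inputs=[im, labels], outputs=cf)          # fires whenever the image updates
    reclass_btn.click(classify, inputs=[im, labels], outputs=cf)  # manual re-run on the current image

demo.launch()
```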