Spaces:

ryaalbr
/

QuestApp

Sleeping

ryaalbr committed on Apr 11, 2023

Commit

eb1df9f

1 Parent(s): 2d9d672

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -40,6 +40,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 #orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
@@ -58,7 +59,7 @@ height = 256   # height for resizing images
 def predict(image, labels):
     with torch.no_grad():
-        inputs = clip_processor(text=[f"a photo of {c}" for c in labels], images=image, return_tensors="pt", padding=True)
         outputs = clip_model(**inputs)
         logits_per_image = outputs.logits_per_image # this is the image-text similarity score
         probs = logits_per_image.softmax(dim=1).cpu().numpy() # we can take the softmax to get the label probabilities
@@ -144,7 +145,7 @@ def get_caption(img,model_name):
     model = model.eval()
     model = model.to(device)
-    clip_model = clip_model.to(device)
     input = clip_processor(images=img, return_tensors="pt").to(device)
     with torch.no_grad():
@@ -181,7 +182,7 @@ def search(search_query):
     with torch.no_grad():
         # Encode and normalize the description using CLIP (HF CLIP)
-        inputs = clip_processor(text=search_query, images=None, return_tensors="pt", padding=True)
         text_encoded = clip_model.get_text_features(**inputs)
         # # Encode and normalize the description using CLIP (original CLIP)

 clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+clip_model = clip_model.to(device)
 #orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
 def predict(image, labels):
     with torch.no_grad():
+        inputs = clip_processor(text=[f"a photo of {c}" for c in labels], images=image, return_tensors="pt", padding=True).to(device)
         outputs = clip_model(**inputs)
         logits_per_image = outputs.logits_per_image # this is the image-text similarity score
         probs = logits_per_image.softmax(dim=1).cpu().numpy() # we can take the softmax to get the label probabilities
     model = model.eval()
     model = model.to(device)
     input = clip_processor(images=img, return_tensors="pt").to(device)
     with torch.no_grad():
     with torch.no_grad():
         # Encode and normalize the description using CLIP (HF CLIP)
+        inputs = clip_processor(text=search_query, images=None, return_tensors="pt", padding=True).to(device)
         text_encoded = clip_model.get_text_features(**inputs)
         # # Encode and normalize the description using CLIP (original CLIP)