Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -40,6 +40,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 40 |
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 41 |
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 42 |
|
|
|
|
| 43 |
#orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
|
| 44 |
|
| 45 |
|
|
@@ -58,7 +59,7 @@ height = 256 # height for resizing images
|
|
| 58 |
|
| 59 |
def predict(image, labels):
|
| 60 |
with torch.no_grad():
|
| 61 |
-
inputs = clip_processor(text=[f"a photo of {c}" for c in labels], images=image, return_tensors="pt", padding=True)
|
| 62 |
outputs = clip_model(**inputs)
|
| 63 |
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
|
| 64 |
probs = logits_per_image.softmax(dim=1).cpu().numpy() # we can take the softmax to get the label probabilities
|
|
@@ -144,7 +145,7 @@ def get_caption(img,model_name):
|
|
| 144 |
model = model.eval()
|
| 145 |
model = model.to(device)
|
| 146 |
|
| 147 |
-
|
| 148 |
|
| 149 |
input = clip_processor(images=img, return_tensors="pt").to(device)
|
| 150 |
with torch.no_grad():
|
|
@@ -181,7 +182,7 @@ def search(search_query):
|
|
| 181 |
with torch.no_grad():
|
| 182 |
|
| 183 |
# Encode and normalize the description using CLIP (HF CLIP)
|
| 184 |
-
inputs = clip_processor(text=search_query, images=None, return_tensors="pt", padding=True)
|
| 185 |
text_encoded = clip_model.get_text_features(**inputs)
|
| 186 |
|
| 187 |
# # Encode and normalize the description using CLIP (original CLIP)
|
|
|
|
| 40 |
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 41 |
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 42 |
|
| 43 |
+
clip_model = clip_model.to(device)
|
| 44 |
#orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
|
| 45 |
|
| 46 |
|
|
|
|
| 59 |
|
| 60 |
def predict(image, labels):
|
| 61 |
with torch.no_grad():
|
| 62 |
+
inputs = clip_processor(text=[f"a photo of {c}" for c in labels], images=image, return_tensors="pt", padding=True).to(device)
|
| 63 |
outputs = clip_model(**inputs)
|
| 64 |
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
|
| 65 |
probs = logits_per_image.softmax(dim=1).cpu().numpy() # we can take the softmax to get the label probabilities
|
|
|
|
| 145 |
model = model.eval()
|
| 146 |
model = model.to(device)
|
| 147 |
|
| 148 |
+
|
| 149 |
|
| 150 |
input = clip_processor(images=img, return_tensors="pt").to(device)
|
| 151 |
with torch.no_grad():
|
|
|
|
| 182 |
with torch.no_grad():
|
| 183 |
|
| 184 |
# Encode and normalize the description using CLIP (HF CLIP)
|
| 185 |
+
inputs = clip_processor(text=search_query, images=None, return_tensors="pt", padding=True).to(device)
|
| 186 |
text_encoded = clip_model.get_text_features(**inputs)
|
| 187 |
|
| 188 |
# # Encode and normalize the description using CLIP (original CLIP)
|