google/tipsv2-b14

@@ -50,20 +50,20 @@ transform = transforms.Compose([
     transforms.ToTensor(),
 ])
-url = "https://raw.githubusercontent.com/google-deepmind/tips/main/scenic/images/example_image.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
 pixel_values = transform(image).unsqueeze(0)
 out = model.encode_image(pixel_values)
-out.cls_token      # (B, 1, 768)
-out.patch_tokens   # (B, N, 768)
 ```
 ### Encode text
 ```python
 text_emb = model.encode_text(["a photo of a cat", "a photo of a dog"])
-# (2, 768)
 ```
 ### Zero-shot classification
@@ -71,10 +71,11 @@ text_emb = model.encode_text(["a photo of a cat", "a photo of a dog"])
 ```python
 import torch.nn.functional as F
 cls = F.normalize(out.cls_token[:, 0, :], dim=-1)
-text_emb = F.normalize(model.encode_text(["cat", "dog", "car"]), dim=-1)
 similarity = cls @ text_emb.T
-prediction = similarity.argmax(dim=-1)
 ```
 ### Zero-shot segmentation
@@ -84,6 +85,7 @@ classes = ["cat", "dog", "grass", "sky"]
 patch_feats = F.normalize(out.patch_tokens, dim=-1)
 text_emb = F.normalize(model.encode_text(classes), dim=-1)
 seg_map = (patch_feats @ text_emb.T).reshape(32, 32, len(classes)).argmax(dim=-1)
 ```
 ### Visualize spatial features
@@ -95,6 +97,7 @@ from sklearn.decomposition import PCA
 spatial = out.patch_tokens.reshape(1, 32, 32, 768)
 feat = spatial[0].detach().numpy().reshape(-1, 768)
 rgb = PCA(n_components=3).fit_transform(feat).reshape(32, 32, 3)
 ```
 ### GPU inference

     transforms.ToTensor(),
 ])
+url = "https://raw.githubusercontent.com/google-deepmind/tips/main/scenic/images/example_image_2.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
 pixel_values = transform(image).unsqueeze(0)
 out = model.encode_image(pixel_values)
+print(out.cls_token.shape)     # (1, 1, 768)
+print(out.patch_tokens.shape)  # (1, 1024, 768)
 ```
 ### Encode text
 ```python
 text_emb = model.encode_text(["a photo of a cat", "a photo of a dog"])
+print(text_emb.shape)  # (2, 768)
 ```
 ### Zero-shot classification
 ```python
 import torch.nn.functional as F
+classes = ["cat", "dog", "car"]
 cls = F.normalize(out.cls_token[:, 0, :], dim=-1)
+text_emb = F.normalize(model.encode_text(classes), dim=-1)
 similarity = cls @ text_emb.T
+print(classes[similarity.argmax()])  # cat
 ```
 ### Zero-shot segmentation
 patch_feats = F.normalize(out.patch_tokens, dim=-1)
 text_emb = F.normalize(model.encode_text(classes), dim=-1)
 seg_map = (patch_feats @ text_emb.T).reshape(32, 32, len(classes)).argmax(dim=-1)
+print(seg_map.shape)  # (32, 32)
 ```
 ### Visualize spatial features
 spatial = out.patch_tokens.reshape(1, 32, 32, 768)
 feat = spatial[0].detach().numpy().reshape(-1, 768)
 rgb = PCA(n_components=3).fit_transform(feat).reshape(32, 32, 3)
+print(rgb.shape)  # (32, 32, 3)
 ```
 ### GPU inference