google/tipsv2-l14

@@ -55,15 +55,15 @@ image = Image.open(requests.get(url, stream=True).raw)
 pixel_values = transform(image).unsqueeze(0)
 out = model.encode_image(pixel_values)
-print(out.cls_token.shape)     # (1, 1, 1024)
-print(out.patch_tokens.shape)  # (1, 1024, 1024)
 ```
 ### Encode text
 ```python
 text_emb = model.encode_text(["a photo of a cat", "a photo of a dog"])
-print(text_emb.shape)  # (2, 1024)
 ```
 ### Zero-shot classification
@@ -75,17 +75,17 @@ classes = ["cat", "dog", "car"]
 cls = F.normalize(out.cls_token[:, 0, :], dim=-1)
 text_emb = F.normalize(model.encode_text(classes), dim=-1)
 similarity = cls @ text_emb.T
-print(classes[similarity.argmax()])  # cat
 ```
 ### Zero-shot segmentation
 ```python
-classes = ["cat", "dog", "grass", "sky"]
 patch_feats = F.normalize(out.patch_tokens, dim=-1)
 text_emb = F.normalize(model.encode_text(classes), dim=-1)
 seg_map = (patch_feats @ text_emb.T).reshape(32, 32, len(classes)).argmax(dim=-1)
-print(seg_map.shape)  # (32, 32)
 ```
 ### Visualize spatial features
@@ -97,7 +97,7 @@ from sklearn.decomposition import PCA
 spatial = out.patch_tokens.reshape(1, 32, 32, 1024)
 feat = spatial[0].detach().numpy().reshape(-1, 1024)
 rgb = PCA(n_components=3).fit_transform(feat).reshape(32, 32, 3)
-print(rgb.shape)  # (32, 32, 3)
 ```
 ### GPU inference

 pixel_values = transform(image).unsqueeze(0)
 out = model.encode_image(pixel_values)
+print(out.cls_token.shape)     # (1, 1, 1024) — global image embedding
+print(out.patch_tokens.shape)  # (1, 1024, 1024) — per-patch spatial features
 ```
 ### Encode text
 ```python
 text_emb = model.encode_text(["a photo of a cat", "a photo of a dog"])
+print(text_emb.shape)  # (2, 1024) — one embedding per query
 ```
 ### Zero-shot classification
 cls = F.normalize(out.cls_token[:, 0, :], dim=-1)
 text_emb = F.normalize(model.encode_text(classes), dim=-1)
 similarity = cls @ text_emb.T
+print(classes[similarity.argmax()])  # cat — predicted class
 ```
 ### Zero-shot segmentation
 ```python
+classes = ["cat", "carpet", "floor", "furniture"]
 patch_feats = F.normalize(out.patch_tokens, dim=-1)
 text_emb = F.normalize(model.encode_text(classes), dim=-1)
 seg_map = (patch_feats @ text_emb.T).reshape(32, 32, len(classes)).argmax(dim=-1)
+print(seg_map.shape)  # (32, 32) — per-patch class prediction
 ```
 ### Visualize spatial features
 spatial = out.patch_tokens.reshape(1, 32, 32, 1024)
 feat = spatial[0].detach().numpy().reshape(-1, 1024)
 rgb = PCA(n_components=3).fit_transform(feat).reshape(32, 32, 3)
+print(rgb.shape)  # (32, 32, 3) — PCA of patch features as RGB
 ```
 ### GPU inference