google
/

tipsv2-so400m14

@@ -24,7 +24,7 @@ TIPSv2 (Text-Image Pre-training with Spatial awareness) is a family of contrasti
 ## Usage
 ```bash
-pip install transformers torch torchvision sentencepiece
 ```
 ### Load the model
@@ -43,14 +43,17 @@ Images should be tensors in `[0, 1]` range (just `ToTensor()`, no ImageNet norma
 ```python
 from torchvision import transforms
 from PIL import Image
 transform = transforms.Compose([
     transforms.Resize((448, 448)),
     transforms.ToTensor(),
 ])
-image = transform(Image.open("photo.jpg")).unsqueeze(0)
-out = model.encode_image(image)
 out.cls_token      # (B, 1, 1152)
 out.patch_tokens   # (B, N, 1152)
@@ -76,6 +79,15 @@ prediction = similarity.argmax(dim=-1)
 ### Zero-shot segmentation
 ```python
 import numpy as np
 from sklearn.decomposition import PCA
@@ -89,7 +101,7 @@ rgb = PCA(n_components=3).fit_transform(feat).reshape(32, 32, 3)
 ```python
 model = model.cuda()
-out = model.encode_image(image.cuda())
 text_emb = model.encode_text(["a city"])
 ```

 ## Usage
 ```bash
+pip install transformers torch torchvision sentencepiece scikit-learn
 ```
 ### Load the model
 ```python
 from torchvision import transforms
 from PIL import Image
+import requests
 transform = transforms.Compose([
     transforms.Resize((448, 448)),
     transforms.ToTensor(),
 ])
+url = "https://raw.githubusercontent.com/google-deepmind/tips/main/scenic/images/example_image.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+pixel_values = transform(image).unsqueeze(0)
+out = model.encode_image(pixel_values)
 out.cls_token      # (B, 1, 1152)
 out.patch_tokens   # (B, N, 1152)
 ### Zero-shot segmentation
+```python
+classes = ["cat", "dog", "grass", "sky"]
+patch_feats = F.normalize(out.patch_tokens, dim=-1)
+text_emb = F.normalize(model.encode_text(classes), dim=-1)
+# The 32x32 grid matches the 448x448 resize above; the reshape assumes a single image (B == 1).
+seg_map = (patch_feats @ text_emb.T).reshape(32, 32, len(classes)).argmax(dim=-1)
+```
+### Visualize spatial features
 ```python
 import numpy as np
 from sklearn.decomposition import PCA
 ```python
 model = model.cuda()
+out = model.encode_image(pixel_values.cuda())
 text_emb = model.encode_text(["a city"])
 ```