Commit ·
29cb4d8
1
Parent(s): 7de983f
First commit
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +1 -0
- .gitignore +1 -0
- LICENSE.md +66 -0
- README.md +118 -4
- assets/overview.png +3 -0
- assets/pikachu.png +3 -0
- assets/pikachu_seg.png +3 -0
- assets/qualitatives.png +3 -0
- assets/qualitatives/cityscapes/1_clipdinoiser.png +3 -0
- assets/qualitatives/cityscapes/1_freeda.png +3 -0
- assets/qualitatives/cityscapes/1_gt.png +3 -0
- assets/qualitatives/cityscapes/1_image.png +3 -0
- assets/qualitatives/cityscapes/1_proxyclip.png +3 -0
- assets/qualitatives/cityscapes/1_talk2dino.png +3 -0
- assets/qualitatives/cityscapes/1r_clipdinoiser.png +3 -0
- assets/qualitatives/cityscapes/1r_freeda.png +3 -0
- assets/qualitatives/cityscapes/1r_gt.png +3 -0
- assets/qualitatives/cityscapes/1r_image.png +3 -0
- assets/qualitatives/cityscapes/1r_proxyclip.png +3 -0
- assets/qualitatives/cityscapes/1r_talk2dino.png +3 -0
- assets/qualitatives/context/1r_clipdinoiser.png +3 -0
- assets/qualitatives/context/1r_freeda.png +3 -0
- assets/qualitatives/context/1r_gt.png +3 -0
- assets/qualitatives/context/1r_img.png +3 -0
- assets/qualitatives/context/1r_proxy.png +3 -0
- assets/qualitatives/context/1r_talk2dino.png +3 -0
- assets/qualitatives/object/2r_clipdinoiser.png +3 -0
- assets/qualitatives/object/2r_freeda.png +3 -0
- assets/qualitatives/object/2r_gt.png +3 -0
- assets/qualitatives/object/2r_img.png +3 -0
- assets/qualitatives/object/2r_proxy.png +3 -0
- assets/qualitatives/object/2r_talk2dino.png +3 -0
- assets/qualitatives/voc/1_clipdinoiser.png +3 -0
- assets/qualitatives/voc/1_freeda.png +3 -0
- assets/qualitatives/voc/1_gt.png +3 -0
- assets/qualitatives/voc/1_img.jpg +0 -0
- assets/qualitatives/voc/1_proxy.png +3 -0
- assets/qualitatives/voc/1_talk2dino.png +3 -0
- assets/qualitatives/voc/2_clipdinoiser.png +3 -0
- assets/qualitatives/voc/2_freeda.png +3 -0
- assets/qualitatives/voc/2_gt.png +3 -0
- assets/qualitatives/voc/2_img.jpg +0 -0
- assets/qualitatives/voc/2_proxy.png +3 -0
- assets/qualitatives/voc/2_talk2dino.png +3 -0
- config.json +6 -0
- configuration_talk2dino.py +49 -0
- dinotext.py +399 -0
- hf_demo.ipynb +0 -0
- hooks.py +52 -0
- masker.py +246 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
LICENSE.md
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DINOv3 License
|
| 2 |
+
|
| 3 |
+
*Last Updated: August 19, 2025*
|
| 4 |
+
|
| 5 |
+
**“Agreement”** means the terms and conditions for use, reproduction, distribution and modification of the DINO Materials set forth herein.
|
| 6 |
+
|
| 7 |
+
**“DINO Materials”** means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, and other elements of the foregoing distributed by Meta and made available under this Agreement.
|
| 8 |
+
|
| 9 |
+
**“Documentation”** means the specifications, manuals and documentation accompanying
|
| 10 |
+
DINO Materials distributed by Meta.
|
| 11 |
+
|
| 12 |
+
**“Licensee”** or **“you”** means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
|
| 13 |
+
|
| 14 |
+
**“Meta”** or **“we”** means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) or Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
|
| 15 |
+
|
| 16 |
+
**“Sanctions”** means any economic or trade sanctions or restrictions administered or enforced by the United States (including the Office of Foreign Assets Control of the U.S. Department of the Treasury (“OFAC”), the U.S. Department of State and the U.S. Department of Commerce), the United Nations, the European Union, or the United Kingdom.
|
| 17 |
+
|
| 18 |
+
**“Trade Controls”** means any of the following: Sanctions and applicable export and import controls.
|
| 19 |
+
|
| 20 |
+
By clicking “I Accept” below or by using or distributing any portion or element of the DINO Materials, you agree to be bound by this Agreement.
|
| 21 |
+
|
| 22 |
+
## 1. License Rights and Redistribution.
|
| 23 |
+
|
| 24 |
+
a. <ins>Grant of Rights</ins>. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the DINO Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the DINO Materials.
|
| 25 |
+
|
| 26 |
+
b. <ins>Redistribution and Use</ins>.
|
| 27 |
+
|
| 28 |
+
i. Distribution of DINO Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the DINO Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement and you shall provide a copy of this Agreement with any such DINO Materials.
|
| 29 |
+
|
| 30 |
+
ii. If you submit for publication the results of research you perform on, using, or otherwise in connection with DINO Materials, you must acknowledge the use of DINO Materials in your publication.
|
| 31 |
+
|
| 32 |
+
iii. Your use of the DINO Materials must comply with applicable laws and regulations, including Trade Control Laws and applicable privacy and data protection laws.
|
| 33 |
+
|
| 34 |
+
iv. Your use of the DINO Materials will not involve or encourage others to reverse engineer, decompile or discover the underlying components of the DINO Materials.
|
| 35 |
+
|
| 36 |
+
v. You are not the target of Trade Controls and your use of DINO Materials must comply with Trade Controls. You agree not to use, or permit others to use, DINO Materials for any activities subject to the International Traffic in Arms Regulations (ITAR) or end uses prohibited by Trade Controls, including those related to military or warfare purposes, nuclear industries or applications, espionage, or the development or use of guns or illegal weapons.
|
| 37 |
+
|
| 38 |
+
## 2. User Support.
|
| 39 |
+
|
| 40 |
+
Your use of the DINO Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the DINO Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
|
| 41 |
+
|
| 42 |
+
## 3. Disclaimer of Warranty.
|
| 43 |
+
|
| 44 |
+
UNLESS REQUIRED BY APPLICABLE LAW, THE DINO MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE DINO MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE DINO MATERIALS AND ANY OUTPUT AND RESULTS.
|
| 45 |
+
|
| 46 |
+
## 4. Limitation of Liability.
|
| 47 |
+
|
| 48 |
+
IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
|
| 49 |
+
|
| 50 |
+
## 5. Intellectual Property.
|
| 51 |
+
|
| 52 |
+
a. Subject to Meta’s ownership of DINO Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the DINO Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
|
| 53 |
+
|
| 54 |
+
b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the DINO Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the DINO Materials.
|
| 55 |
+
|
| 56 |
+
## 6. Term and Termination.
|
| 57 |
+
|
| 58 |
+
The term of this Agreement will commence upon your acceptance of this Agreement or access to the DINO Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the DINO Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
|
| 59 |
+
|
| 60 |
+
## 7. Governing Law and Jurisdiction.
|
| 61 |
+
|
| 62 |
+
This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
|
| 63 |
+
|
| 64 |
+
## 8. Modifications and Amendments.
|
| 65 |
+
|
| 66 |
+
Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the DINO Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
|
README.md
CHANGED
|
@@ -1,10 +1,124 @@
|
|
| 1 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
tags:
|
| 3 |
- model_hub_mixin
|
| 4 |
- pytorch_model_hub_mixin
|
|
|
|
|
|
|
|
|
|
| 5 |
---
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
license: other
|
| 3 |
+
license_name: dinov3-license
|
| 4 |
+
pipeline_tag: image-segmentation
|
| 5 |
+
library_name: Pytorch
|
| 6 |
tags:
|
| 7 |
- model_hub_mixin
|
| 8 |
- pytorch_model_hub_mixin
|
| 9 |
+
- DINOv3
|
| 10 |
+
- CLIP
|
| 11 |
+
- open-vocabulary segmentation
|
| 12 |
---
|
| 13 |
|
| 14 |
+
<div align="center">
|
| 15 |
+
<h1>
|
| 16 |
+
Talking to DINO: Bridging Self-Supervised Vision Backbones with Language for Open-Vocabulary Segmentation (ICCV 2025)
|
| 17 |
+
</h1>
|
| 18 |
+
|
| 19 |
+
<h3>
|
| 20 |
+
<a href="https://www.linkedin.com/in/luca-barsellotti/">Luca Barsellotti*</a> 
|
| 21 |
+
<a href="https://www.linkedin.com/in/lorenzo-bianchi-893bb225a/">Lorenzo Bianchi*</a> 
|
| 22 |
+
<a href="https://www.linkedin.com/in/nicola-messina-a33848164/">Nicola Messina</a> 
|
| 23 |
+
<a href="https://www.linkedin.com/in/fabio-carrara-b28a2b111/">Fabio Carrara</a> 
|
| 24 |
+
<a href="https://aimagelab.ing.unimore.it/imagelab/person.asp?idpersona=90">Marcella Cornia</a> 
|
| 25 |
+
<a href="https://www.lorenzobaraldi.com/">Lorenzo Baraldi</a> 
|
| 26 |
+
<a href="https://fabriziofalchi.it">Fabrizio Falchi</a> 
|
| 27 |
+
<a href="https://www.linkedin.com/in/rita-cucchiara-a4653a13/">Rita Cucchiara</a>
|
| 28 |
+
</h3>
|
| 29 |
+
|
| 30 |
+
[Project Page](https://lorebianchi98.github.io/Talk2DINO/) | [Paper](http://arxiv.org/abs/2411.19331) | [Code](https://github.com/lorebianchi98/Talk2DINO)
|
| 31 |
+
|
| 32 |
+
</div>
|
| 33 |
+
|
| 34 |
+
<div align="center">
|
| 35 |
+
<figure>
|
| 36 |
+
<img alt="Overview of Talk2DINO" src="./assets/overview.png" width="90%">
|
| 37 |
+
</figure>
|
| 38 |
+
</div>
|
| 39 |
+
|
| 40 |
+
## About
|
| 41 |
+
Open-Vocabulary Segmentation (OVS) aims at segmenting images from free-form textual concepts without predefined training classes. While existing vision-language models such as CLIP can generate segmentation masks by leveraging coarse spatial information from Vision Transformers, they face challenges in spatial localization due to their global alignment of image and text features. Conversely, self-supervised visual models like DINO excel in fine-grained visual encoding but lack integration with language. To bridge this gap, we present Talk2DINO, a novel hybrid approach that combines the spatial accuracy of DINOv2 with the language understanding of CLIP. Our approach aligns the textual embeddings of CLIP to the patch-level features of DINOv2 through a learned mapping function without the need to fine-tune the underlying backbones. At training time, we exploit the attention maps of DINOv2 to selectively align local visual patches with textual embeddings. We show that the powerful semantic and localization abilities of Talk2DINO can enhance the segmentation process, resulting in more natural and less noisy segmentations, and that our approach can also effectively distinguish foreground objects from the background. Experimental results demonstrate that Talk2DINO achieves state-of-the-art performance across several unsupervised OVS benchmarks.
|
| 42 |
+
|
| 43 |
+
## Sample Usage
|
| 44 |
+
|
| 45 |
+
### Mapping CLIP Text Embeddings to DINOv3 space with Talk2DINO
|
| 46 |
+
We can use Talk2DINO to map CLIP text embeddings into the DINOv3 patch embedding space.
|
| 47 |
+
```python
|
| 48 |
+
import torch
from transformers import AutoModel
|
| 49 |
+
from torchvision.io import read_image
|
| 50 |
+
|
| 51 |
+
# Device setup
|
| 52 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 53 |
+
|
| 54 |
+
# Model Loading
|
| 55 |
+
model = AutoModel.from_pretrained("lorebianchi98/Talk2DINO_v3-ViTL").to(device).eval()
|
| 56 |
+
|
| 57 |
+
# Embedding generation
|
| 58 |
+
with torch.no_grad():
|
| 59 |
+
text_embed = model.encode_text("a pikachu")
|
| 60 |
+
image = read_image("assets/pikachu.png")  # load the input image as a tensor
    image_embed = model.encode_image(image)
|
| 61 |
+
|
| 62 |
+
# normalize the features to perform cosine similarity
|
| 63 |
+
text_embed = text_embed / text_embed.norm(dim=-1, keepdim=True)
|
| 64 |
+
image_embed = image_embed / image_embed.norm(dim=-1, keepdim=True)
|
| 65 |
+
|
| 66 |
+
similarity = (image_embed @ text_embed.T).squeeze(0, -1).cpu().numpy()
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
### Demo
|
| 70 |
+
In `hf_demo.ipynb` we provide a simple example on how to use Talk2DINO for inference on a given image with custom textual categories.
|
| 71 |
+
Result:
|
| 72 |
+
<div align="center">
|
| 73 |
+
<table><tr><td><figure>
|
| 74 |
+
<img alt="" src="./assets/pikachu.png" width=300>
|
| 75 |
+
</figure></td><td><figure>
|
| 76 |
+
<img alt="" src="./assets/pikachu_seg.png" width=300>
|
| 77 |
+
</figure></td></tr></table>
|
| 78 |
+
</div>
|
| 79 |
+
|
| 80 |
+
## Installation
|
| 81 |
+
|
| 82 |
+
To use the **Hugging Face interface** for inference:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
# Clone the repository
|
| 86 |
+
git clone https://huggingface.co/lorebianchi98/Talk2DINO-ViTB
|
| 87 |
+
cd Talk2DINO-ViTB
|
| 88 |
+
|
| 89 |
+
# Install dependencies
|
| 90 |
+
pip install -r requirements.txt
|
| 91 |
+
|
| 92 |
+
# Install PyTorch and torchvision with the appropriate CUDA version
|
| 93 |
+
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
For the **full MMCV interface** to perform evaluation on segmentation benchmarks, please refer to the [original Talk2DINO repository](https://github.com/lorebianchi98/Talk2DINO).
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
<details>
|
| 101 |
+
<summary>Qualitative Results</summary>
|
| 102 |
+
|
| 103 |
+
| **Image** | **Ground Truth** | **FreeDA** | **ProxyCLIP** | **CLIP-DINOiser** | **Ours (Talk2DINO)** |
|
| 104 |
+
|-----------|------------------|------------|---------------|-------------------|------------------|
|
| 105 |
+
|  |  |  |  |  |  |
|
| 106 |
+
|  |  |  |  |  |  |
|
| 107 |
+
|  |  |  |  |  |  |
|
| 108 |
+
|  |  |  |  |  |  |
|
| 109 |
+
</details>
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
## Reference
|
| 113 |
+
If you found this code useful, please cite the following paper:
|
| 114 |
+
```
|
| 115 |
+
@misc{barsellotti2024talkingdinobridgingselfsupervised,
|
| 116 |
+
title={Talking to DINO: Bridging Self-Supervised Vision Backbones with Language for Open-Vocabulary Segmentation},
|
| 117 |
+
author={Luca Barsellotti and Lorenzo Bianchi and Nicola Messina and Fabio Carrara and Marcella Cornia and Lorenzo Baraldi and Fabrizio Falchi and Rita Cucchiara},
|
| 118 |
+
year={2024},
|
| 119 |
+
eprint={2411.19331},
|
| 120 |
+
archivePrefix={arXiv},
|
| 121 |
+
primaryClass={cs.CV},
|
| 122 |
+
url={https://arxiv.org/abs/2411.19331},
|
| 123 |
+
}
|
| 124 |
+
```
|
assets/overview.png
ADDED
|
Git LFS Details
|
assets/pikachu.png
ADDED
|
Git LFS Details
|
assets/pikachu_seg.png
ADDED
|
Git LFS Details
|
assets/qualitatives.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1_clipdinoiser.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1_freeda.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1_gt.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1_image.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1_proxyclip.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1_talk2dino.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1r_clipdinoiser.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1r_freeda.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1r_gt.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1r_image.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1r_proxyclip.png
ADDED
|
Git LFS Details
|
assets/qualitatives/cityscapes/1r_talk2dino.png
ADDED
|
Git LFS Details
|
assets/qualitatives/context/1r_clipdinoiser.png
ADDED
|
Git LFS Details
|
assets/qualitatives/context/1r_freeda.png
ADDED
|
Git LFS Details
|
assets/qualitatives/context/1r_gt.png
ADDED
|
Git LFS Details
|
assets/qualitatives/context/1r_img.png
ADDED
|
Git LFS Details
|
assets/qualitatives/context/1r_proxy.png
ADDED
|
Git LFS Details
|
assets/qualitatives/context/1r_talk2dino.png
ADDED
|
Git LFS Details
|
assets/qualitatives/object/2r_clipdinoiser.png
ADDED
|
Git LFS Details
|
assets/qualitatives/object/2r_freeda.png
ADDED
|
Git LFS Details
|
assets/qualitatives/object/2r_gt.png
ADDED
|
Git LFS Details
|
assets/qualitatives/object/2r_img.png
ADDED
|
Git LFS Details
|
assets/qualitatives/object/2r_proxy.png
ADDED
|
Git LFS Details
|
assets/qualitatives/object/2r_talk2dino.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/1_clipdinoiser.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/1_freeda.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/1_gt.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/1_img.jpg
ADDED
|
assets/qualitatives/voc/1_proxy.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/1_talk2dino.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/2_clipdinoiser.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/2_freeda.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/2_gt.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/2_img.jpg
ADDED
|
assets/qualitatives/voc/2_proxy.png
ADDED
|
Git LFS Details
|
assets/qualitatives/voc/2_talk2dino.png
ADDED
|
Git LFS Details
|
config.json
CHANGED
|
@@ -1,4 +1,10 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"avg_self_attn_token": false,
|
| 3 |
"clip_model_name": "ViT-B/16",
|
| 4 |
"disentangled_self_attn_token": true,
|
|
|
|
| 1 |
{
|
| 2 |
+
"architectures": ["Talk2DINO"],
|
| 3 |
+
"model_type": "talk2dino",
|
| 4 |
+
"auto_map": {
|
| 5 |
+
"AutoConfig": "configuration_talk2dino.Talk2DINOConfig",
|
| 6 |
+
"AutoModel": "modeling_talk2dino.Talk2DINO"
|
| 7 |
+
},
|
| 8 |
"avg_self_attn_token": false,
|
| 9 |
"clip_model_name": "ViT-B/16",
|
| 10 |
"disentangled_self_attn_token": true,
|
configuration_talk2dino.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from transformers import PretrainedConfig
|
| 3 |
+
|
| 4 |
+
class Talk2DINOConfig(PretrainedConfig):
    """HF configuration for Talk2DINO.

    Mirrors the keyword arguments of the DINOText module (visual backbone
    name, CLIP/BERT text-encoder name, projection-layer variant,
    self-attention-token options, ...) so the model can be instantiated
    through ``AutoConfig`` / ``AutoModel``. Every constructor argument is
    stored verbatim as an attribute of the same name.
    """

    model_type = "talk2dino"

    def __init__(
        self,
        avg_self_attn_token=False,
        clip_model_name="ViT-B/16",
        disentangled_self_attn_token=True,
        is_eval=True,
        keep_cls=False,
        keep_end_seq=False,
        loss=None,
        model_name="dinov2_vitb14_reg",
        pre_trained=True,
        proj_class="vitb_mlp_infonce",
        proj_name="vitb_mlp_infonce",
        proj_model="ProjectionLayer",
        resize_dim=518,
        type="DINOText",  # NOTE: shadows the builtin `type` locally; name kept for config-file compatibility
        unfreeze_last_image_layer=False,
        unfreeze_last_text_layer=False,
        use_avg_text_token=False,
        with_bg_clean=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Copy every hyper-parameter onto the config object verbatim.
        hyper_params = dict(
            avg_self_attn_token=avg_self_attn_token,
            clip_model_name=clip_model_name,
            disentangled_self_attn_token=disentangled_self_attn_token,
            is_eval=is_eval,
            keep_cls=keep_cls,
            keep_end_seq=keep_end_seq,
            loss=loss,
            model_name=model_name,
            pre_trained=pre_trained,
            proj_class=proj_class,
            proj_model=proj_model,
            proj_name=proj_name,
            resize_dim=resize_dim,
            type=type,
            unfreeze_last_image_layer=unfreeze_last_image_layer,
            unfreeze_last_text_layer=unfreeze_last_text_layer,
            use_avg_text_token=use_avg_text_token,
            with_bg_clean=with_bg_clean,
        )
        for attr_name, attr_value in hyper_params.items():
            setattr(self, attr_name, attr_value)
|
dinotext.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import itertools
|
| 2 |
+
import os
|
| 3 |
+
import pickle
|
| 4 |
+
from math import sqrt
|
| 5 |
+
import re
|
| 6 |
+
import yaml
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import timm
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn as nn
|
| 12 |
+
import torch.nn.functional as F
|
| 13 |
+
import torchvision
|
| 14 |
+
from einops import rearrange
|
| 15 |
+
from transformers import BertModel, AutoTokenizer
|
| 16 |
+
import torchvision.transforms as T
|
| 17 |
+
import clip
|
| 18 |
+
import importlib
|
| 19 |
+
from .us import normalize
|
| 20 |
+
|
| 21 |
+
from .pamr import PAMR
|
| 22 |
+
from .masker import DINOTextMasker
|
| 23 |
+
from .templates import get_template
|
| 24 |
+
|
| 25 |
+
from .model import ProjectionLayer, VisualProjectionLayer, CLIPLastLayer, DoubleMLP
|
| 26 |
+
from .hooks import average_text_tokens, get_vit_out, feats
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class DINOText(nn.Module):
|
| 32 |
+
|
| 33 |
+
def get_self_attention(self, module, input, output):
    """Forward hook: cache the raw qkv-projection output of the last backbone block.

    The cached tensor is later turned into CLS-to-patch attention by
    ``process_self_attention``.
    """
    # Store under a fixed key so downstream code can fetch it by name.
    self.feats.update(self_attn=output)
|
| 35 |
+
|
| 36 |
+
def get_clip_second_last_dense_out(self, model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
    """Forward hook: cache CLIP's second-to-last layer output as float32.

    Bug fix: the original called ``.to(dtype=torch.float32)`` on the cached
    tensor without assigning the result — ``Tensor.to`` is NOT in-place, so
    the stored tensor silently kept its original dtype (typically float16
    for CLIP). Store the converted tensor instead.
    """
    self.feats['clip_second_last_out'] = output.to(dtype=torch.float32)
|
| 39 |
+
|
| 40 |
+
def get_all_out_tokens(self, model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
    """Forward hook on ``clip_model.ln_final``: cache ALL output text tokens
    (not just the CLS/EOS token), for averaged-text-token mode."""
    key = 'clip_txt_out_tokens'
    self.feats[key] = output
|
| 42 |
+
|
| 43 |
+
def __init__(
    self, model_name, resize_dim, clip_model_name, proj_class, proj_name, proj_model, avg_self_attn_token=False, disentangled_self_attn_token=True, loss=None, pre_trained=True,
    unfreeze_last_text_layer=False, unfreeze_last_image_layer=False, is_eval=True, use_avg_text_token=False, keep_cls=False, keep_end_seq=False, with_bg_clean=False, **kwargs
):
    """Assemble Talk2DINO: a frozen ViT visual backbone, a frozen text
    encoder (CLIP or BERT), a learned text->patch projection, and a masker.

    Args:
        model_name: visual backbone id; 'dinov2*' loads via torch.hub,
            otherwise mae/sam/clip/dino variants load via timm.
        resize_dim: square side images are resized to before the backbone.
        clip_model_name: text encoder; contains 'bert' -> HF BertModel,
            otherwise an OpenAI CLIP model name.
        proj_class: selects the projection config ('vitb_mlp_infonce' /
            'vitl_mlp_infonce').
        avg_self_attn_token / disentangled_self_attn_token: how patch tokens
            are pooled with backbone self-attention (see encode_image).
        unfreeze_last_text_layer: unfreeze CLIP's last transformer block,
            final LayerNorm and text projection for training.
        is_eval: also registers the self-attention hook in eval-only setups.
        use_avg_text_token: hook ln_final to average all text tokens
            instead of using the CLS/EOS token only.
        proj_name, proj_model, loss, pre_trained, unfreeze_last_image_layer,
            keep_cls, keep_end_seq, with_bg_clean: stored/accepted for config
            compatibility; several are unused in this constructor.
    """
    nn.Module.__init__(self)
    self.feats = {}  # filled by forward hooks (self-attn qkv, CLIP token outputs)
    self.model_name = model_name
    # loading the model

    if 'dinov2' in model_name:
        # NOTE(review): the condition is always True inside this branch, so the
        # ternary always picks 'facebookresearch/dinov2'.
        self.model_family = 'facebookresearch/dinov2' if 'dinov2' in model_name else 'facebookresearch/dino:main'
        self.model = torch.hub.load(self.model_family, model_name)

    elif 'mae' in model_name or 'sam' in model_name or 'clip' in model_name or 'dino' in model_name:
        self.model = timm.create_model(
            model_name,
            pretrained=True,
            num_classes=0,  # remove classifier nn.Linear
            img_size=resize_dim
        )

        if 'sam' in model_name:
            # SAM exposes no is_training path; grab the last block's output via hook.
            self.model.blocks[-1].register_forward_hook(get_vit_out)
    else:
        raise Exception("Unknown ViT model")
    # self.model.eval()
    # ImageNet normalization for most backbones; CLIP uses its own statistics.
    mean = (0.485, 0.456, 0.406) if not 'clip' in model_name else (0.4815, 0.4578, 0.4082)
    std = (0.229, 0.224, 0.225) if not 'clip' in model_name else (0.2686, 0.2613, 0.2758)
    self.image_transforms = T.Compose([
        T.Resize((resize_dim, resize_dim)),
        lambda x: T.ToTensor()(x) if not isinstance(x, torch.Tensor) else x / 255.0, # ensure tensor
        T.Normalize(mean, std),
    ])

    # NOTE(review): bare expression — has no effect; likely leftover from a
    # removed `.eval()`/`.to(...)` call.
    self.model
    # Visual backbone stays frozen: only the projection layer is trained.
    self.model.requires_grad_(False)

    self.clip_model_name = clip_model_name
    if 'bert' in self.clip_model_name:
        self.clip_model = BertModel.from_pretrained(self.clip_model_name, output_hidden_states = False)
        # load the corresponding wordtokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.clip_model_name)
    else:
        # NOTE(review): loads CLIP on the 'meta' device (weights are
        # placeholders) — presumably materialized/loaded later; confirm.
        self.clip_model, _ = clip.load(clip_model_name, device='meta')
    self.clip_model.eval()
    self.clip_model.requires_grad_(False)
    if unfreeze_last_text_layer:
        # Allow fine-tuning of the last text block, final LN and projection.
        for param in self.clip_model.transformer.resblocks[-1].parameters():
            param.requires_grad = True
        for param in self.clip_model.ln_final.parameters():
            param.requires_grad = True
        self.clip_model.text_projection.requires_grad = True
    # CLIP-style learnable temperature, initialized to 1/0.07.
    self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))


    if 'vitb_mlp_infonce' in proj_class:
        config = {
            'act': 'tanh', # None, tanh, relu or sigmoid
            'hidden_layer': True,
            'dino_embed_dim': 768
        }
    elif 'vitl_mlp_infonce' in proj_class:
        config = {
            'act': 'tanh', # None, tanh, relu or sigmoid
            'hidden_layer': True,
            'dino_embed_dim': 1024
        }
    # NOTE(review): `config` is unbound if proj_class matches neither branch
    # — from_config below would raise NameError; consider an explicit error.

    ProjClass = ProjectionLayer
    self.proj = ProjClass.from_config(config)

    self.masker = DINOTextMasker(similarity_type="cosine")
    self.masker = self.masker.eval()

    # PAMR mask refinement is created lazily elsewhere (None until needed).
    self.pamr = None

    self.avg_self_attn_token = avg_self_attn_token
    self.disentangled_self_attn_token = disentangled_self_attn_token

    if self.avg_self_attn_token or self.disentangled_self_attn_token or is_eval:
        # Capture the raw qkv of the last attention block to rebuild CLS->patch attention.
        self.model.blocks[-1].attn.qkv.register_forward_hook(self.get_self_attention)
    # Global (non-patch) tokens: CLS + 4 registers for *_reg / dinov3, else CLS only.
    self.num_global_tokens = 5 if 'reg' in model_name or 'dinov3' in model_name else 1
    if 'sam' in self.model_name:
        self.num_global_tokens = 0
    if 'dinov3' in self.model_name:
        # dinov3 checkpoints do not expose num_heads; hard-code per variant.
        if 'vit_base' in self.model_name:
            self.num_attn_heads = 12
        elif 'vit_large' in self.model_name:
            self.num_attn_heads = 16
        else:
            raise Exception("Unknown dinov3 model")
    else:
        self.num_attn_heads = self.model.num_heads
    # NOTE(review): fixed attention scale 1/sqrt(64) — assumes head_dim == 64; confirm.
    self.scale = 0.125

    self.use_avg_text_token = use_avg_text_token
    if self.use_avg_text_token:
        self.feats = {}
        # in this case we register a forward hook with the aim of getting all the tokens and not only the cls
        self.clip_model.ln_final.register_forward_hook(self.get_all_out_tokens)
    self.keep_cls = keep_cls
    self.keep_end_seq = keep_end_seq

    self.with_bg_clean = with_bg_clean
| 149 |
+
def process_self_attention(self, output, batch_size, num_tokens, num_attn_heads, embed_dim, scale, num_global_tokens, ret_self_attn_maps=False):
|
| 150 |
+
qkv = output.reshape(batch_size, num_tokens, 3, num_attn_heads, embed_dim // num_attn_heads).permute(2, 0, 3, 1, 4)
|
| 151 |
+
q, k, v = qkv[0] * scale, qkv[1], qkv[2]
|
| 152 |
+
attn = q @ k.transpose(-2, -1)
|
| 153 |
+
self_attn_maps = attn[:, : , 0, num_global_tokens:]
|
| 154 |
+
self_attn = self_attn_maps.mean(dim=1)
|
| 155 |
+
self_attn = self_attn.softmax(dim=-1)
|
| 156 |
+
if ret_self_attn_maps:
|
| 157 |
+
return self_attn, self_attn_maps
|
| 158 |
+
else:
|
| 159 |
+
return self_attn
|
| 160 |
+
|
| 161 |
+
def encode_text(self, tokenized_texts):
    """Encode tokenized texts with the CLIP text encoder.

    When the projection head is a CLIPLastLayer, the result is the hooked
    second-to-last CLIP layer output (cast to float32) rather than the
    encoder's final embedding.
    """
    if type(self.proj) != CLIPLastLayer:
        return self.clip_model.encode_text(tokenized_texts)
    # running the encoder fills self.feats['clip_second_last_out'] via a hook
    self.clip_model.encode_text(tokenized_texts)
    return self.feats['clip_second_last_out'].to(dtype=torch.float32)
|
| 169 |
+
|
| 170 |
+
def encode_image(self, images):
    """Encode a batch of images with the DINO backbone.

    Returns:
        (x, self_attn_maps): x is either the backbone's raw output or an
        attention-weighted patch embedding (see the two branches below);
        self_attn_maps is None unless a self-attention token mode is active.
    """
    batch_size, _, _, _ = images.shape
    self_attn_maps = None
    # is_training=True presumably makes the backbone return the full output
    # dict (with 'x_norm_patchtokens') instead of a pooled embedding — TODO confirm
    x = self.model(images, is_training=(self.avg_self_attn_token or self.disentangled_self_attn_token))
    batch_size, num_tokens, embed_dim = x['x_norm_patchtokens'].shape
    # total token count seen by the qkv hook includes the global (CLS/register) tokens
    num_tokens = num_tokens + self.num_global_tokens
    if self.avg_self_attn_token or self.disentangled_self_attn_token:
        # self.feats['self_attn'] is populated by the qkv forward hook registered in __init__
        self_attn, self_attn_maps = self.process_self_attention(self.feats['self_attn'], batch_size, num_tokens, self.num_attn_heads, embed_dim, self.scale, self.num_global_tokens, ret_self_attn_maps=True)
        if self.avg_self_attn_token:
            # single embedding: patch tokens weighted by head-averaged CLS attention
            x = (self_attn.unsqueeze(-1) * x['x_norm_patchtokens']).mean(dim=1)
        elif self.disentangled_self_attn_token:
            # one embedding per head, each weighted by that head's CLS attention
            self_attn_maps = self_attn_maps.softmax(dim=-1)
            x = (x['x_norm_patchtokens'].unsqueeze(1) * self_attn_maps.unsqueeze(-1)).mean(dim=2)

    return x, self_attn_maps
|
| 185 |
+
|
| 186 |
+
def forward(self, image, text, return_logit_scale=False):
    """Embed an image/text batch and project both into the shared space.

    Returns (txt_embed, img_embed), plus self.logit_scale when requested.
    """
    # text encoding never needs gradients here
    with torch.no_grad():
        txt_embed = self.encode_text(text)

    img_embed, self_attn_maps = self.encode_image(image)

    proj_kwargs = dict(ret_embeds=True, self_attn_maps=self_attn_maps)
    if type(self.proj) == CLIPLastLayer:
        # this projection head additionally needs the EOS-token positions
        proj_kwargs['text_argmax'] = text.argmax(dim=-1)
    img_embed, txt_embed = self.proj(img_embed, txt_embed, **proj_kwargs)

    if return_logit_scale:
        return txt_embed, img_embed, self.logit_scale
    return txt_embed, img_embed
|
| 201 |
+
|
| 202 |
+
def compute_loss(self, image, text, cosine=True, ret_similarity_matrix=True):
    """Compute the contrastive loss between a batch of images and texts.

    Args:
        image: batch of input images.
        text: batch of tokenized texts (paired with the images).
        cosine: if True, L2-normalize both embeddings so the similarity is
            cosine similarity rather than a raw dot product.
        ret_similarity_matrix: if True, feed the full [B, B] similarity
            matrix to the loss; otherwise only its diagonal (matched pairs).

    Returns:
        dict with key 'contrastive_loss'.
    """
    ret = {}
    # BUG FIX: the original body used img_embed/txt_embed without ever
    # computing them (guaranteed NameError); obtain them via forward().
    txt_embed, img_embed = self.forward(image, text)
    if cosine:
        img_embed = F.normalize(img_embed, p=2, dim=1)
        txt_embed = F.normalize(txt_embed, p=2, dim=1)
    sim = img_embed @ txt_embed.transpose(1, 0)
    if not ret_similarity_matrix:
        # diagonal() replaces the original CPU torch.eye boolean mask,
        # which would fail when `sim` lives on a CUDA device
        sim = sim.diagonal()

    ret['contrastive_loss'] = self.contrastive_loss.compute_contrastive_loss(sim)

    return ret
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
@torch.no_grad()
def build_dataset_class_tokens(self, template_set, classnames):
    """Tokenize every (template, classname) prompt combination.

    Returns:
        [N, T, L] tensor — N classes, T templates, L sequence length.
    """
    templates = get_template(template_set)
    use_bert = 'bert' in self.clip_model_name
    tokens = []
    for classname in classnames:
        prompts = [template.format(classname) for template in templates]
        if use_bert:
            tokens.append(self.tokenizer(prompts, return_tensors='pt', padding='max_length')['input_ids'])
        else:
            tokens.append(clip.tokenize(prompts))
    # [N, T, L], N: number of instance, T: number of captions (including ensembled), L: sequence length
    return torch.stack(tokens)
|
| 231 |
+
|
| 232 |
+
@torch.no_grad()
def build_text_embedding(self, text):
    """
    Args:
        text (torch.Tensor): [NUM_CLASSES, NUM_TEMPLATES, CONTEXT_LENGTH] text tokens

    Returns:
        text_embs: [NUM_CLASSES, C] L2-normalized class embeddings,
            averaged over the prompt templates.
    """
    text = text.to(next(self.parameters()).device)
    num_classes, num_templates = text.shape[:2]
    # position of the argmax (EOS) token per prompt, required by CLIPLastLayer
    text_argmax = text.argmax(dim=-1)
    text_argmax = rearrange(text_argmax, 'n t -> (n t)', n=num_classes, t=num_templates)
    text = rearrange(text, 'n t l -> (n t) l', n=num_classes, t=num_templates)
    # chunked inference for memory limitation
    chunk_size = 32
    N = text.size(0)
    if type(self.proj) == CLIPLastLayer:
        # project the hooked second-to-last CLIP layer output, chunk by chunk
        text_embs = torch.cat([
            self.proj.project_clip_txt(self.encode_text(text[i:i + chunk_size]).permute(1, 0, 2), text_argmax=text_argmax[i:i + chunk_size])
            for i in range(0, N, chunk_size)
        ])
    else:
        if not self.use_avg_text_token:
            # performing classification using CLS textual token
            if 'bert' not in self.clip_model_name:
                text_embs = torch.cat([
                    self.clip_model.encode_text(text[i:i + chunk_size])
                    for i in range(0, N, chunk_size)
                ])
            else:
                # encoding with BERT
                text_embs = []
                for i in range(0, N, chunk_size):
                    outputs = self.clip_model(text[i:i + chunk_size])
                    text_embs.append(outputs['pooler_output'])
                text_embs = torch.cat(text_embs)
        else:
            # using text token average
            text_embs = []
            for i in range(0, N, chunk_size):
                # encode_text fills self.feats['clip_txt_out_tokens'] via the
                # ln_final forward hook registered in __init__
                self.clip_model.encode_text(text[i:i + chunk_size])
                # token id > 0 marks valid (non-padding) positions
                text_embs.append(average_text_tokens(self.feats['clip_txt_out_tokens'] @ self.clip_model.text_projection, text[i:i + chunk_size] > 0, self.keep_cls, self.keep_end_seq))
            text_embs = torch.cat(text_embs)
    # [N, T, C]
    text_embs = rearrange(text_embs, '(n t) c -> n t c', n=num_classes, t=num_templates)
    # [N, C] — prompt-template ensembling by averaging
    text_embs = text_embs.mean(dim=1).float()
    if type(self.proj) == ProjectionLayer or type(self.proj) == DoubleMLP:
        text_embs = self.proj.project_clip_txt(text_embs)
    text_embs = normalize(text_embs, dim=-1)

    return text_embs
|
| 285 |
+
|
| 286 |
+
def apply_pamr(self, image, mask):
    """Refine a soft mask with PAMR, lazily constructing the module on first use."""
    image = F.interpolate(image, mask.shape[-2:], mode="bilinear", align_corners=True)
    if self.pamr is None:
        # lazy init: 10 refinement iterations over multi-dilation affinity kernels
        self.pamr = PAMR(10, [1, 2, 4, 8, 12, 24])
        self.pamr.eval()
        self.pamr.to(next(self.parameters()).device)

    return self.pamr(image, mask)
|
| 297 |
+
|
| 298 |
+
def compute_padsize(self, H: int, W: int, patch_size: int):
    """Return (left, right, top, bottom) padding making H and W multiples
    of patch_size, splitting any excess as evenly as possible (the extra
    pixel, if odd, goes to the right/bottom side)."""
    def _split(extent: int) -> tuple:
        # padding needed along one axis, split into (first, second) halves
        rem = extent % patch_size
        if not rem:
            return 0, 0
        total = patch_size - rem
        first = total // 2
        return first, total - first

    l, r = _split(W)
    t, b = _split(H)
    return l, r, t, b
|
| 311 |
+
|
| 312 |
+
@torch.no_grad()
def generate_masks(
        self, image, img_metas, text_emb, classnames, text_is_token=False, apply_pamr=False, background_func="weighted_average_sigmoid", lambda_bg=0.2,
        # kp_w=0.3,
):
    """Generate masks for each text embeddings

    Args:
        image [B, 3, H, W]
        text_emb [N, C]: one embedding per class.
        apply_pamr: refine masks with PAMR (in chunks of 30 classes).
        lambda_bg: blending weight for the background-cleaning prior.

    Returns:
        softmask [B, N, H, W]: softmasks for each text embeddings, plus the
        raw similarity map.
    """

    H, W = image.shape[2:]  # original image shape

    # padded image size (currently identical to H, W — no padding applied here)
    pH, pW = image.shape[2:]
    num_classes = text_emb.shape[0]
    batch_size = image.shape[0]

    image = image[:, [2, 1, 0], :, :]  # BGR to RGB
    ori_image = image.clone()

    img_preprocessed = self.image_transforms(image).to(next(self.parameters()).device)
    # select patch tokens, dropping each backbone's global tokens
    if 'dinov2' in self.model_name:
        image_feat = self.model.forward_features(img_preprocessed)['x_norm_patchtokens']
    elif 'dinov3' in self.model_name:
        # dinov3: CLS + 4 register tokens occupy the first 5 positions
        image_feat = self.model.forward_features(img_preprocessed)[:, 5:, :]
    elif 'mae' in self.model_name or 'clip' in self.model_name or 'dino' in self.model_name:
        image_feat = self.model.forward_features(img_preprocessed)[:, 1:, :]
    elif 'sam' in self.model_name:
        self.model.forward_features(img_preprocessed)
        # NOTE(review): `feats` is not defined in this method — presumably the
        # module-level hook dict (see hooks.py); verify, otherwise this branch raises NameError
        image_feat = feats['vit_out'].reshape(feats['vit_out'].shape[0], feats['vit_out'].shape[1]**2, feats['vit_out'].shape[-1])  # BS x N_PATCHES x EMBED_DIM

    batch_size, num_tokens, embed_dim = image_feat.shape
    # optionally map DINO features into the shared text space
    if type(self.proj) == VisualProjectionLayer:
        image_feat = self.proj.project_dino(image_feat.float())
    if type(self.proj) == DoubleMLP:
        image_feat = self.proj.project_visual(image_feat.float())
    b, np, c = image_feat.shape
    # assumes a square patch grid — TODO confirm for non-square inputs
    np_h = np_w = int(sqrt(np))
    image_feat = image_feat.reshape(b, np_h, np_w, c).permute(0, 3, 1, 2)

    # per-head CLS attention from the qkv hook, used for background cleaning
    self_attn, self_attn_maps = self.process_self_attention(self.feats['self_attn'], batch_size, num_tokens + self.num_global_tokens, self.num_attn_heads, embed_dim, self.scale, self.num_global_tokens, ret_self_attn_maps=True)
    mask, simmap = self.masker.forward_seg(image_feat, text_emb, hard=False)  # [B, N, H', W']

    if self.with_bg_clean:
        mask = self.similarity_assignment_weighted(mask, image_feat, self_attn_maps, text_emb, lambda_bg)

    # resize
    mask = F.interpolate(mask, (pH, pW), mode='bilinear', align_corners=True)  # [B, N, H, W]

    if apply_pamr:
        # chunk classes to bound PAMR memory usage
        for c in range(0, mask.shape[1], 30):
            mask[:, c:c + 30] = self.apply_pamr(ori_image, mask[:, c:c + 30])

    assert mask.shape[2] == H and mask.shape[3] == W, f"shape mismatch: ({H}, {W}) / {mask.shape}"

    return mask, simmap
|
| 372 |
+
|
| 373 |
+
def similarity_assignment_weighted(self, mask, image_feat, self_attn_maps, text_emb, lambda_bg=0.2):
    """Blend per-class masks with an attention prior built from the
    self-attention heads, weighted by text-to-head similarity.

    Args:
        mask [B, N, H, W]: raw per-class soft masks.
        image_feat [B, C, H, W]: patch embedding map.
        self_attn_maps [B, M, H*W]: per-head CLS attention over patches.
        text_emb [N, C]: class text embeddings.
        lambda_bg: blending weight for the attention prior.
    """
    bs, c, h, w = image_feat.shape
    bs, num_classes, h, w = mask.shape
    bs, num_heads, hw = self_attn_maps.shape
    image_feat = image_feat.reshape(bs, c, hw)
    num_classes, c = text_emb.shape
    # attention-weighted average patch embedding for each head: [B, M, C]
    avg_head_embed = (self_attn_maps.unsqueeze(2) * image_feat.unsqueeze(1)).mean(dim=-1)
    avg_head_embed = avg_head_embed / avg_head_embed.norm(dim=-1, keepdim=True)
    avg_head_embed = avg_head_embed.permute(0, 2, 1)  # [B, C, M]
    head_text_sim = text_emb.unsqueeze(0) @ avg_head_embed  # [B, N, M]
    # softmax over heads: how much each head "belongs" to each class
    head_text_sim = (head_text_sim).softmax(dim=-1)
    head_text_sim_sum = head_text_sim.sum(dim=-1)

    # similarity-weighted average of head attention maps, per class: [B, N, H*W]
    self_attn_maps_repeat = self_attn_maps.unsqueeze(1).repeat(1, num_classes, 1, 1)
    head_text_sim_repeat = head_text_sim.unsqueeze(-1).repeat(1, 1, 1, hw)
    avg_self_attn_per_class = (self_attn_maps_repeat * head_text_sim_repeat).sum(dim=2) / head_text_sim_sum.unsqueeze(-1).repeat(1, 1, hw)
    avg_self_attn_per_class = avg_self_attn_per_class.softmax(dim=-1)

    # rescale the attention prior into the value range of `mask`
    min_self_attn = avg_self_attn_per_class.min().item()
    max_self_attn = avg_self_attn_per_class.max().item()
    max_self_attn = max(max_self_attn, max_self_attn - min_self_attn)
    avg_self_attn_per_class = avg_self_attn_per_class - min_self_attn
    avg_self_attn_per_class = avg_self_attn_per_class / max_self_attn
    avg_self_attn_per_class = avg_self_attn_per_class * (mask.max() - mask.min()) + mask.min()
    # NOTE(review): this reshape drops the batch dimension and is only valid
    # for bs == 1 — confirm callers never pass larger batches
    mask = mask.reshape(num_classes, hw)  # [N, P]
    mask_output = (mask + lambda_bg * avg_self_attn_per_class).reshape(bs, num_classes, h, w) / (1 + lambda_bg)
    return mask_output
|
hf_demo.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
hooks.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
feats = {}
|
| 3 |
+
def get_self_attention(module, inputs, out):
    """Forward hook: stash the attention block's raw qkv output for later use."""
    feats['self_attn'] = out
|
| 5 |
+
|
| 6 |
+
def process_self_attention(output, batch_size, num_tokens, num_attn_heads, embed_dim, scale, num_global_tokens, ret_self_attn_maps=False):
|
| 7 |
+
qkv = output.reshape(batch_size, num_tokens, 3, num_attn_heads, embed_dim // num_attn_heads).permute(2, 0, 3, 1, 4)
|
| 8 |
+
q, k, v = qkv[0] * scale, qkv[1], qkv[2]
|
| 9 |
+
attn = q @ k.transpose(-2, -1)
|
| 10 |
+
self_attn_maps = attn[:, : , 0, num_global_tokens:]
|
| 11 |
+
self_attn = self_attn_maps.mean(dim=1)
|
| 12 |
+
self_attn = self_attn.softmax(dim=-1)
|
| 13 |
+
if ret_self_attn_maps:
|
| 14 |
+
return self_attn, self_attn_maps
|
| 15 |
+
else:
|
| 16 |
+
return self_attn
|
| 17 |
+
|
| 18 |
+
def get_vit_out(model: torch.nn.Module, inputs: torch.Tensor, out: torch.Tensor):
    """Forward hook: stash the ViT trunk output (read back for the SAM backbone)."""
    feats['vit_out'] = out
|
| 20 |
+
|
| 21 |
+
def get_second_last_out(model: torch.nn.Module, inputs: torch.Tensor, out: torch.Tensor):
    """Forward hook: stash the second-to-last block's output."""
    feats['second_last_out'] = out
|
| 23 |
+
|
| 24 |
+
def get_all_out_tokens(model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
    # Forward hook: store ALL output tokens (not just CLS) of the CLIP text encoder.
    # NOTE(review): this function is redefined identically further down in this
    # module; the later definition shadows this one — consider removing one copy.
    feats['clip_txt_out_tokens'] = output
|
| 26 |
+
|
| 27 |
+
def get_clip_second_last_dense_out(model: torch.nn.Module, inputs: torch.Tensor, out: torch.Tensor):
    """Forward hook: store CLIP's second-to-last dense output as [batch, seq, dim]
    (the transformer emits LND, so the first two dims are swapped)."""
    batch_first = out.permute(1, 0, 2)
    feats['clip_second_last_out'] = batch_first
|
| 29 |
+
|
| 30 |
+
def get_dinov1_patches(model: torch.nn.Module, inputs: torch.Tensor, out: torch.Tensor):
    """Forward hook: stash DINOv1 patch tokens."""
    feats['dinov1_patches'] = out
|
| 32 |
+
|
| 33 |
+
def get_all_out_tokens(model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
    # Forward hook: store ALL output tokens of the CLIP text encoder.
    # NOTE(review): duplicate of the identical definition earlier in this module
    # (this one wins at import time) — consider removing one copy.
    feats['clip_txt_out_tokens'] = output
|
| 35 |
+
|
| 36 |
+
def average_text_tokens(text_embeddings, mask, keep_cls=False, keep_end_seq=False):
    """Average the valid token embeddings of each sequence.

    Args:
        text_embeddings: [BS, SEQ_LEN, C] token embeddings.
        mask: [BS, SEQ_LEN] boolean validity mask. Assumed True on a
            contiguous prefix of each row (CLS ... EOS), so the last valid
            token sits at index mask.sum(dim=1) - 1 — TODO confirm callers
            always pass such masks.
        keep_cls: include the first (CLS) token in the average.
        keep_end_seq: include the end-of-sequence token in the average.

    Returns:
        [BS, C] mean embedding over the selected tokens.
    """
    # BUG FIX: operate on a copy so the caller's mask is not mutated in place
    # (the original wrote False into the tensor it was handed).
    mask = mask.clone()
    if not keep_end_seq:
        # excluding end of sequence
        mask[torch.arange(mask.shape[0]), mask.sum(dim=1) - 1] = False
    if not keep_cls:
        mask[:, 0] = False  # excluding CLS token

    # NOTE(review): a row with zero remaining valid tokens divides by zero
    # and yields NaN/inf, exactly as the original implementation did.
    masked_embeddings = text_embeddings * mask.unsqueeze(-1)  # [BS, SEQ_LEN, C]

    sum_embeddings = masked_embeddings.sum(dim=1)  # [BS, C]

    valid_elements = mask.sum(dim=1, keepdim=True)  # [BS, 1]

    mean_embeddings = sum_embeddings / valid_elements  # [BS, C]

    return mean_embeddings
|
| 52 |
+
|
masker.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ------------------------------------------------------------------------------
|
| 2 |
+
# Talk2DINO
|
| 3 |
+
# ------------------------------------------------------------------------------
|
| 4 |
+
import copy
|
| 5 |
+
from collections import OrderedDict
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
import torch.distributed as dist
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from .us import normalize
|
| 12 |
+
from einops import rearrange, repeat
|
| 13 |
+
|
| 14 |
+
# from models.dinotext.gumbel import gumbel_sigmoid
|
| 15 |
+
from .modules import FeatureEncoder
|
| 16 |
+
from omegaconf import OmegaConf
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def build_model(config):
    """Resolve an OmegaConf config into a plain Python container."""
    resolved = OmegaConf.to_container(config, resolve=True)
    return resolved
|
| 22 |
+
|
| 23 |
+
class Sim2Mask(nn.Module):
    """Map similarity logits to (hard, soft) masks via a learnable affine
    transform followed by a (gumbel) sigmoid."""

    def __init__(self, init_w=1.0, init_b=0.0, gumbel_tau=1.0, learnable=True):
        super().__init__()
        self.init_w = init_w
        self.init_b = init_b
        self.gumbel_tau = gumbel_tau
        self.learnable = learnable

        # w and b must be provided (or omitted) together
        assert not ((init_w is None) ^ (init_b is None))
        if learnable:
            # scalar (0-d) learnable scale and bias
            self.w = nn.Parameter(torch.full([], float(init_w)))
            self.b = nn.Parameter(torch.full([], float(init_b)))
        else:
            self.w = init_w
            self.b = init_b

    def forward(self, x, deterministic=False):
        logits = x * self.w + self.b

        soft_mask = torch.sigmoid(logits)
        if deterministic:
            hard_mask = soft_mask.gt(0.5).type(logits.dtype)
        else:
            # NOTE(review): gumbel_sigmoid is not imported here (the import at
            # the top of the module is commented out) — this branch raises
            # NameError; confirm the intended source of gumbel_sigmoid.
            hard_mask = gumbel_sigmoid(logits, hard=True, tau=self.gumbel_tau)

        return hard_mask, soft_mask

    def extra_repr(self):
        return f'init_w={self.init_w}, init_b={self.init_b}, learnable={self.learnable}, gumbel_tau={self.gumbel_tau}'
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class MaskerBackbone(nn.Module):
    """Masker image encoder backbone.

    Deep-copies the tail of a CLIP visual transformer: only the resblocks
    from `freeze_idx` onwards are kept (the earlier, frozen blocks are run
    elsewhere and their output is fed in as `x`).
    """
    def __init__(self, clip_visual, freeze_idx):
        super().__init__()
        self.transformer = copy.deepcopy(clip_visual.transformer)
        self.transformer.resblocks = self.transformer.resblocks[freeze_idx:]

        # drop forward hooks carried over by the deepcopy
        for block in self.transformer.resblocks:
            if hasattr(block, "hook_handler"):
                block.hook_handler.remove()

        self.ln_post = copy.deepcopy(clip_visual.ln_post)
        self.proj = copy.deepcopy(clip_visual.proj)

        self.layers = len(self.transformer.resblocks)
        self.patch_size = clip_visual.patch_size

        # without a projection matrix the output width is the transformer width
        self.output_dim = clip_visual.output_dim if self.proj is not None else clip_visual.width

    def forward(self, x, spatial=True, ignore_last_attn=True):
        """Run the kept resblocks on x (CLIP's [L, N, D] layout).

        Args:
            spatial: keep all tokens (True) or only the CLS token (False).
            ignore_last_attn: forwarded to the transformer — presumably a
                patched-CLIP flag altering the last attention; not visible here.
        """
        if self.layers:
            x = self.transformer(x, ignore_last_attn=ignore_last_attn)

        x = x.permute(1, 0, 2)  # LND -> NLD

        if spatial:
            x = self.ln_post(x)
        else:
            x = self.ln_post(x[:, 0, :])

        if self.proj is not None:
            x = x @ self.proj

        return x
|
| 89 |
+
|
| 90 |
+
class MaskerImageFeatureEncoder(FeatureEncoder):
    """Feature encoder that runs the masker backbone plus a decoder head.

    Registers a forward hook (`self.hook`, inherited from FeatureEncoder —
    not visible in this file) on every backbone resblock so intermediate
    features can be collected.
    """
    def __init__(self, backbone: nn.Module, decoder: nn.Module, ignore_last_attn: bool = True):
        super().__init__()
        self.ignore_last_attn = ignore_last_attn
        self.patch_size = backbone.patch_size
        self.backbone = backbone
        self.decoder = decoder

        for resblock in self.backbone.transformer.resblocks:
            resblock.hook_handler = resblock.register_forward_hook(self.hook)

    def _encode(self, image, image_feat):
        # `image` is only used to recover the spatial patch-grid size
        H, W = image.shape[-2:]
        h = H // self.patch_size
        w = W // self.patch_size

        x = self.backbone(image_feat, spatial=True, ignore_last_attn=self.ignore_last_attn)  # BLC
        # drop the CLS token (index 0) and fold tokens back into a feature map
        x = rearrange(x[:, 1:], "B (H W) C -> B C H W", H=h, W=w)
        x = self.decoder(x)

        return x
|
| 111 |
+
|
| 112 |
+
class Masker(nn.Module):
    """Combines a backbone+decoder image encoder with Sim2Mask to turn
    image/text similarity into segmentation masks."""

    def __init__(self, backbone, decoder, image_proj, sim2mask, ignore_last_attn, **kwargs):
        super().__init__()
        self.ignore_last_attn = ignore_last_attn

        # `decoder` arrives as a config dict; its input width must match the backbone
        decoder["C"] = backbone.output_dim
        # NOTE(review): MODELS is not defined/imported in this module —
        # presumably an mm-style registry; verify before relying on this class.
        decoder = MODELS.build(decoder)
        decoder = nn.Sequential(OrderedDict([
            ("decoder", decoder),
            ("image_proj", image_proj)
        ]))

        self.image_encoder = MaskerImageFeatureEncoder(backbone, decoder, ignore_last_attn=ignore_last_attn)

        self.sim2mask = Sim2Mask(**sim2mask)

    def forward(self, image, image_feat, text_emb, deterministic=False):
        """Training forward: 1:N matching of each image against the text
        embeddings gathered from all devices (requires torch.distributed
        to be initialized)."""
        B = image.size(0)
        image_emb, feats = self.image_encoder(image, image_feat, ret_feats=True)  # [BCHW]

        image_emb_norm = normalize(image_emb, dim=1)
        text_emb_norm = normalize(text_emb, dim=-1)

        H, W = image_emb.shape[2:]
        D = dist.get_world_size()

        # simmap [B, B*D, H, W] where D is #devices
        # NOTE(review): gather_cat is not defined/imported in this module — verify.
        all_text_emb_norm = gather_cat(text_emb_norm, grad=True, contiguous_grad=True)
        simmap = torch.einsum("bchw,nc->bnhw", image_emb_norm, all_text_emb_norm)
        mask, soft_mask = self.sim2mask(simmap, deterministic=deterministic)

        # mask [B, B*D, H, W] where D is #devices
        # positive global label: index of each image's own caption across devices
        pos_indices = torch.arange(B, dtype=torch.long, device=image_emb.device) + B * dist.get_rank()
        pos_mask = mask[torch.arange(B), pos_indices].unsqueeze(1)  # [B, 1, H, W]

        # boolean matrix selecting every non-matching (image, text) pair
        offdiag = torch.ones(B, B*D, dtype=torch.bool, device=mask.device)
        offdiag[torch.arange(B), pos_indices] = False

        soft_pos_mask = soft_mask[torch.arange(B), pos_indices].unsqueeze(1)
        soft_neg_mask = soft_mask.masked_select(offdiag[..., None, None]).view(B, B*D-1, H, W)

        masks = {
            "pos": pos_mask,  # [B, 1, H, W]

            "soft_pos": soft_pos_mask,
            "soft_neg": soft_neg_mask,
            "soft_all": soft_mask,  # [B, N, H, W]
        }

        return masks, image_emb, text_emb, feats

    @torch.no_grad()
    def forward_seg(self, image, image_feat, text_emb, deterministic=True, hard=False):
        """Make mask by 1:N matching

        Args:
            image [B, 3, H, W]
            image_feat [L, B, C]: CLIP features
            text_emb [N, C]
            deterministic (bool): deterministic inference flag for gumbel noise
            hard (bool): decide hard or soft returning segmentation mask.
                Note that soft mask is required for proper evaluation

        Return:
            mask [B, N, H', W'] (H' and W' are downsampled H/W)
        """
        image_emb = self.image_encoder(image, image_feat)  # [BCHW]

        image_emb = normalize(image_emb, dim=1)  # BCHW
        text_emb = normalize(text_emb, dim=-1)  # NC

        simmap = torch.einsum("b c h w, n c -> b n h w", image_emb, text_emb)

        hard_mask, soft_mask = self.sim2mask(simmap, deterministic=deterministic)
        mask = hard_mask if hard else soft_mask

        return mask, simmap
|
| 190 |
+
|
| 191 |
+
class DINOTextMasker(nn.Module):
    """Cosine-similarity masker matching DINO patch features against text
    embeddings; binarization is delegated to DINOTextSim2Mask."""

    def __init__(self, similarity_type="cosine"):
        super().__init__()
        self.sim2mask = DINOTextSim2Mask()
        self.sim2mask = self.sim2mask.eval()
        self.similarity_type = similarity_type

    def forward(self, image, image_feat, text_emb, deterministic=False):
        # training forward intentionally unimplemented: inference-only module
        pass

    @torch.no_grad()
    def forward_seg(self, image_feat, text_emb, deterministic=True, hard=False):
        """Make mask by 1:N matching

        Args:
            image_feat [B, C, H, W]: patch feature map
            text_emb [N, C]: one embedding per class (the code below unpacks
                two dims; text is expected pre-normalized — the normalize
                call for it is commented out)
            deterministic (bool): deterministic inference flag for gumbel noise
            hard (bool): decide hard or soft returning segmentation mask.
                Note that soft mask is required for proper evaluation

        Return:
            mask [B, N, H, W] and the raw similarity map of the same shape
        """
        b, c, h, w = image_feat.shape
        n, c = text_emb.shape

        if self.similarity_type == "cosine":
            image_feat = normalize(image_feat, dim=1)  # BCHW
            # text_emb = normalize(text_emb, dim=-1)  # NKC
            simmap = torch.einsum("b c h w, n c -> b n h w", image_feat, text_emb)
        else:
            raise NotImplementedError("similarity type {} not implemented".format(self.similarity_type))

        hard_mask, soft_mask = self.sim2mask(simmap, deterministic=deterministic)
        mask = hard_mask if hard else soft_mask

        return mask, simmap
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
class DINOTextSim2Mask(nn.Module):
    """Turn a similarity map into (hard, soft) masks with a plain sigmoid
    (no learnable affine, unlike Sim2Mask)."""

    def __init__(self, gumbel_tau=1.0):
        super().__init__()
        self.gumbel_tau = gumbel_tau

    def forward(self, x, deterministic=False):
        soft_mask = torch.sigmoid(x)
        if not deterministic:
            # gumbel_sigmoid must be resolvable in the importing module's scope
            hard_mask = gumbel_sigmoid(x, hard=True, tau=self.gumbel_tau)
        else:
            # deterministic binarization at probability 0.5
            hard_mask = soft_mask.gt(0.5).to(x.dtype)
        return hard_mask, soft_mask
|