| | |
| | |
| | |
| | |
| |
|
| | from enum import Enum |
| | from typing import Union |
| |
|
| | import torch |
| | import torch.nn as nn |
| |
|
| | from .backbones import _make_dinov2_model |
| | from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name |
| |
|
| |
|
| | class Weights(Enum): |
| | IMAGENET1K = "IMAGENET1K" |
| |
|
| |
|
| | def _make_dinov2_linear_classification_head( |
| | *, |
| | arch_name: str = "vit_large", |
| | patch_size: int = 14, |
| | embed_dim: int = 1024, |
| | layers: int = 4, |
| | pretrained: bool = True, |
| | weights: Union[Weights, str] = Weights.IMAGENET1K, |
| | num_register_tokens: int = 0, |
| | **kwargs, |
| | ): |
| | if layers not in (1, 4): |
| | raise AssertionError(f"Unsupported number of layers: {layers}") |
| | if isinstance(weights, str): |
| | try: |
| | weights = Weights[weights] |
| | except KeyError: |
| | raise AssertionError(f"Unsupported weights: {weights}") |
| |
|
| | linear_head = nn.Linear((1 + layers) * embed_dim, 1_000) |
| |
|
| | if pretrained: |
| | model_base_name = _make_dinov2_model_name(arch_name, patch_size) |
| | model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens) |
| | layers_str = str(layers) if layers == 4 else "" |
| | url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_linear{layers_str}_head.pth" |
| | state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") |
| | linear_head.load_state_dict(state_dict, strict=True) |
| |
|
| | return linear_head |
| |
|
| |
|
| | class _LinearClassifierWrapper(nn.Module): |
| | def __init__(self, *, backbone: nn.Module, linear_head: nn.Module, layers: int = 4): |
| | super().__init__() |
| | self.backbone = backbone |
| | self.linear_head = linear_head |
| | self.layers = layers |
| |
|
| | def forward(self, x): |
| | if self.layers == 1: |
| | x = self.backbone.forward_features(x) |
| | cls_token = x["x_norm_clstoken"] |
| | patch_tokens = x["x_norm_patchtokens"] |
| | |
| | linear_input = torch.cat([ |
| | cls_token, |
| | patch_tokens.mean(dim=1), |
| | ], dim=1) |
| | |
| | elif self.layers == 4: |
| | x = self.backbone.get_intermediate_layers(x, n=4, return_class_token=True) |
| | |
| | linear_input = torch.cat([ |
| | x[0][1], |
| | x[1][1], |
| | x[2][1], |
| | x[3][1], |
| | x[3][0].mean(dim=1), |
| | ], dim=1) |
| | |
| | else: |
| | assert False, f"Unsupported number of layers: {self.layers}" |
| | return self.linear_head(linear_input) |
| |
|
| |
|
| | def _make_dinov2_linear_classifier( |
| | *, |
| | arch_name: str = "vit_large", |
| | layers: int = 4, |
| | pretrained: bool = True, |
| | weights: Union[Weights, str] = Weights.IMAGENET1K, |
| | num_register_tokens: int = 0, |
| | interpolate_antialias: bool = False, |
| | interpolate_offset: float = 0.1, |
| | **kwargs, |
| | ): |
| | backbone = _make_dinov2_model( |
| | arch_name=arch_name, |
| | pretrained=pretrained, |
| | num_register_tokens=num_register_tokens, |
| | interpolate_antialias=interpolate_antialias, |
| | interpolate_offset=interpolate_offset, |
| | **kwargs, |
| | ) |
| |
|
| | embed_dim = backbone.embed_dim |
| | patch_size = backbone.patch_size |
| | linear_head = _make_dinov2_linear_classification_head( |
| | arch_name=arch_name, |
| | patch_size=patch_size, |
| | embed_dim=embed_dim, |
| | layers=layers, |
| | pretrained=pretrained, |
| | weights=weights, |
| | num_register_tokens=num_register_tokens, |
| | ) |
| |
|
| | return _LinearClassifierWrapper(backbone=backbone, linear_head=linear_head, layers=layers) |
| |
|
| |
|
| | def dinov2_vits14_lc( |
| | *, |
| | layers: int = 4, |
| | pretrained: bool = True, |
| | weights: Union[Weights, str] = Weights.IMAGENET1K, |
| | **kwargs, |
| | ): |
| | """ |
| | Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. |
| | """ |
| | return _make_dinov2_linear_classifier( |
| | arch_name="vit_small", |
| | layers=layers, |
| | pretrained=pretrained, |
| | weights=weights, |
| | **kwargs, |
| | ) |
| |
|
| |
|
| | def dinov2_vitb14_lc( |
| | *, |
| | layers: int = 4, |
| | pretrained: bool = True, |
| | weights: Union[Weights, str] = Weights.IMAGENET1K, |
| | **kwargs, |
| | ): |
| | """ |
| | Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. |
| | """ |
| | return _make_dinov2_linear_classifier( |
| | arch_name="vit_base", |
| | layers=layers, |
| | pretrained=pretrained, |
| | weights=weights, |
| | **kwargs, |
| | ) |
| |
|
| |
|
| | def dinov2_vitl14_lc( |
| | *, |
| | layers: int = 4, |
| | pretrained: bool = True, |
| | weights: Union[Weights, str] = Weights.IMAGENET1K, |
| | **kwargs, |
| | ): |
| | """ |
| | Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. |
| | """ |
| | return _make_dinov2_linear_classifier( |
| | arch_name="vit_large", |
| | layers=layers, |
| | pretrained=pretrained, |
| | weights=weights, |
| | **kwargs, |
| | ) |
| |
|
| |
|
| | def dinov2_vitg14_lc( |
| | *, |
| | layers: int = 4, |
| | pretrained: bool = True, |
| | weights: Union[Weights, str] = Weights.IMAGENET1K, |
| | **kwargs, |
| | ): |
| | """ |
| | Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. |
| | """ |
| | return _make_dinov2_linear_classifier( |
| | arch_name="vit_giant2", |
| | layers=layers, |
| | ffn_layer="swiglufused", |
| | pretrained=pretrained, |
| | weights=weights, |
| | **kwargs, |
| | ) |
| |
|
| |
|
| | def dinov2_vits14_reg_lc( |
| | *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs |
| | ): |
| | """ |
| | Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. |
| | """ |
| | return _make_dinov2_linear_classifier( |
| | arch_name="vit_small", |
| | layers=layers, |
| | pretrained=pretrained, |
| | weights=weights, |
| | num_register_tokens=4, |
| | interpolate_antialias=True, |
| | interpolate_offset=0.0, |
| | **kwargs, |
| | ) |
| |
|
| |
|
| | def dinov2_vitb14_reg_lc( |
| | *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs |
| | ): |
| | """ |
| | Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. |
| | """ |
| | return _make_dinov2_linear_classifier( |
| | arch_name="vit_base", |
| | layers=layers, |
| | pretrained=pretrained, |
| | weights=weights, |
| | num_register_tokens=4, |
| | interpolate_antialias=True, |
| | interpolate_offset=0.0, |
| | **kwargs, |
| | ) |
| |
|
| |
|
| | def dinov2_vitl14_reg_lc( |
| | *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs |
| | ): |
| | """ |
| | Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. |
| | """ |
| | return _make_dinov2_linear_classifier( |
| | arch_name="vit_large", |
| | layers=layers, |
| | pretrained=pretrained, |
| | weights=weights, |
| | num_register_tokens=4, |
| | interpolate_antialias=True, |
| | interpolate_offset=0.0, |
| | **kwargs, |
| | ) |
| |
|
| |
|
| | def dinov2_vitg14_reg_lc( |
| | *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs |
| | ): |
| | """ |
| | Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k. |
| | """ |
| | return _make_dinov2_linear_classifier( |
| | arch_name="vit_giant2", |
| | layers=layers, |
| | ffn_layer="swiglufused", |
| | pretrained=pretrained, |
| | weights=weights, |
| | num_register_tokens=4, |
| | interpolate_antialias=True, |
| | interpolate_offset=0.0, |
| | **kwargs, |
| | ) |
| |
|