|
|
|
|
|
|
|
|
|
|
| import os
|
| from enum import Enum
|
| from typing import List, Optional, Union
|
| from urllib.parse import urlparse
|
| from pathlib import Path
|
|
|
| import torch
|
|
|
| from .utils import DINOV3_BASE_URL
|
|
|
|
|
| class Weights(Enum):
|
| LVD1689M = "LVD1689M"
|
| SAT493M = "SAT493M"
|
|
|
|
|
| def is_url(path: str) -> bool:
|
| parsed = urlparse(path)
|
| return parsed.scheme in ("https", "file")
|
|
|
|
|
| def convert_path_or_url_to_url(path: str) -> str:
|
| if is_url(path):
|
| return path
|
| return Path(path).expanduser().resolve().as_uri()
|
|
|
|
|
| def _make_dinov3_vit_model_arch(
|
| *,
|
| patch_size: int = 16,
|
| compact_arch_name: str = "vitb",
|
| ):
|
| if "plus" in compact_arch_name:
|
| model_arch = compact_arch_name.replace("plus", f"{patch_size}plus")
|
| else:
|
| model_arch = f"{compact_arch_name}{patch_size}"
|
| return model_arch
|
|
|
|
|
| def _make_dinov3_vit_model_url(
|
| *,
|
| patch_size: int = 16,
|
| compact_arch_name: str = "vitb",
|
| version: Optional[str] = None,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| hash: Optional[str] = None,
|
| ):
|
| model_name = "dinov3"
|
| model_arch = _make_dinov3_vit_model_arch(patch_size=patch_size, compact_arch_name=compact_arch_name)
|
| version_suffix = f"_{version}" if version else ""
|
| weights_name = weights.value.lower()
|
| hash_suffix = f"-{hash}" if hash else ""
|
| model_dir = f"{model_name}_{model_arch}"
|
| model_filename = f"{model_name}_{model_arch}_pretrain_{weights_name}{version_suffix}{hash_suffix}.pth"
|
| return os.path.join(DINOV3_BASE_URL, model_dir, model_filename)
|
|
|
|
|
| def _make_dinov3_vit(
|
| *,
|
| img_size: int = 224,
|
| patch_size: int = 16,
|
| in_chans: int = 3,
|
| compact_arch_name: str = "vitb",
|
| pos_embed_rope_base: float = 100.0,
|
| pos_embed_rope_min_period: float | None = None,
|
| pos_embed_rope_max_period: float | None = None,
|
| pos_embed_rope_normalize_coords: str = "separate",
|
| pos_embed_rope_shift_coords: float | None = None,
|
| pos_embed_rope_jitter_coords: float | None = None,
|
| pos_embed_rope_rescale_coords: float | None = None,
|
| pos_embed_rope_dtype: str = "fp32",
|
| embed_dim: int = 768,
|
| depth: int = 12,
|
| num_heads: int = 12,
|
| ffn_ratio: float = 4.0,
|
| qkv_bias: bool = True,
|
| drop_path_rate: float = 0.0,
|
| layerscale_init: float | None = None,
|
| norm_layer: str = "layernorm",
|
| ffn_layer: str = "mlp",
|
| ffn_bias: bool = True,
|
| proj_bias: bool = True,
|
| n_storage_tokens: int = 0,
|
| mask_k_bias: bool = False,
|
| pretrained: bool = True,
|
| version: Optional[str] = None,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| hash: Optional[str] = None,
|
| check_hash: bool = False,
|
| **kwargs,
|
| ):
|
| from ..models.vision_transformer import DinoVisionTransformer
|
|
|
| vit_kwargs = dict(
|
| img_size=img_size,
|
| patch_size=patch_size,
|
| in_chans=in_chans,
|
| pos_embed_rope_base=pos_embed_rope_base,
|
| pos_embed_rope_min_period=pos_embed_rope_min_period,
|
| pos_embed_rope_max_period=pos_embed_rope_max_period,
|
| pos_embed_rope_normalize_coords=pos_embed_rope_normalize_coords,
|
| pos_embed_rope_shift_coords=pos_embed_rope_shift_coords,
|
| pos_embed_rope_jitter_coords=pos_embed_rope_jitter_coords,
|
| pos_embed_rope_rescale_coords=pos_embed_rope_rescale_coords,
|
| pos_embed_rope_dtype=pos_embed_rope_dtype,
|
| embed_dim=embed_dim,
|
| depth=depth,
|
| num_heads=num_heads,
|
| ffn_ratio=ffn_ratio,
|
| qkv_bias=qkv_bias,
|
| drop_path_rate=drop_path_rate,
|
| layerscale_init=layerscale_init,
|
| norm_layer=norm_layer,
|
| ffn_layer=ffn_layer,
|
| ffn_bias=ffn_bias,
|
| proj_bias=proj_bias,
|
| n_storage_tokens=n_storage_tokens,
|
| mask_k_bias=mask_k_bias,
|
| )
|
| vit_kwargs.update(**kwargs)
|
| model = DinoVisionTransformer(**vit_kwargs)
|
| if pretrained:
|
| if type(weights) is Weights and weights not in {Weights.LVD1689M, Weights.SAT493M}:
|
| raise ValueError(f"Unsupported weights for the backbone: {weights}")
|
| elif type(weights) is Weights:
|
| url = _make_dinov3_vit_model_url(
|
| patch_size=patch_size,
|
| compact_arch_name=compact_arch_name,
|
| version=version,
|
| weights=weights,
|
| hash=hash,
|
| )
|
| else:
|
| url = convert_path_or_url_to_url(weights)
|
| state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu", check_hash=check_hash)
|
| model.load_state_dict(state_dict, strict=True)
|
| else:
|
| model.init_weights()
|
| return model
|
|
|
|
|
| def _make_dinov3_convnext_model_url(
|
| *,
|
| compact_arch_name: str = "convnext_base",
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| hash: Optional[str] = None,
|
| ):
|
| model_name = "dinov3"
|
| weights_name = weights.value.lower()
|
| hash_suffix = f"-{hash}" if hash else ""
|
|
|
| model_dir = f"{model_name}_{compact_arch_name}"
|
| model_filename = f"{model_name}_{compact_arch_name}_pretrain_{weights_name}{hash_suffix}.pth"
|
| return os.path.join(DINOV3_BASE_URL, model_dir, model_filename)
|
|
|
|
|
| def _make_dinov3_convnext(
|
| in_chans: int = 3,
|
| depths: List[int] = [3, 3, 27, 3],
|
| dims: List[int] = [128, 256, 512, 1024],
|
| compact_arch_name: str = "convnext_base",
|
| drop_path_rate: float = 0.0,
|
| layer_scale_init_value: float = 1e-6,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| hash: Optional[str] = None,
|
| **kwargs,
|
| ):
|
| from ..models.convnext import ConvNeXt
|
|
|
| model_kwargs = dict(
|
| in_chans=in_chans,
|
| depths=depths,
|
| dims=dims,
|
| drop_path_rate=drop_path_rate,
|
| layer_scale_init_value=layer_scale_init_value,
|
| )
|
| model_kwargs.update(**kwargs)
|
| model = ConvNeXt(**model_kwargs)
|
| if pretrained:
|
| if type(weights) is Weights and weights not in {Weights.LVD1689M, Weights.SAT493M}:
|
| raise ValueError(f"Unsupported weights for the backbone: {weights}")
|
| elif type(weights) is Weights:
|
| url = _make_dinov3_convnext_model_url(
|
| compact_arch_name=compact_arch_name,
|
| weights=weights,
|
| hash=hash,
|
| )
|
| else:
|
| url = convert_path_or_url_to_url(weights)
|
| state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
|
| model.load_state_dict(state_dict, strict=True)
|
| return model
|
|
|
|
|
| def dinov3_vits16(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| check_hash: bool = False,
|
| **kwargs,
|
| ):
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "08c60483"
|
| kwargs["version"] = None
|
| return _make_dinov3_vit(
|
| img_size=224,
|
| patch_size=16,
|
| in_chans=3,
|
| pos_embed_rope_base=100,
|
| pos_embed_rope_normalize_coords="separate",
|
| pos_embed_rope_rescale_coords=2,
|
| pos_embed_rope_dtype="fp32",
|
| embed_dim=384,
|
| depth=12,
|
| num_heads=6,
|
| ffn_ratio=4,
|
| qkv_bias=True,
|
| drop_path_rate=0.0,
|
| layerscale_init=1.0e-05,
|
| norm_layer="layernormbf16",
|
| ffn_layer="mlp",
|
| ffn_bias=True,
|
| proj_bias=True,
|
| n_storage_tokens=4,
|
| mask_k_bias=True,
|
| pretrained=pretrained,
|
| weights=weights,
|
| compact_arch_name="vits",
|
| check_hash=check_hash,
|
| **kwargs,
|
| )
|
|
|
|
|
| def dinov3_vits16plus(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| check_hash: bool = False,
|
| **kwargs,
|
| ):
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "4057cbaa"
|
| kwargs["version"] = None
|
| return _make_dinov3_vit(
|
| img_size=224,
|
| patch_size=16,
|
| in_chans=3,
|
| pos_embed_rope_base=100,
|
| pos_embed_rope_normalize_coords="separate",
|
| pos_embed_rope_rescale_coords=2,
|
| pos_embed_rope_dtype="fp32",
|
| embed_dim=384,
|
| depth=12,
|
| num_heads=6,
|
| ffn_ratio=6,
|
| qkv_bias=True,
|
| drop_path_rate=0.0,
|
| layerscale_init=1.0e-05,
|
| norm_layer="layernormbf16",
|
| ffn_layer="swiglu",
|
| ffn_bias=True,
|
| proj_bias=True,
|
| n_storage_tokens=4,
|
| mask_k_bias=True,
|
| pretrained=pretrained,
|
| weights=weights,
|
| compact_arch_name="vitsplus",
|
| check_hash=check_hash,
|
| **kwargs,
|
| )
|
|
|
|
|
| def dinov3_vitb16(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| check_hash: bool = False,
|
| **kwargs,
|
| ):
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "73cec8be"
|
| kwargs["version"] = None
|
| return _make_dinov3_vit(
|
| img_size=224,
|
| patch_size=16,
|
| in_chans=3,
|
| pos_embed_rope_base=100,
|
| pos_embed_rope_normalize_coords="separate",
|
| pos_embed_rope_rescale_coords=2,
|
| pos_embed_rope_dtype="fp32",
|
| embed_dim=768,
|
| depth=12,
|
| num_heads=12,
|
| ffn_ratio=4,
|
| qkv_bias=True,
|
| drop_path_rate=0.0,
|
| layerscale_init=1.0e-05,
|
| norm_layer="layernormbf16",
|
| ffn_layer="mlp",
|
| ffn_bias=True,
|
| proj_bias=True,
|
| n_storage_tokens=4,
|
| mask_k_bias=True,
|
| pretrained=pretrained,
|
| weights=weights,
|
| compact_arch_name="vitb",
|
| check_hash=check_hash,
|
| **kwargs,
|
| )
|
|
|
|
|
| def dinov3_vitl16(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| check_hash: bool = False,
|
| **kwargs,
|
| ):
|
| untie_global_and_local_cls_norm = False
|
| if weights == Weights.LVD1689M:
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "8aa4cbdd"
|
| elif weights == Weights.SAT493M:
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "eadcf0ff"
|
| untie_global_and_local_cls_norm = True
|
| elif type(weights) is str:
|
| import re
|
|
|
| pattern = r"-(.{8}).pth"
|
| matches = re.findall(pattern, weights)
|
| if len(matches) != 1:
|
| raise ValueError(f"Unexpected weights specification for the ViT-L backbone: {weights}")
|
| hash = matches[0]
|
| if hash == "eadcf0ff":
|
| untie_global_and_local_cls_norm = True
|
| kwargs["version"] = None
|
| return _make_dinov3_vit(
|
| img_size=224,
|
| patch_size=16,
|
| in_chans=3,
|
| pos_embed_rope_base=100,
|
| pos_embed_rope_normalize_coords="separate",
|
| pos_embed_rope_rescale_coords=2,
|
| pos_embed_rope_dtype="fp32",
|
| embed_dim=1024,
|
| depth=24,
|
| num_heads=16,
|
| ffn_ratio=4,
|
| qkv_bias=True,
|
| drop_path_rate=0.0,
|
| layerscale_init=1.0e-05,
|
| norm_layer="layernormbf16",
|
| ffn_layer="mlp",
|
| ffn_bias=True,
|
| proj_bias=True,
|
| n_storage_tokens=4,
|
| mask_k_bias=True,
|
| untie_global_and_local_cls_norm=untie_global_and_local_cls_norm,
|
| pretrained=pretrained,
|
| weights=weights,
|
| compact_arch_name="vitl",
|
| check_hash=check_hash,
|
| **kwargs,
|
| )
|
|
|
|
|
| def dinov3_vitl16plus(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| check_hash: bool = False,
|
| **kwargs,
|
| ):
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "46503df0"
|
|
|
| return _make_dinov3_vit(
|
| img_size=224,
|
| patch_size=16,
|
| in_chans=3,
|
| pos_embed_rope_base=100,
|
| pos_embed_rope_normalize_coords="separate",
|
| pos_embed_rope_rescale_coords=2,
|
| pos_embed_rope_dtype="fp32",
|
| embed_dim=1024,
|
| depth=24,
|
| num_heads=16,
|
| ffn_ratio=6.0,
|
| qkv_bias=True,
|
| drop_path_rate=0.0,
|
| layerscale_init=1.0e-05,
|
| norm_layer="layernormbf16",
|
| ffn_layer="swiglu",
|
| ffn_bias=True,
|
| proj_bias=True,
|
| n_storage_tokens=4,
|
| mask_k_bias=True,
|
| pretrained=pretrained,
|
| weights=weights,
|
| compact_arch_name="vitlplus",
|
| check_hash=check_hash,
|
| **kwargs,
|
| )
|
|
|
|
|
| def dinov3_vith16plus(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| check_hash: bool = False,
|
| **kwargs,
|
| ):
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "7c1da9a5"
|
|
|
| return _make_dinov3_vit(
|
| img_size=224,
|
| patch_size=16,
|
| in_chans=3,
|
| pos_embed_rope_base=100,
|
| pos_embed_rope_normalize_coords="separate",
|
| pos_embed_rope_rescale_coords=2,
|
| pos_embed_rope_dtype="fp32",
|
| embed_dim=1280,
|
| depth=32,
|
| num_heads=20,
|
| ffn_ratio=6.0,
|
| qkv_bias=True,
|
| drop_path_rate=0.0,
|
| layerscale_init=1.0e-05,
|
| norm_layer="layernormbf16",
|
| ffn_layer="swiglu",
|
| ffn_bias=True,
|
| proj_bias=True,
|
| n_storage_tokens=4,
|
| mask_k_bias=True,
|
| pretrained=pretrained,
|
| weights=weights,
|
| compact_arch_name="vithplus",
|
| check_hash=check_hash,
|
| **kwargs,
|
| )
|
|
|
|
|
| def dinov3_vit7b16(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| check_hash: bool = False,
|
| **kwargs,
|
| ):
|
| if weights == Weights.LVD1689M:
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "a955f4ea"
|
| elif weights == Weights.SAT493M:
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = "a6675841"
|
| kwargs["version"] = None
|
| untie_global_and_local_cls_norm = True
|
| return _make_dinov3_vit(
|
| img_size=224,
|
| patch_size=16,
|
| in_chans=3,
|
| pos_embed_rope_base=100,
|
| pos_embed_rope_normalize_coords="separate",
|
| pos_embed_rope_rescale_coords=2,
|
| pos_embed_rope_dtype="fp32",
|
| embed_dim=4096,
|
| depth=40,
|
| num_heads=32,
|
| ffn_ratio=3,
|
| qkv_bias=False,
|
| drop_path_rate=0.4,
|
| layerscale_init=1.0e-05,
|
| norm_layer="layernormbf16",
|
| ffn_layer="swiglu64",
|
| ffn_bias=True,
|
| proj_bias=True,
|
| n_storage_tokens=4,
|
| mask_k_bias=True,
|
| untie_global_and_local_cls_norm=untie_global_and_local_cls_norm,
|
| pretrained=pretrained,
|
| weights=weights,
|
| compact_arch_name="vit7b",
|
| check_hash=check_hash,
|
| **kwargs,
|
| )
|
|
|
|
|
| def dinov3_convnext_tiny(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| **kwargs,
|
| ):
|
| _hash_convnext = "21b726bb"
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = _hash_convnext
|
|
|
| from ..models.convnext import convnext_sizes
|
|
|
| size_dict = convnext_sizes["tiny"]
|
|
|
| model = _make_dinov3_convnext(
|
| in_chans=3,
|
| depths=size_dict["depths"],
|
| dims=size_dict["dims"],
|
| compact_arch_name="convnext_tiny",
|
| drop_path_rate=0,
|
| layer_scale_init_value=1e-6,
|
| pretrained=pretrained,
|
| weights=weights,
|
| **kwargs,
|
| )
|
| if not pretrained:
|
| model.init_weights()
|
| return model
|
|
|
|
|
| def dinov3_convnext_small(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| **kwargs,
|
| ):
|
| _hash_convnext = "296db49d"
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = _hash_convnext
|
|
|
| from ..models.convnext import convnext_sizes
|
|
|
| size_dict = convnext_sizes["small"]
|
|
|
| model = _make_dinov3_convnext(
|
| in_chans=3,
|
| depths=size_dict["depths"],
|
| dims=size_dict["dims"],
|
| compact_arch_name="convnext_small",
|
| drop_path_rate=0,
|
| layer_scale_init_value=1e-6,
|
| pretrained=pretrained,
|
| weights=weights,
|
| **kwargs,
|
| )
|
| if not pretrained:
|
| model.init_weights()
|
| return model
|
|
|
|
|
| def dinov3_convnext_base(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| **kwargs,
|
| ):
|
| _hash_convnext = "801f2ba9"
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = _hash_convnext
|
|
|
| from ..models.convnext import convnext_sizes
|
|
|
| size_dict = convnext_sizes["base"]
|
|
|
| model = _make_dinov3_convnext(
|
| in_chans=3,
|
| depths=size_dict["depths"],
|
| dims=size_dict["dims"],
|
| compact_arch_name="convnext_base",
|
| drop_path_rate=0,
|
| layer_scale_init_value=1e-6,
|
| pretrained=pretrained,
|
| weights=weights,
|
| **kwargs,
|
| )
|
| if not pretrained:
|
| model.init_weights()
|
| return model
|
|
|
|
|
| def dinov3_convnext_large(
|
| *,
|
| pretrained: bool = True,
|
| weights: Union[Weights, str] = Weights.LVD1689M,
|
| **kwargs,
|
| ):
|
| _hash_convnext = "61fa432d"
|
| if "hash" not in kwargs:
|
| kwargs["hash"] = _hash_convnext
|
|
|
| from ..models.convnext import convnext_sizes
|
|
|
| size_dict = convnext_sizes["large"]
|
|
|
| model = _make_dinov3_convnext(
|
| in_chans=3,
|
| depths=size_dict["depths"],
|
| dims=size_dict["dims"],
|
| compact_arch_name="convnext_large",
|
| drop_path_rate=0,
|
| layer_scale_init_value=1e-6,
|
| pretrained=pretrained,
|
| weights=weights,
|
| **kwargs,
|
| )
|
| if not pretrained:
|
| model.init_weights()
|
| return model
|
|
|