Spaces:

osherr
/

Prior2DSM

Sleeping

App Files Files Community

Prior2DSM / src /dinov3 /hub /backbones.py

osherr

Upload 222 files

bc90483 verified 5 months ago

raw

history blame contribute delete

17.8 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	#
	# This software may be used and distributed in accordance with
	# the terms of the DINOv3 License Agreement.

	import os
	from enum import Enum
	from typing import List, Optional, Union
	from urllib.parse import urlparse
	from pathlib import Path

	import torch

	from .utils import DINOV3_BASE_URL


	class Weights(Enum):
	LVD1689M = "LVD1689M"
	SAT493M = "SAT493M"


	def is_url(path: str) -> bool:
	parsed = urlparse(path)
	return parsed.scheme in ("https", "file")


	def convert_path_or_url_to_url(path: str) -> str:
	if is_url(path):
	return path
	return Path(path).expanduser().resolve().as_uri()


	def _make_dinov3_vit_model_arch(
	*,
	patch_size: int = 16,
	compact_arch_name: str = "vitb",
	):
	if "plus" in compact_arch_name:
	model_arch = compact_arch_name.replace("plus", f"{patch_size}plus")
	else:
	model_arch = f"{compact_arch_name}{patch_size}"
	return model_arch


	def _make_dinov3_vit_model_url(
	*,
	patch_size: int = 16,
	compact_arch_name: str = "vitb",
	version: Optional[str] = None,
	weights: Union[Weights, str] = Weights.LVD1689M,
	hash: Optional[str] = None,
	):
	model_name = "dinov3"
	model_arch = _make_dinov3_vit_model_arch(patch_size=patch_size, compact_arch_name=compact_arch_name)
	version_suffix = f"_{version}" if version else ""
	weights_name = weights.value.lower()
	hash_suffix = f"-{hash}" if hash else ""
	model_dir = f"{model_name}_{model_arch}"
	model_filename = f"{model_name}_{model_arch}_pretrain_{weights_name}{version_suffix}{hash_suffix}.pth"
	return os.path.join(DINOV3_BASE_URL, model_dir, model_filename)


	def _make_dinov3_vit(
	*,
	img_size: int = 224,
	patch_size: int = 16,
	in_chans: int = 3,
	compact_arch_name: str = "vitb",
	pos_embed_rope_base: float = 100.0,
	pos_embed_rope_min_period: float \| None = None,
	pos_embed_rope_max_period: float \| None = None,
	pos_embed_rope_normalize_coords: str = "separate",
	pos_embed_rope_shift_coords: float \| None = None,
	pos_embed_rope_jitter_coords: float \| None = None,
	pos_embed_rope_rescale_coords: float \| None = None,
	pos_embed_rope_dtype: str = "fp32",
	embed_dim: int = 768,
	depth: int = 12,
	num_heads: int = 12,
	ffn_ratio: float = 4.0,
	qkv_bias: bool = True,
	drop_path_rate: float = 0.0,
	layerscale_init: float \| None = None,
	norm_layer: str = "layernorm",
	ffn_layer: str = "mlp",
	ffn_bias: bool = True,
	proj_bias: bool = True,
	n_storage_tokens: int = 0,
	mask_k_bias: bool = False,
	pretrained: bool = True,
	version: Optional[str] = None,
	weights: Union[Weights, str] = Weights.LVD1689M,
	hash: Optional[str] = None,
	check_hash: bool = False,
	**kwargs,
	):
	from ..models.vision_transformer import DinoVisionTransformer

	vit_kwargs = dict(
	img_size=img_size,
	patch_size=patch_size,
	in_chans=in_chans,
	pos_embed_rope_base=pos_embed_rope_base,
	pos_embed_rope_min_period=pos_embed_rope_min_period,
	pos_embed_rope_max_period=pos_embed_rope_max_period,
	pos_embed_rope_normalize_coords=pos_embed_rope_normalize_coords,
	pos_embed_rope_shift_coords=pos_embed_rope_shift_coords,
	pos_embed_rope_jitter_coords=pos_embed_rope_jitter_coords,
	pos_embed_rope_rescale_coords=pos_embed_rope_rescale_coords,
	pos_embed_rope_dtype=pos_embed_rope_dtype,
	embed_dim=embed_dim,
	depth=depth,
	num_heads=num_heads,
	ffn_ratio=ffn_ratio,
	qkv_bias=qkv_bias,
	drop_path_rate=drop_path_rate,
	layerscale_init=layerscale_init,
	norm_layer=norm_layer,
	ffn_layer=ffn_layer,
	ffn_bias=ffn_bias,
	proj_bias=proj_bias,
	n_storage_tokens=n_storage_tokens,
	mask_k_bias=mask_k_bias,
	)
	vit_kwargs.update(**kwargs)
	model = DinoVisionTransformer(**vit_kwargs)
	if pretrained:
	if type(weights) is Weights and weights not in {Weights.LVD1689M, Weights.SAT493M}:
	raise ValueError(f"Unsupported weights for the backbone: {weights}")
	elif type(weights) is Weights:
	url = _make_dinov3_vit_model_url(
	patch_size=patch_size,
	compact_arch_name=compact_arch_name,
	version=version,
	weights=weights,
	hash=hash,
	)
	else:
	url = convert_path_or_url_to_url(weights)
	state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu", check_hash=check_hash)
	model.load_state_dict(state_dict, strict=True)
	else:
	model.init_weights()
	return model


	def _make_dinov3_convnext_model_url(
	*,
	compact_arch_name: str = "convnext_base",
	weights: Union[Weights, str] = Weights.LVD1689M,
	hash: Optional[str] = None,
	):
	model_name = "dinov3"
	weights_name = weights.value.lower()
	hash_suffix = f"-{hash}" if hash else ""

	model_dir = f"{model_name}_{compact_arch_name}"
	model_filename = f"{model_name}_{compact_arch_name}_pretrain_{weights_name}{hash_suffix}.pth"
	return os.path.join(DINOV3_BASE_URL, model_dir, model_filename)


	def _make_dinov3_convnext(
	in_chans: int = 3,
	depths: List[int] = [3, 3, 27, 3],
	dims: List[int] = [128, 256, 512, 1024],
	compact_arch_name: str = "convnext_base",
	drop_path_rate: float = 0.0,
	layer_scale_init_value: float = 1e-6,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	hash: Optional[str] = None,
	**kwargs,
	):
	from ..models.convnext import ConvNeXt

	model_kwargs = dict(
	in_chans=in_chans,
	depths=depths,
	dims=dims,
	drop_path_rate=drop_path_rate,
	layer_scale_init_value=layer_scale_init_value,
	)
	model_kwargs.update(**kwargs)
	model = ConvNeXt(**model_kwargs)
	if pretrained:
	if type(weights) is Weights and weights not in {Weights.LVD1689M, Weights.SAT493M}:
	raise ValueError(f"Unsupported weights for the backbone: {weights}")
	elif type(weights) is Weights:
	url = _make_dinov3_convnext_model_url(
	compact_arch_name=compact_arch_name,
	weights=weights,
	hash=hash,
	)
	else:
	url = convert_path_or_url_to_url(weights)
	state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
	model.load_state_dict(state_dict, strict=True)
	return model


	def dinov3_vits16(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	check_hash: bool = False,
	**kwargs,
	):
	if "hash" not in kwargs:
	kwargs["hash"] = "08c60483"
	kwargs["version"] = None
	return _make_dinov3_vit(
	img_size=224,
	patch_size=16,
	in_chans=3,
	pos_embed_rope_base=100,
	pos_embed_rope_normalize_coords="separate",
	pos_embed_rope_rescale_coords=2,
	pos_embed_rope_dtype="fp32",
	embed_dim=384,
	depth=12,
	num_heads=6,
	ffn_ratio=4,
	qkv_bias=True,
	drop_path_rate=0.0,
	layerscale_init=1.0e-05,
	norm_layer="layernormbf16",
	ffn_layer="mlp",
	ffn_bias=True,
	proj_bias=True,
	n_storage_tokens=4,
	mask_k_bias=True,
	pretrained=pretrained,
	weights=weights,
	compact_arch_name="vits",
	check_hash=check_hash,
	**kwargs,
	)


	def dinov3_vits16plus(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	check_hash: bool = False,
	**kwargs,
	):
	if "hash" not in kwargs:
	kwargs["hash"] = "4057cbaa"
	kwargs["version"] = None
	return _make_dinov3_vit(
	img_size=224,
	patch_size=16,
	in_chans=3,
	pos_embed_rope_base=100,
	pos_embed_rope_normalize_coords="separate",
	pos_embed_rope_rescale_coords=2,
	pos_embed_rope_dtype="fp32",
	embed_dim=384,
	depth=12,
	num_heads=6,
	ffn_ratio=6,
	qkv_bias=True,
	drop_path_rate=0.0,
	layerscale_init=1.0e-05,
	norm_layer="layernormbf16",
	ffn_layer="swiglu",
	ffn_bias=True,
	proj_bias=True,
	n_storage_tokens=4,
	mask_k_bias=True,
	pretrained=pretrained,
	weights=weights,
	compact_arch_name="vitsplus",
	check_hash=check_hash,
	**kwargs,
	)


	def dinov3_vitb16(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	check_hash: bool = False,
	**kwargs,
	):
	if "hash" not in kwargs:
	kwargs["hash"] = "73cec8be"
	kwargs["version"] = None
	return _make_dinov3_vit(
	img_size=224,
	patch_size=16,
	in_chans=3,
	pos_embed_rope_base=100,
	pos_embed_rope_normalize_coords="separate",
	pos_embed_rope_rescale_coords=2,
	pos_embed_rope_dtype="fp32",
	embed_dim=768,
	depth=12,
	num_heads=12,
	ffn_ratio=4,
	qkv_bias=True,
	drop_path_rate=0.0,
	layerscale_init=1.0e-05,
	norm_layer="layernormbf16",
	ffn_layer="mlp",
	ffn_bias=True,
	proj_bias=True,
	n_storage_tokens=4,
	mask_k_bias=True,
	pretrained=pretrained,
	weights=weights,
	compact_arch_name="vitb",
	check_hash=check_hash,
	**kwargs,
	)


	def dinov3_vitl16(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	check_hash: bool = False,
	**kwargs,
	):
	untie_global_and_local_cls_norm = False
	if weights == Weights.LVD1689M:
	if "hash" not in kwargs:
	kwargs["hash"] = "8aa4cbdd"
	elif weights == Weights.SAT493M:
	if "hash" not in kwargs:
	kwargs["hash"] = "eadcf0ff"
	untie_global_and_local_cls_norm = True
	elif type(weights) is str:
	import re

	pattern = r"-(.{8}).pth"
	matches = re.findall(pattern, weights)
	if len(matches) != 1:
	raise ValueError(f"Unexpected weights specification for the ViT-L backbone: {weights}")
	hash = matches[0]
	if hash == "eadcf0ff":
	untie_global_and_local_cls_norm = True
	kwargs["version"] = None
	return _make_dinov3_vit(
	img_size=224,
	patch_size=16,
	in_chans=3,
	pos_embed_rope_base=100,
	pos_embed_rope_normalize_coords="separate",
	pos_embed_rope_rescale_coords=2,
	pos_embed_rope_dtype="fp32",
	embed_dim=1024,
	depth=24,
	num_heads=16,
	ffn_ratio=4,
	qkv_bias=True,
	drop_path_rate=0.0,
	layerscale_init=1.0e-05,
	norm_layer="layernormbf16",
	ffn_layer="mlp",
	ffn_bias=True,
	proj_bias=True,
	n_storage_tokens=4,
	mask_k_bias=True,
	untie_global_and_local_cls_norm=untie_global_and_local_cls_norm,
	pretrained=pretrained,
	weights=weights,
	compact_arch_name="vitl",
	check_hash=check_hash,
	**kwargs,
	)


	def dinov3_vitl16plus(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	check_hash: bool = False,
	**kwargs,
	):
	if "hash" not in kwargs:
	kwargs["hash"] = "46503df0"

	return _make_dinov3_vit(
	img_size=224,
	patch_size=16,
	in_chans=3,
	pos_embed_rope_base=100,
	pos_embed_rope_normalize_coords="separate",
	pos_embed_rope_rescale_coords=2,
	pos_embed_rope_dtype="fp32",
	embed_dim=1024,
	depth=24,
	num_heads=16,
	ffn_ratio=6.0,
	qkv_bias=True,
	drop_path_rate=0.0,
	layerscale_init=1.0e-05,
	norm_layer="layernormbf16",
	ffn_layer="swiglu",
	ffn_bias=True,
	proj_bias=True,
	n_storage_tokens=4,
	mask_k_bias=True,
	pretrained=pretrained,
	weights=weights,
	compact_arch_name="vitlplus",
	check_hash=check_hash,
	**kwargs,
	)


	def dinov3_vith16plus(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	check_hash: bool = False,
	**kwargs,
	):
	if "hash" not in kwargs:
	kwargs["hash"] = "7c1da9a5"

	return _make_dinov3_vit(
	img_size=224,
	patch_size=16,
	in_chans=3,
	pos_embed_rope_base=100,
	pos_embed_rope_normalize_coords="separate",
	pos_embed_rope_rescale_coords=2,
	pos_embed_rope_dtype="fp32",
	embed_dim=1280,
	depth=32,
	num_heads=20,
	ffn_ratio=6.0,
	qkv_bias=True,
	drop_path_rate=0.0,
	layerscale_init=1.0e-05,
	norm_layer="layernormbf16",
	ffn_layer="swiglu",
	ffn_bias=True,
	proj_bias=True,
	n_storage_tokens=4,
	mask_k_bias=True,
	pretrained=pretrained,
	weights=weights,
	compact_arch_name="vithplus",
	check_hash=check_hash,
	**kwargs,
	)


	def dinov3_vit7b16(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	check_hash: bool = False,
	**kwargs,
	):
	if weights == Weights.LVD1689M:
	if "hash" not in kwargs:
	kwargs["hash"] = "a955f4ea"
	elif weights == Weights.SAT493M:
	if "hash" not in kwargs:
	kwargs["hash"] = "a6675841"
	kwargs["version"] = None
	untie_global_and_local_cls_norm = True
	return _make_dinov3_vit(
	img_size=224,
	patch_size=16,
	in_chans=3,
	pos_embed_rope_base=100,
	pos_embed_rope_normalize_coords="separate",
	pos_embed_rope_rescale_coords=2,
	pos_embed_rope_dtype="fp32",
	embed_dim=4096,
	depth=40,
	num_heads=32,
	ffn_ratio=3,
	qkv_bias=False,
	drop_path_rate=0.4,
	layerscale_init=1.0e-05,
	norm_layer="layernormbf16",
	ffn_layer="swiglu64",
	ffn_bias=True,
	proj_bias=True,
	n_storage_tokens=4,
	mask_k_bias=True,
	untie_global_and_local_cls_norm=untie_global_and_local_cls_norm,
	pretrained=pretrained,
	weights=weights,
	compact_arch_name="vit7b",
	check_hash=check_hash,
	**kwargs,
	)


	def dinov3_convnext_tiny(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	**kwargs,
	):
	_hash_convnext = "21b726bb"
	if "hash" not in kwargs:
	kwargs["hash"] = _hash_convnext

	from ..models.convnext import convnext_sizes

	size_dict = convnext_sizes["tiny"]

	model = _make_dinov3_convnext(
	in_chans=3,
	depths=size_dict["depths"],
	dims=size_dict["dims"],
	compact_arch_name="convnext_tiny",
	drop_path_rate=0,
	layer_scale_init_value=1e-6,
	pretrained=pretrained,
	weights=weights,
	**kwargs,
	)
	if not pretrained:
	model.init_weights()
	return model


	def dinov3_convnext_small(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	**kwargs,
	):
	_hash_convnext = "296db49d"
	if "hash" not in kwargs:
	kwargs["hash"] = _hash_convnext

	from ..models.convnext import convnext_sizes

	size_dict = convnext_sizes["small"]

	model = _make_dinov3_convnext(
	in_chans=3,
	depths=size_dict["depths"],
	dims=size_dict["dims"],
	compact_arch_name="convnext_small",
	drop_path_rate=0,
	layer_scale_init_value=1e-6,
	pretrained=pretrained,
	weights=weights,
	**kwargs,
	)
	if not pretrained:
	model.init_weights()
	return model


	def dinov3_convnext_base(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	**kwargs,
	):
	_hash_convnext = "801f2ba9"
	if "hash" not in kwargs:
	kwargs["hash"] = _hash_convnext

	from ..models.convnext import convnext_sizes

	size_dict = convnext_sizes["base"]

	model = _make_dinov3_convnext(
	in_chans=3,
	depths=size_dict["depths"],
	dims=size_dict["dims"],
	compact_arch_name="convnext_base",
	drop_path_rate=0,
	layer_scale_init_value=1e-6,
	pretrained=pretrained,
	weights=weights,
	**kwargs,
	)
	if not pretrained:
	model.init_weights()
	return model


	def dinov3_convnext_large(
	*,
	pretrained: bool = True,
	weights: Union[Weights, str] = Weights.LVD1689M,
	**kwargs,
	):
	_hash_convnext = "61fa432d"
	if "hash" not in kwargs:
	kwargs["hash"] = _hash_convnext

	from ..models.convnext import convnext_sizes

	size_dict = convnext_sizes["large"]

	model = _make_dinov3_convnext(
	in_chans=3,
	depths=size_dict["depths"],
	dims=size_dict["dims"],
	compact_arch_name="convnext_large",
	drop_path_rate=0,
	layer_scale_init_value=1e-6,
	pretrained=pretrained,
	weights=weights,
	**kwargs,
	)
	if not pretrained:
	model.init_weights()
	return model