depth-anything-3

Running on Zero

depth-anything-3 / depth_anything_3 /model /dinov2 /layers /block.py

linhaotong

update

b9f87ab 29 days ago

5.01 kB

	# flake8: noqa: F821
	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	# References:
	# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
	# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

	import logging
	from typing import Callable, Optional
	import torch
	from torch import Tensor, nn

	from .attention import Attention
	from .drop_path import DropPath
	from .layer_scale import LayerScale
	from .mlp import Mlp

	logger = logging.getLogger("dinov2")
	XFORMERS_AVAILABLE = True


	class Block(nn.Module):
	def __init__(
	self,
	dim: int,
	num_heads: int,
	mlp_ratio: float = 4.0,
	qkv_bias: bool = False,
	proj_bias: bool = True,
	ffn_bias: bool = True,
	drop: float = 0.0,
	attn_drop: float = 0.0,
	init_values=None,
	drop_path: float = 0.0,
	act_layer: Callable[..., nn.Module] = nn.GELU,
	norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
	attn_class: Callable[..., nn.Module] = Attention,
	ffn_layer: Callable[..., nn.Module] = Mlp,
	qk_norm: bool = False,
	rope=None,
	ln_eps: float = 1e-6,
	) -> None:
	super().__init__()
	# print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
	self.norm1 = norm_layer(dim, eps=ln_eps)
	self.attn = attn_class(
	dim,
	num_heads=num_heads,
	qkv_bias=qkv_bias,
	proj_bias=proj_bias,
	attn_drop=attn_drop,
	proj_drop=drop,
	qk_norm=qk_norm,
	rope=rope,
	)
	self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
	self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

	self.norm2 = norm_layer(dim, eps=ln_eps)
	mlp_hidden_dim = int(dim * mlp_ratio)
	self.mlp = ffn_layer(
	in_features=dim,
	hidden_features=mlp_hidden_dim,
	act_layer=act_layer,
	drop=drop,
	bias=ffn_bias,
	)
	self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
	self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

	self.sample_drop_ratio = drop_path

	def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
	def attn_residual_func(x: Tensor, pos=None, attn_mask=None) -> Tensor:
	return self.ls1(self.attn(self.norm1(x), pos=pos, attn_mask=attn_mask))

	def ffn_residual_func(x: Tensor) -> Tensor:
	return self.ls2(self.mlp(self.norm2(x)))

	if self.training and self.sample_drop_ratio > 0.1:
	# the overhead is compensated only for a drop path rate larger than 0.1
	x = drop_add_residual_stochastic_depth(
	x,
	residual_func=attn_residual_func,
	sample_drop_ratio=self.sample_drop_ratio,
	pos=pos,
	)
	x = drop_add_residual_stochastic_depth(
	x,
	residual_func=ffn_residual_func,
	sample_drop_ratio=self.sample_drop_ratio,
	)
	elif self.training and self.sample_drop_ratio > 0.0:
	x = x + self.drop_path1(attn_residual_func(x, pos=pos, attn_mask=attn_mask))
	x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
	else:
	x = x + attn_residual_func(x, pos=pos, attn_mask=attn_mask)
	x = x + ffn_residual_func(x)
	return x


	def drop_add_residual_stochastic_depth(
	x: Tensor,
	residual_func: Callable[[Tensor], Tensor],
	sample_drop_ratio: float = 0.0,
	pos: Optional[Tensor] = None,
	) -> Tensor:
	# 1) extract subset using permutation
	b, n, d = x.shape
	sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
	brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
	x_subset = x[brange]

	# 2) apply residual_func to get residual
	if pos is not None:
	# if necessary, apply rope to the subset
	pos = pos[brange]
	residual = residual_func(x_subset, pos=pos)
	else:
	residual = residual_func(x_subset)

	x_flat = x.flatten(1)
	residual = residual.flatten(1)

	residual_scale_factor = b / sample_subset_size

	# 3) add the residual
	x_plus_residual = torch.index_add(
	x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
	)
	return x_plus_residual.view_as(x)


	def get_branges_scales(x, sample_drop_ratio=0.0):
	b, n, d = x.shape
	sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
	brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
	residual_scale_factor = b / sample_subset_size
	return brange, residual_scale_factor