Spaces:

OK-AI
/

ViT-Patch-PCA-Visualisation

Running

App Files Files Community

Tenbatsu24 committed 8 days ago

Commit

a10ce46

1 Parent(s): 94c37e9

add: missing files

Browse files

Files changed (30) hide show

configuration_vitv2.py +28 -0
hf_src/__init__.py +0 -0
hf_src/layers/__init__.py +31 -0
hf_src/layers/attention.py +105 -0
hf_src/layers/block.py +331 -0
hf_src/layers/cva_head.py +184 -0
hf_src/layers/dino_head.py +76 -0
hf_src/layers/drop_path.py +31 -0
hf_src/layers/fp8_linear.py +144 -0
hf_src/layers/layer_scale.py +28 -0
hf_src/layers/mlp.py +49 -0
hf_src/layers/patch_embed.py +96 -0
hf_src/layers/rms_norm.py +24 -0
hf_src/layers/rope_attention.py +182 -0
hf_src/layers/rope_block.py +299 -0
hf_src/layers/rope_position_encoding.py +184 -0
hf_src/layers/sparse_linear.py +94 -0
hf_src/layers/swiglu_ffn.py +64 -0
hf_src/model/__init__.py +0 -0
hf_src/model/image/__init__.py +0 -0
hf_src/model/image/vitv2/__init__.py +0 -0
hf_src/model/image/vitv2/transformer.py +475 -0
hf_src/utils/__init__.py +16 -0
hf_src/utils/download.py +99 -0
hf_src/utils/dtype.py +37 -0
hf_src/utils/masking.py +113 -0
hf_src/utils/seedlet_masking.py +0 -0
hf_src/utils/utils.py +136 -0
modelling_vitv2.py +32 -0
requirements.txt +2 -0

configuration_vitv2.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from transformers import PretrainedConfig
+class ViTv2Config(PretrainedConfig):
+    model_type = "vitv2"
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        num_register_tokens=0,
+        init_values=None,
+        **ignored_kwargs,
+    ):
+        super().__init__(**ignored_kwargs)
+        self.depth = depth
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.patch_size = patch_size
+        self.init_values = init_values
+        self.num_register_tokens = num_register_tokens

hf_src/__init__.py ADDED Viewed

File without changes

hf_src/layers/__init__.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from .mlp import Mlp
+from .block import Block  # noqa: F401
+from .rms_norm import RMSNorm
+from .drop_path import DropPath
+from .dino_head import DINOHead
+from .layer_scale import LayerScale
+from .patch_embed import PatchEmbed
+from .block import NestedTensorBlock
+from .attention import MemEffAttention
+from .rope_block import SelfAttentionBlock
+from .cva_head import CVAHead, IdentityHead
+from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+from .rope_position_encoding import RopePositionEmbedding
+__all__ = [
+    "CVAHead",
+    "RMSNorm",
+    "IdentityHead",
+    "DINOHead",
+    "DropPath",
+    "Block",
+    "Mlp",
+    "PatchEmbed",
+    "LayerScale",
+    "SwiGLUFFN",
+    "SwiGLUFFNFused",
+    "NestedTensorBlock",
+    "MemEffAttention",
+    "SelfAttentionBlock",
+    "RopePositionEmbedding",
+]

hf_src/layers/attention.py ADDED Viewed

	@@ -0,0 +1,105 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import os
+from torch import Tensor
+from torch import nn
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import memory_efficient_attention, unbind
+        XFORMERS_AVAILABLE = True
+    else:
+        raise ImportError
+except ImportError:
+    XFORMERS_AVAILABLE = False
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: Tensor, return_attn=False) -> Tensor:
+        """
+        Adapted from https://gitlab.com/ziegleto-machine-learning/dino/-/tree/main/
+        """
+        B, N, C = x.shape
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        # Adaptation for returing attentions
+        if return_attn:
+            return attn
+        return x
+class MemEffAttention(Attention):
+    """
+    Adapted from https://gitlab.com/ziegleto-machine-learning/dino/-/tree/main/
+    """
+    def forward(self, x: Tensor, attn_bias=None, return_attn=False) -> Tensor:
+        if not XFORMERS_AVAILABLE:
+            assert attn_bias is None, "xFormers is required for nested tensors usage"
+            # Change this line
+            # return super().forward(x)
+            # Adaptation for returing attentions
+            return super().forward(x, return_attn)
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = unbind(qkv, 2)
+        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+        if return_attn:
+            # Support for XFORMERS to return attention
+            # Adapted from https://github.com/facebookresearch/dinov2/issues/90#issuecomment-1574001076
+            attn = x.permute(0, 2, 1, 3) @ v.permute(0, 2, 3, 1)
+            return attn
+        x = x.reshape([B, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+if __name__ == "__main__":
+    import torch
+    _att = MemEffAttention(dim=32, num_heads=4).to("cuda")
+    print(_att(torch.randn(4, 16, 32, device="cuda"), return_attn=True).shape)
+    print(_att(torch.randn(4, 16, 32, device="cuda")).shape)

hf_src/layers/block.py ADDED Viewed

	@@ -0,0 +1,331 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+import os
+from typing import Callable, List, Any, Tuple, Dict
+import torch
+from torch import nn, Tensor
+from .attention import Attention, MemEffAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import fmha
+        XFORMERS_AVAILABLE = True
+    else:
+        raise ImportError
+except ImportError:
+    XFORMERS_AVAILABLE = False
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        ffn_bias: bool = True,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values=None,
+        drop_path: float = 0.0,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_class: Callable[..., nn.Module] = Attention,
+        ffn_layer: Callable[..., nn.Module] = Mlp,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_class(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            proj_bias=proj_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.ls1 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = ffn_layer(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+            bias=ffn_bias,
+        )
+        self.ls2 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.sample_drop_ratio = drop_path
+    def forward(self, x: Tensor, return_attention=False) -> Tensor:
+        """
+        Adapted from https://gitlab.com/ziegleto-machine-learning/dino/-/tree/main/
+        """
+        def attn_residual_func(x: Tensor) -> Tensor:
+            return self.ls1(self.attn(self.norm1(x)))
+        def ffn_residual_func(x: Tensor) -> Tensor:
+            return self.ls2(self.mlp(self.norm2(x)))
+        # Adaptation for returning attentions
+        if return_attention:
+            attn = self.attn(self.norm1(x), return_attn=True)
+        if self.training and self.sample_drop_ratio > 0.1:
+            # the overhead is compensated only for a drop path rate larger than 0.1
+            x = drop_add_residual_stochastic_depth(
+                x,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+            )
+            x = drop_add_residual_stochastic_depth(
+                x,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+            )
+        elif self.training and self.sample_drop_ratio > 0.0:
+            x = x + self.drop_path1(attn_residual_func(x))
+            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
+        else:
+            x = x + attn_residual_func(x)
+            x = x + ffn_residual_func(x)
+        # Adaptation for returing attentions
+        if return_attention:
+            return x, attn
+        return x
+def drop_add_residual_stochastic_depth(
+    x: Tensor,
+    residual_func: Callable[[Tensor], Tensor],
+    sample_drop_ratio: float = 0.0,
+) -> Tensor:
+    """Add a residual that is evaluated on a random subset of the batch only.
+    Args:
+        x: 3-D tensor; the first dimension is treated as the batch.
+        residual_func: Branch applied to the selected samples.
+        sample_drop_ratio: Fraction of the batch that skips the branch.
+    Returns:
+        A new tensor shaped like ``x`` in which the selected samples received
+        the residual, scaled by ``batch / subset_size``.
+    """
+    # 1) extract subset using permutation
+    b, n, d = x.shape
+    # Always keep at least one sample so the branch is never skipped entirely.
+    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+    x_subset = x[brange]
+    # 2) apply residual_func to get residual
+    residual = residual_func(x_subset)
+    x_flat = x.flatten(1)
+    residual = residual.flatten(1)
+    # Scale up the residual of the kept samples by batch / subset_size.
+    residual_scale_factor = b / sample_subset_size
+    # 3) add the residual
+    # torch.index_add is the out-of-place variant, so ``x`` is not modified.
+    x_plus_residual = torch.index_add(
+        x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
+    )
+    return x_plus_residual.view_as(x)
+def get_branges_scales(x, sample_drop_ratio=0.0):
+    b, n, d = x.shape
+    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+    residual_scale_factor = b / sample_subset_size
+    return brange, residual_scale_factor
+def add_residual(x, brange, residual, residual_scale_factor, ls=None):
+    """Add ``residual`` (scaled) to the rows of ``x`` selected by ``brange``.
+    Args:
+        x: Tensor receiving the residual; updated with in-place ``index_add_``.
+        brange: Indices along dim 0 of the rows to update.
+        residual: Branch output for the selected rows.
+        residual_scale_factor: Multiplier passed as ``alpha`` to ``index_add_``.
+        ls: Optional callable applied to the residual before the addition
+            (callers in this file pass a ``LayerScale`` module or ``None``).
+    Returns:
+        The updated tensor. NOTE: without ``ls`` the result is flattened from
+        dim 1 onwards, with ``ls`` it keeps the shape of ``x``; callers in
+        this file reshape it with ``view_as``.
+    """
+    if ls is None:
+        x_flat = x.flatten(1)
+        residual = residual.flatten(1)
+        # In-place add on the flattened tensor.
+        # NOTE(review): flatten may return a view, in which case ``x`` itself
+        # is modified as well -- confirm this is intended.
+        x_plus_residual = x_flat.index_add_(
+            dim=0,
+            index=brange,
+            source=residual.to(dtype=x.dtype),
+            alpha=residual_scale_factor,
+        )
+    else:
+        # Here ``x`` is modified in place with the scaled residual.
+        x_plus_residual = x.index_add_(
+            dim=0,
+            source=ls(residual.to(dtype=x.dtype)),
+            index=brange,
+            alpha=residual_scale_factor,
+        )
+    return x_plus_residual
+# Module-level cache of attention biases, keyed by the tuple of
+# (batch size, sequence length) pairs of the tensors being nested together.
+# Entries are never evicted.
+attn_bias_cache: Dict[Tuple, Any] = {}
+def get_attn_bias_and_cat(x_list, branges=None):
+    """
+    this will perform the index select, cat the tensors, and provide the attn_bias from cache
+    Args:
+        x_list: Tensors to nest together; dim 0 is the batch and dim 1 the
+            sequence length.
+        branges: Optional per-tensor index tensors selecting the samples to
+            keep; when given, only those samples are concatenated.
+    Returns:
+        ``(attn_bias, cat_tensors)`` where ``attn_bias`` is the cached
+        ``fmha.BlockDiagonalMask`` for these shapes and ``cat_tensors`` packs
+        all selected tokens into a single batch entry (leading dimension 1).
+    """
+    batch_sizes = (
+        [b.shape[0] for b in branges]
+        if branges is not None
+        else [x.shape[0] for x in x_list]
+    )
+    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+    if all_shapes not in attn_bias_cache.keys():
+        # One sequence-length entry per sample, so the block-diagonal mask has
+        # one block per sample.
+        seqlens = []
+        for b, x in zip(batch_sizes, x_list):
+            for _ in range(b):
+                seqlens.append(x.shape[1])
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+        # The batch sizes are stored on the mask object itself.
+        # NOTE(review): presumably consumed by attn_bias.split -- confirm.
+        attn_bias._batch_sizes = batch_sizes
+        attn_bias_cache[all_shapes] = attn_bias
+    if branges is not None:
+        # Select the kept samples of every tensor, flatten them and pack all
+        # tokens into one sequence of shape (1, total_tokens, channels).
+        cat_tensors = torch.cat(
+            [
+                _s.index_select(0, _i).reshape(-1)
+                for _s, _i in zip([_x.flatten(1) for _x in x_list], branges)
+            ],
+            dim=0,
+        ).view(1, -1, x_list[0].shape[-1])
+        # cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(
+        #     1, -1, x_list[0].shape[-1]
+        # )
+    else:
+        # Merge batch and sequence dims of each tensor, then concatenate
+        # along the token dimension.
+        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+        cat_tensors = torch.cat(tensors_bs1, dim=1)
+    return attn_bias_cache[all_shapes], cat_tensors
+def drop_add_residual_stochastic_depth_list(
+    x_list: List[Tensor],
+    residual_func: Callable[[Tensor, Any], Tensor],
+    sample_drop_ratio: float = 0.0,
+    scaling_vector=None,
+) -> List[Tensor]:
+    """Apply a sample-dropped residual branch to a list of tensors at once.
+    Args:
+        x_list: Tensors to process together as one nested batch.
+        residual_func: Branch called as ``residual_func(x_cat, attn_bias=...)``.
+        sample_drop_ratio: Fraction of each tensor's batch that skips the branch.
+        scaling_vector: Optional callable passed to ``add_residual`` as ``ls``.
+    Returns:
+        A list with one updated tensor per input, each shaped like its input.
+    """
+    # 1) generate random set of indices for dropping samples in the batch
+    branges_scales = [
+        get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list
+    ]
+    branges = [s[0] for s in branges_scales]
+    residual_scale_factors = [s[1] for s in branges_scales]
+    # 2) get attention bias and index+concat the tensors
+    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+    # 3) apply residual_func to get residual, and split the result
+    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
+    outputs = []
+    for x, brange, residual, residual_scale_factor in zip(
+        x_list, branges, residual_list, residual_scale_factors
+    ):
+        # add_residual may return a flattened tensor, hence the view_as.
+        outputs.append(
+            add_residual(
+                x, brange, residual, residual_scale_factor, scaling_vector
+            ).view_as(x)
+        )
+    return outputs
+class NestedTensorBlock(Block):
+    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+        """
+        x_list contains a list of tensors to nest together and run
+        """
+        assert isinstance(self.attn, MemEffAttention)
+        if self.training and self.sample_drop_ratio > 0.0:
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.attn(self.norm1(x), attn_bias=attn_bias)
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.mlp(self.norm2(x))
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls1 if isinstance(self.ls1, LayerScale) else None,
+            )
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls2 if isinstance(self.ls1, LayerScale) else None,
+            )
+            return x_list
+        else:
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls2(self.mlp(self.norm2(x)))
+            attn_bias, x = get_attn_bias_and_cat(x_list)
+            x = x + attn_residual_func(x, attn_bias=attn_bias)
+            x = x + ffn_residual_func(x)
+            return attn_bias.split(x)
+    def forward(self, x_or_x_list, return_attention=False):
+        """
+        Adapted from https://gitlab.com/ziegleto-machine-learning/dino/-/tree/main/
+        """
+        if isinstance(x_or_x_list, Tensor):
+            # Change the following line
+            # return super().forward(x_or_x_list)
+            return super().forward(x_or_x_list, return_attention)
+        elif isinstance(x_or_x_list, list):
+            if return_attention:
+                raise NotImplementedError(
+                    "return_attention not supported for nested tensors"
+                )
+            assert (
+                XFORMERS_AVAILABLE
+            ), "Please install xFormers for nested tensors usage"
+            return self.forward_nested(x_or_x_list)
+        else:
+            raise AssertionError
+if __name__ == "__main__":
+    # Manual smoke test for Block and NestedTensorBlock.
+    # Pick the first available accelerator, falling back to the CPU.
+    _device = (
+        "cuda"
+        if torch.cuda.is_available()
+        else "mps" if torch.backends.mps.is_available() else "cpu"
+    )
+    # Example usage
+    # A freshly built module is in training mode, and drop_path=0.3 > 0.1, so
+    # this call goes through drop_add_residual_stochastic_depth.
+    block = Block(dim=64, num_heads=8, drop_path=0.3).to(_device)
+    x = torch.randn(
+        10, 16, 64, device=_device
+    )  # Batch size 10, sequence length 16, feature dimension 64
+    output = block(x)
+    print(output.shape)  # Should be (10, 16, 64)
+    # NOTE: list inputs require xFormers; NestedTensorBlock.forward asserts
+    # XFORMERS_AVAILABLE, so this part fails when xFormers is not installed.
+    nested_block = NestedTensorBlock(
+        dim=64, num_heads=8, attn_class=MemEffAttention
+    ).to(_device)
+    nested_x = [
+        torch.randn(10, 16, 64, device=_device),
+        torch.randn(10, 16, 64, device=_device),
+    ]  # List of tensors
+    nested_output = nested_block(nested_x)
+    print(
+        [o.shape for o in nested_output]
+    )  # Should print shapes of tensors in the list

hf_src/layers/cva_head.py ADDED Viewed

	@@ -0,0 +1,184 @@

+from functools import partial
+import torch
+import torch.nn as nn
+from einops import rearrange
+from torch.nn.init import trunc_normal_
+def _make_lna_block(input_dim, output_dim, bias, norm_op, activation):
+    layers = [nn.Linear(input_dim, output_dim, bias=bias)]
+    if norm_op is not None:
+        layers.append(norm_op(output_dim))
+    if activation is not None:
+        layers.append(activation())
+    return nn.Sequential(*layers)
+def _build_projector(n_layers, in_dim, out_dim, hidden_dim, activation=nn.GELU):
+    """Build the projector MLP as a flat ``nn.Sequential``.
+    For ``n_layers > 1``: ``n_layers - 1`` Linear/BatchNorm/activation stages
+    of width ``hidden_dim`` followed by a bias-free Linear to ``out_dim`` and
+    a BatchNorm. Otherwise: a single bias-free Linear plus BatchNorm.
+    """
+    # BatchNorm without running statistics.
+    norm_op = partial(nn.BatchNorm1d, track_running_stats=False)
+    if n_layers > 1:
+        # ``layers`` is an nn.Sequential here, so ``+=`` relies on
+        # nn.Sequential supporting in-place concatenation
+        # (NOTE(review): confirm the minimum supported PyTorch version).
+        layers = _make_lna_block(in_dim, hidden_dim, True, norm_op, activation)
+        for _ in range(n_layers - 2):
+            layers += _make_lna_block(hidden_dim, hidden_dim, True, norm_op, activation)
+        layers += nn.Sequential(
+            *[nn.Linear(hidden_dim, out_dim, bias=False), norm_op(out_dim)]
+        )
+        # Re-wrap the child modules into a new flat Sequential.
+        return nn.Sequential(*layers)
+    else:
+        layers = [nn.Linear(in_dim, out_dim, bias=False), norm_op(out_dim)]
+        return nn.Sequential(*layers)
+def _build_predictor(n_layers, in_out_dim, bottleneck_dim, activation=nn.GELU):
+    """Build the predictor MLP: ``in_out_dim -> bottleneck_dim -> ... -> in_out_dim``.
+    NOTE: ``layers`` starts as a Python list holding the first block as one
+    nested ``nn.Sequential``, while every later ``+=`` extends the list with
+    the child modules of the added block. The result therefore mixes a nested
+    first stage with flat later stages, and the parameter names follow that
+    layout; do not flatten it without migrating existing checkpoints.
+    """
+    # BatchNorm without running statistics.
+    norm_op = partial(nn.BatchNorm1d, track_running_stats=False)
+    layers = [_make_lna_block(in_out_dim, bottleneck_dim, True, norm_op, activation)]
+    for _ in range(n_layers - 1):
+        layers += _make_lna_block(
+            bottleneck_dim, bottleneck_dim, True, norm_op, activation
+        )
+    # Final bias-free Linear back to the input width, without norm or activation.
+    layers += _make_lna_block(bottleneck_dim, in_out_dim, False, None, None)
+    return nn.Sequential(*layers)
+class CVAHead(nn.Module):
+    def __init__(
+        self,
+        in_dim,
+        out_dim=1024,
+        projector_layers=3,
+        predictor_layers=1,
+        hidden_dim=2048,
+        bottleneck_dim=256,
+        act_op=nn.GELU,
+        use_predictor=True,
+    ):
+        super().__init__()
+        projector_layers = max(projector_layers, 1)
+        self.projector = _build_projector(
+            projector_layers,
+            in_dim,
+            out_dim,
+            hidden_dim=hidden_dim,
+            activation=act_op,
+        )
+        if use_predictor:
+            self.predictor = _build_predictor(
+                predictor_layers,
+                out_dim,
+                bottleneck_dim,
+                activation=act_op,
+            )
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+    def project(self, latent):
+        if latent.ndim == 2:
+            return self.projector(latent)
+        if latent.ndim == 4:
+            # spatial_latent: (B, C, H, W)
+            b, _, h, w = latent.shape
+            flattened_latent = rearrange(latent, "b c h w -> (b h w) c").contiguous()
+            proj = self.projector(flattened_latent)
+            # make it spatial again
+            return rearrange(proj, "(b h w) c -> b c h w", b=b, h=h, w=w).contiguous()
+        if latent.ndim == 3:
+            # (B, N, C)
+            b, n, _ = latent.shape
+            return self.projector(latent.flatten(0, 1)).unflatten(0, (b, n))
+        raise ValueError(f"{latent.ndim=}D latent input is not supported")
+    def predict(self, latent):
+        if latent.ndim == 2:
+            return self.predictor(self.projector(latent))
+        if latent.ndim == 4:
+            # spatial_latent: (B, C, H, W)
+            b, _, h, w = latent.shape
+            flattened_latent = rearrange(latent, "b c h w -> (b h w) c").contiguous()
+            projection = self.projector(flattened_latent)
+            pred = self.predictor(projection)
+            # make it spatial again
+            return rearrange(pred, "(b h w) c -> b c h w", b=b, h=h, w=w).contiguous()
+        if latent.ndim == 3:
+            # (B, N, C)
+            b, n, _ = latent.shape
+            return self.predictor(self.projector(latent.flatten(0, 1))).unflatten(
+                0, (b, n)
+            )
+        raise ValueError(f"{latent.ndim=}D latent input is not supported")
+    def project_predict(self, latent):
+        projected = self.project(latent)
+        predicted = self.predictor(projected)
+        return projected, predicted
+    def forward(self, latent, project_only=False):
+        if project_only:
+            return self.project(latent)
+        return self.predict(latent)
+class IdentityHead(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+    def project(self, x):
+        return x
+    def predict(self, x):
+        return x
+    def project_predict(self, x):
+        return x, x
+    def forward(self, x, **kwargs):
+        return x
+class CVAHeadList(torch.nn.Module):
+    def __init__(self, num_scales=2, **params):
+        super().__init__()
+        self.heads = torch.nn.ModuleList([CVAHead(**params) for _ in range(num_scales)])
+    def forward(self, x, scale_idx, project_only=False):
+        return self.heads[scale_idx](x, project_only=project_only)
+if __name__ == "__main__":
+    # Manual smoke test: 768-d input tokens, 512-d projector output.
+    model = CVAHead(
+        768,
+        512,
+        hidden_dim=2048,
+        bottleneck_dim=256,
+        act_op=nn.GELU,
+    )
+    print(model)
+    # 3-D input (B=2, N=36, C=768): the head keeps the (B, N, ...) layout.
+    x = torch.randn(2, 36, 768)
+    out = model(x, project_only=True)
+    print("Output shape:", out.shape)  # Expected: (2, 36, 512)
+    out2 = model(x, project_only=False)
+    print("Output shape after prediction:", out2.shape)  # Expected: (2, 36, 512)

hf_src/layers/dino_head.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import torch
+import torch.nn as nn
+from torch.nn.init import trunc_normal_
+from torch.nn.utils.parametrizations import weight_norm
+class DINOHead(nn.Module):
+    """MLP head with an optional weight-normalised output layer.
+    Args:
+        in_dim: Channel size of the input features.
+        out_dim: Output size of the last layer.
+        use_bn: Whether the MLP uses BatchNorm1d after its hidden layers.
+        nlayers: Number of MLP layers (at least 1 is used).
+        hidden_dim: Hidden width of the MLP.
+        bottleneck_dim: Output width of the MLP / input width of the last layer.
+        mlp_bias: Bias switch for the hidden Linear layers.
+        use_last_layer: When false, ``forward`` returns the MLP output and no
+            ``last_layer`` is created.
+    """
+    def __init__(
+        self,
+        in_dim,
+        out_dim=2**16,
+        use_bn=False,
+        nlayers=3,
+        hidden_dim=2048,
+        bottleneck_dim=256,
+        mlp_bias=True,
+        use_last_layer=True,
+    ):
+        super().__init__()
+        nlayers = max(nlayers, 1)
+        self.use_last_layer = use_last_layer
+        self.mlp = _build_mlp(
+            nlayers,
+            in_dim,
+            bottleneck_dim,
+            hidden_dim=hidden_dim,
+            use_bn=use_bn,
+            bias=mlp_bias,
+        )
+        if use_last_layer:
+            self.last_layer = weight_norm(
+                nn.Linear(bottleneck_dim, out_dim, bias=False)
+            )
+            # ``original0`` is the magnitude term of the weight-norm
+            # parametrization; start it at 1.
+            self.last_layer.parametrizations.weight.original0.data.fill_(1)
+    def init_weights(self) -> None:
+        """Re-initialise every Linear layer; must be called explicitly."""
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        # Truncated-normal weights and zero biases for Linear layers only.
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+    def forward(self, x, **kwargs):
+        """Apply the MLP, then (optionally) L2-normalise and project.
+        Extra keyword arguments are accepted and ignored.
+        """
+        x = self.mlp(x)
+        if self.use_last_layer:
+            # Use the dtype's machine epsilon as the normalisation floor.
+            eps = torch.finfo(x.dtype).eps
+            x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
+            return self.last_layer(x)
+        else:
+            return x
+def _build_mlp(
+    nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True
+):
+    """Build the ``DINOHead`` MLP.
+    With ``nlayers == 1`` this is a single Linear; otherwise ``nlayers - 1``
+    Linear/(BatchNorm)/GELU stages of width ``hidden_dim`` followed by a
+    Linear to ``bottleneck_dim``.
+    NOTE: the layer producing ``bottleneck_dim`` always uses
+    ``bias=not use_bn``; the ``bias`` argument only affects hidden layers.
+    """
+    if nlayers == 1:
+        return nn.Linear(in_dim, bottleneck_dim, bias=not use_bn)
+    else:
+        layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
+        if use_bn:
+            layers.append(nn.BatchNorm1d(hidden_dim, track_running_stats=False))
+        layers.append(nn.GELU())
+        for _ in range(nlayers - 2):
+            layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
+            if use_bn:
+                layers.append(nn.BatchNorm1d(hidden_dim, track_running_stats=False))
+            layers.append(nn.GELU())
+        layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=not use_bn))
+        return nn.Sequential(*layers)

hf_src/layers/drop_path.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+from torch import nn
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0:
+        random_tensor.div_(keep_prob)
+    output = x * random_tensor
+    return output
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)

hf_src/layers/fp8_linear.py ADDED Viewed

	@@ -0,0 +1,144 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+import re
+import torch
+from hf_src.utils import named_replace
+from hf_src.layers.rope_attention import LinearKMaskedBias
+# avoid division by zero when calculating scale
+EPS = 1e-12
+def scale(t, amax_t):
+    max_v = torch.finfo(torch.float8_e4m3fn).max
+    scale_t = torch.clamp(amax_t.float(), min=EPS) / max_v
+    t_fp8 = (t / scale_t).to(torch.float8_e4m3fn)
+    return t_fp8, scale_t
+def matmul(first, amax_first, second_t, amax_second_t, bias):
+    """Compute ``first @ second_t.T (+ bias)`` via an fp8 matmul, in bfloat16.
+    Args:
+        first: Left operand.
+        amax_first: Absolute maxima used to scale ``first``.
+        second_t: Right operand, stored transposed.
+        amax_second_t: Absolute maxima used to scale ``second_t``.
+        bias: Optional bias added after rescaling, or ``None``.
+    """
+    first_fp8, scale_first = scale(first, amax_first)
+    second_t_fp8, scale_second_t = scale(second_t, amax_second_t)
+    # PyTorch's row-wise scaled matmul kernel is based on CUTLASS and is quite
+    # slow. Hence we fall back to an "unscaled" matmul, which uses cuBLAS, and
+    # apply the scale manually afterwards.
+    output = torch._scaled_mm(
+        first_fp8,
+        second_t_fp8.t(),
+        # Unit scales: the real scales are multiplied in below.
+        scale_a=scale_first.new_ones((1, 1)),
+        scale_b=scale_second_t.t().new_ones((1, 1)),
+        bias=None,
+        out_dtype=torch.bfloat16,
+        use_fast_accum=False,
+    )
+    # Undo the quantisation scales of both operands.
+    output = (output * scale_first * scale_second_t.t()).to(torch.bfloat16)
+    if bias is not None:
+        output = output + bias
+    return output
+@torch.compiler.allow_in_graph
+class Fp8LinearFn(torch.autograd.Function):
+    """Autograd function for a linear layer whose matmul runs in fp8.
+    The forward pass and the input gradient use the fp8 ``matmul`` helper;
+    the weight gradient uses a regular ``@`` matmul.
+    """
+    @staticmethod
+    def forward(ctx, a, b_t, bias):
+        # Absolute maxima along the last dimension of each operand.
+        amax_a = a.abs().amax(dim=-1, keepdim=True)
+        amax_b_t = b_t.abs().amax(dim=-1, keepdim=True)
+        out = matmul(a, amax_a, b_t, amax_b_t, bias)
+        # Remember which gradients backward has to produce.
+        ctx.a_requires_grad = a.requires_grad
+        ctx.b_requires_grad = b_t.requires_grad
+        ctx.bias_requires_grad = bias.requires_grad if bias is not None else False
+        # Only the global maximum of the weight is kept for backward.
+        ctx.save_for_backward(a, b_t, amax_b_t.max())
+        return out
+    @staticmethod
+    def backward(ctx, grad_out):
+        a, b_t, amax_b = ctx.saved_tensors
+        if ctx.a_requires_grad:
+            b = b_t.t().contiguous()
+            amax_grad_out = grad_out.abs().amax(dim=-1, keepdim=True)
+            # Reuse the global weight maximum for every row of ``b``.
+            amax_b = amax_b.repeat(b.shape[0], 1)
+            grad_a = matmul(grad_out, amax_grad_out, b, amax_b, None)
+        else:
+            grad_a = None
+        if ctx.b_requires_grad:
+            # Weight gradient via a regular (non-fp8) matmul.
+            grad_b = grad_out.t() @ a
+        else:
+            grad_b = None
+        if ctx.bias_requires_grad:
+            grad_bias = grad_out.sum(dim=0)
+        else:
+            grad_bias = None
+        return grad_a, grad_b, grad_bias
+class Fp8Linear(torch.nn.Linear):
+    """``nn.Linear`` whose forward pass goes through ``Fp8LinearFn``."""
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # Collapse all leading dimensions into one, since Fp8LinearFn works
+        # on 2-D operands, then restore them on the output.
+        out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, self.bias)
+        out = out.unflatten(0, input.shape[:-1])
+        return out
+class Fp8LinearKMaskedBias(LinearKMaskedBias):
+    """``LinearKMaskedBias`` whose forward pass goes through ``Fp8LinearFn``."""
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # NOTE(review): ``bias_mask`` is provided by LinearKMaskedBias, which
+        # is defined in rope_attention.py -- confirm its semantics there.
+        masked_bias = self.bias * self.bias_mask if self.bias is not None else None
+        # Collapse leading dimensions for the 2-D fp8 matmul, then restore them.
+        out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, masked_bias)
+        out = out.unflatten(0, input.shape[:-1])
+        return out
+def convert_linears_to_fp8(
+    root_module: torch.nn.Module, *, filter: str
+) -> torch.nn.Module:
+    """Replace matching Linear layers under ``root_module`` with fp8 variants.
+    Args:
+        root_module: Module tree to convert.
+        filter: Regular expression searched in each module name; only
+            matching Linear layers are converted.
+    Returns:
+        The value returned by ``named_replace`` (NOTE(review): presumably the
+        converted module tree -- confirm against ``hf_src.utils``).
+    Raises:
+        RuntimeError: A matching layer has a dimension that is not a multiple of 64.
+        AssertionError: A matching layer is an unsupported Linear subclass, or
+            no layer was converted at all.
+    """
+    filter_re = re.compile(filter)
+    total_count = 0
+    def replace(module: torch.nn.Module, name: str) -> torch.nn.Module:
+        nonlocal total_count
+        # Leave non-Linear modules and non-matching names untouched.
+        if not isinstance(module, torch.nn.Linear) or not filter_re.search(name):
+            return module
+        # Exact type comparison: subclasses other than the two handled here
+        # must not be converted silently.
+        if type(module) == torch.nn.Linear:
+            new_cls = Fp8Linear
+        elif type(module) == LinearKMaskedBias:
+            new_cls = Fp8LinearKMaskedBias
+        else:
+            assert False, str(type(module))
+        if module.in_features % 64 != 0 or module.out_features % 64 != 0:
+            # This is not a strict requirement, but H100 TensorCores for fp8
+            # operate on tiles of 64 elements anyways, and Inductor sometimes
+            # pads inner dims to become multiples of 64. Also, if one day we
+            # switch back to cuBLAS, it artificially requires dims to be
+            # multiples of 16.
+            raise RuntimeError(
+                "fp8 requires all dimensions to be multiples of 64 "
+                "(consider using ffn_layer=swiglu64 or higher)"
+            )
+        new_module = new_cls(
+            in_features=module.in_features,
+            out_features=module.out_features,
+            bias=module.bias is not None,
+            dtype=module.weight.dtype,
+            device=module.weight.device,
+        )
+        # Share the existing parameters instead of copying them.
+        new_module.weight = module.weight
+        new_module.bias = module.bias
+        total_count += 1
+        return new_module
+    out = named_replace(replace, root_module)
+    assert total_count > 0, "fp8: no layer found to convert"
+    # Force re-compile everything
+    torch._dynamo.reset_code_caches()
+    from torch._inductor.cudagraph_trees import reset_cudagraph_trees
+    reset_cudagraph_trees()
+    return out

hf_src/layers/layer_scale.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+from typing import Union
+import torch
+from torch import nn
+from torch import Tensor
class LayerScale(nn.Module):
    """Multiply the input by a learnable per-channel vector ``gamma``.

    ``gamma`` is allocated uninitialised (``torch.empty``); call
    ``reset_parameters()`` to fill it with ``init_values`` before use.

    Args:
        dim: Length of ``gamma``.
        init_values: Value written into ``gamma`` by ``reset_parameters``.
        inplace: If True, ``forward`` scales its input in place.
        device: Device on which ``gamma`` is allocated.
    """

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        self.init_values = init_values
        self.inplace = inplace
        self.gamma = nn.Parameter(torch.empty(dim, device=device))

    def reset_parameters(self):
        """Fill ``gamma`` with the constant ``init_values``."""
        nn.init.constant_(self.gamma, self.init_values)

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x`` scaled by ``gamma`` (mutating ``x`` when ``inplace``)."""
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma

hf_src/layers/mlp.py ADDED Viewed

	@@ -0,0 +1,49 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+from typing import Callable, List, Optional
+from torch import Tensor, nn
+from hf_src.utils import cat_keep_shapes, uncat_with_shapes
class ListForwardMixin(object):
    """Mixin that adds a list-based forward on top of a tensor ``forward``.

    Classes using the mixin must provide their own ``forward``.
    """

    def forward(self, x: Tensor):
        """Placeholder; concrete classes override this."""
        raise NotImplementedError

    def forward_list(self, x_list: List[Tensor]) -> List[Tensor]:
        """Run ``forward`` once over all tensors of ``x_list``.

        The inputs are merged with ``cat_keep_shapes``, passed through
        ``forward`` as a single tensor, and split back with
        ``uncat_with_shapes`` using the recorded shapes and token counts.
        """
        merged, shapes, num_tokens = cat_keep_shapes(x_list)
        return uncat_with_shapes(self.forward(merged), shapes, num_tokens)
class Mlp(nn.Module, ListForwardMixin):
    """Two-layer feed-forward network: fc1 -> activation -> dropout -> fc2 -> dropout.

    Args:
        in_features: Size of the input features.
        hidden_features: Size of the hidden layer; falls back to
            ``in_features`` when falsy.
        out_features: Size of the output; falls back to ``in_features``
            when falsy.
        act_layer: Zero-argument factory for the activation module.
        drop: Dropout probability, shared by both dropout applications.
        bias: Whether both linear layers carry a bias.
        device: Device on which the linear layers are created.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
        device=None,
    ) -> None:
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, device=device)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, device=device)
        # A single Dropout module is reused after the activation and after fc2.
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the feed-forward network to ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

hf_src/layers/patch_embed.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+import math
+from typing import Callable, Tuple, Union
+from torch import Tensor, nn
def make_2tuple(x):
    """Return ``x`` as a pair.

    A tuple is returned unchanged after asserting that it has exactly two
    elements; an int ``n`` becomes ``(n, n)``. Any other input fails an
    assertion.
    """
    if not isinstance(x, tuple):
        assert isinstance(x, int)
        return (x, x)
    assert len(x) == 2
    return x
+class PatchEmbed(nn.Module):
+    """
+    2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+    Args:
+        img_size: Image size.
+        patch_size: Patch token size.
+        in_chans: Number of input image channels.
+        embed_dim: Number of linear projection output channels.
+        norm_layer: Normalization layer.
+    """
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Callable | None = None,
+        flatten_embedding: bool = True,
+    ) -> None:
+        super().__init__()
+        image_HW = make_2tuple(img_size)
+        patch_HW = make_2tuple(patch_size)
+        patch_grid_size = (
+            image_HW[0] // patch_HW[0],
+            image_HW[1] // patch_HW[1],
+        )
+        self.img_size = image_HW
+        self.patch_size = patch_HW
+        self.patches_resolution = patch_grid_size
+        self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.flatten_embedding = flatten_embedding
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW
+        )
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x: Tensor) -> Tensor:
+        _, _, H, W = x.shape
+        # patch_H, patch_W = self.patch_size
+        # assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+        # assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
+        x = self.proj(x)  # B C H W
+        H, W = x.size(2), x.size(3)
+        x = x.flatten(2).transpose(1, 2)  # B HW C
+        x = self.norm(x)
+        if not self.flatten_embedding:
+            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
+        return x
+    def flops(self) -> float:
+        Ho, Wo = self.patches_resolution
+        flops = (
+            Ho
+            * Wo
+            * self.embed_dim
+            * self.in_chans
+            * (self.patch_size[0] * self.patch_size[1])
+        )
+        if self.norm is not None:
+            flops += Ho * Wo * self.embed_dim
+        return flops
+    def reset_parameters(self):
+        k = 1 / (self.in_chans * (self.patch_size[0] ** 2))
+        nn.init.uniform_(self.proj.weight, -math.sqrt(k), math.sqrt(k))
+        if self.proj.bias is not None:
+            nn.init.uniform_(self.proj.bias, -math.sqrt(k), math.sqrt(k))

hf_src/layers/rms_norm.py ADDED Viewed

	@@ -0,0 +1,24 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+import torch
+from torch import Tensor, nn
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable gain.

    Args:
        dim: Length of the gain vector ``weight``.
        eps: Constant added to the mean square before the reciprocal root.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def reset_parameters(self) -> None:
        """Reset the gain to all ones."""
        nn.init.constant_(self.weight, 1)

    def _norm(self, x: Tensor) -> Tensor:
        """Divide ``x`` by the root mean square of its last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x: Tensor) -> Tensor:
        """Normalise in float32, cast back to the input dtype, then apply the gain."""
        normed = self._norm(x.float())
        normed = normed.type_as(x)
        return normed * self.weight

hf_src/layers/rope_attention.py ADDED Viewed

	@@ -0,0 +1,182 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+import math
+from typing import List, Tuple
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from hf_src.utils import cat_keep_shapes, uncat_with_shapes
+# RoPE-related functions:
def rope_rotate_half(x: Tensor) -> Tensor:
    """Swap the two halves of the last dimension, negating the second half.

    Example on the last dimension::

        x:   [ x0  x1  x2  x3  x4  x5]
        out: [-x3 -x4 -x5  x0  x1  x2]
    """
    front, back = torch.chunk(x, 2, dim=-1)
    return torch.cat((back.neg(), front), dim=-1)
def rope_apply(x: Tensor, sin: Tensor, cos: Tensor) -> Tensor:
    """Rotate ``x`` with the given sine/cosine tables.

    Expected layout of the last dimension::

        x:   [..., D], eg [x0,     x1,   x2,   x3,   x4,   x5]
        sin: [..., D], eg [sin0, sin1, sin2, sin0, sin1, sin2]
        cos: [..., D], eg [cos0, cos1, cos2, cos0, cos1, cos2]
    """
    kept = x * cos
    rotated = rope_rotate_half(x) * sin
    return kept + rotated
class LinearKMaskedBias(nn.Linear):
    """``nn.Linear`` whose bias is multiplied element-wise by a ``bias_mask`` buffer.

    ``out_features`` must be a multiple of 3. When the layer has a bias, the
    mask buffer is created filled with NaN, so the layer outputs NaN until
    real mask values are written into it.
    NOTE(review): presumably the mask is loaded from a checkpoint and zeroes
    the K third of a fused QKV bias — confirm against the weights in use.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.out_features % 3 == 0
        if self.bias is None:
            return
        nan_mask = torch.full_like(self.bias, fill_value=math.nan)
        self.register_buffer("bias_mask", nan_mask)

    def forward(self, input: Tensor) -> Tensor:
        """Apply the linear map using the masked bias (or no bias at all)."""
        if self.bias is None:
            masked_bias = None
        else:
            masked_bias = self.bias * self.bias_mask.to(self.bias.dtype)
        return F.linear(input, self.weight, masked_bias)
+class SelfAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        mask_k_bias: bool = False,
+        device=None,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        linear_class = LinearKMaskedBias if mask_k_bias else nn.Linear
+        self.qkv = linear_class(dim, dim * 3, bias=qkv_bias, device=device)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias, device=device)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def apply_rope(
+        self, q: Tensor, k: Tensor, rope: Tensor | Tuple[Tensor, Tensor]
+    ) -> Tuple[Tensor, Tensor]:
+        # All operations will use the dtype of rope, the output is cast back to the dtype of q and k
+        q_dtype = q.dtype
+        k_dtype = k.dtype
+        sin, cos = rope
+        rope_dtype = sin.dtype
+        q = q.to(dtype=rope_dtype)
+        k = k.to(dtype=rope_dtype)
+        N = q.shape[-2]
+        prefix = N - sin.shape[-2]
+        assert prefix >= 0
+        q_prefix = q[:, :, :prefix, :]
+        q = rope_apply(q[:, :, prefix:, :], sin, cos)  # [B, head, hw, D//head]
+        q = torch.cat((q_prefix, q), dim=-2)  # [B, head, N, D//head]
+        k_prefix = k[:, :, :prefix, :]
+        k = rope_apply(k[:, :, prefix:, :], sin, cos)  # [B, head, hw, D//head]
+        k = torch.cat((k_prefix, k), dim=-2)  # [B, head, N, D//head]
+        q = q.to(dtype=q_dtype)
+        k = k.to(dtype=k_dtype)
+        return q, k
+    def forward(self, x: Tensor, attn_bias=None, rope: Tensor = None) -> Tensor:
+        qkv = self.qkv(x)
+        attn_v = self.compute_attention(qkv=qkv, attn_bias=attn_bias, rope=rope)
+        x = self.proj(attn_v)
+        x = self.proj_drop(x)
+        return x
+    def forward_list(self, x_list, attn_bias=None, rope_list=None) -> List[Tensor]:
+        assert len(x_list) == len(rope_list)  # should be enforced by the Block
+        x_flat, shapes, num_tokens = cat_keep_shapes(x_list)
+        qkv_flat = self.qkv(x_flat)
+        qkv_list = uncat_with_shapes(qkv_flat, shapes, num_tokens)
+        att_out = []
+        for _, (qkv, _, rope) in enumerate(zip(qkv_list, shapes, rope_list)):
+            att_out.append(self.compute_attention(qkv, attn_bias=attn_bias, rope=rope))
+        x_flat, shapes, num_tokens = cat_keep_shapes(att_out)
+        x_flat = self.proj(x_flat)
+        return uncat_with_shapes(x_flat, shapes, num_tokens)
+    def compute_attention(self, qkv: Tensor, attn_bias=None, rope=None) -> Tensor:
+        assert attn_bias is None
+        B, N, _ = qkv.shape
+        C = self.qkv.in_features
+        qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = torch.unbind(qkv, 2)
+        q, k, v = [t.transpose(1, 2) for t in [q, k, v]]
+        if rope is not None:
+            q, k = self.apply_rope(q, k, rope)
+        x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+        x = x.transpose(1, 2)
+        return x.reshape([B, N, C])
+class CausalSelfAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = attn_drop
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def init_weights(
+        self,
+        init_attn_std: float | None = None,
+        init_proj_std: float | None = None,
+        factor: float = 1.0,
+    ) -> None:
+        init_attn_std = init_attn_std or (self.dim**-0.5)
+        init_proj_std = init_proj_std or init_attn_std * factor
+        nn.init.normal_(self.qkv.weight, std=init_attn_std)
+        nn.init.normal_(self.proj.weight, std=init_proj_std)
+        if self.qkv.bias is not None:
+            nn.init.zeros_(self.qkv.bias)
+        if self.proj.bias is not None:
+            nn.init.zeros_(self.proj.bias)
+    def forward(self, x: Tensor, is_causal: bool = True) -> Tensor:
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = torch.unbind(qkv, 2)
+        q, k, v = [t.transpose(1, 2) for t in [q, k, v]]
+        x = torch.nn.functional.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            attn_mask=None,
+            dropout_p=self.attn_drop if self.training else 0,
+            is_causal=is_causal,
+        )
+        x = x.transpose(1, 2).contiguous().view(B, N, C)
+        x = self.proj_drop(self.proj(x))
+        return x

hf_src/layers/rope_block.py ADDED Viewed

	@@ -0,0 +1,299 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+from typing import Callable, List, Optional
+import torch
+from torch import Tensor, nn
+from hf_src.utils import cat_keep_shapes, uncat_with_shapes
+from .mlp import Mlp
+from .layer_scale import LayerScale  # , DropPath
+from .rope_attention import CausalSelfAttention, SelfAttention
# Process-wide TorchDynamo settings, applied as a side effect of importing this module.
# NOTE(review): presumably these keep torch.compile on static shapes and leave room for
# many compiled variants of the list-based block forward — confirm against the
# torch.compile / TorchDynamo configuration documentation.
torch._dynamo.config.automatic_dynamic_shapes = False
torch._dynamo.config.accumulated_cache_size_limit = 1024
class SelfAttentionBlock(nn.Module):
    """Pre-norm transformer block: an attention branch and an FFN branch, each added residually.

    Each branch output goes through ``ls1`` / ``ls2``, which is a ``LayerScale``
    when ``init_values`` is truthy and ``nn.Identity`` otherwise. In training
    mode with ``drop_path > 0`` each branch is evaluated only on a random
    subset of the batch and its residual is scaled up to compensate.

    Args:
        dim: Embedding size.
        num_heads: Number of attention heads.
        ffn_ratio: FFN hidden size as a multiple of ``dim``.
        qkv_bias: Forwarded to ``attn_class``.
        proj_bias: Forwarded to ``attn_class``.
        ffn_bias: Forwarded to ``ffn_layer`` as ``bias``.
        drop: Dropout probability, forwarded to both branches.
        attn_drop: Forwarded to ``attn_class``.
        init_values: LayerScale initial value; falsy disables LayerScale.
        drop_path: Fraction of samples skipped per branch during training.
        act_layer: Activation factory forwarded to ``ffn_layer``.
        norm_layer: Factory called with ``dim`` for both normalisation layers.
        attn_class: Attention module factory.
        ffn_layer: Feed-forward module factory.
        mask_k_bias: Forwarded to ``attn_class``.
        device: Forwarded to the attention, FFN and LayerScale modules.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        ffn_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = SelfAttention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        mask_k_bias: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            mask_k_bias=mask_k_bias,
            device=device,
        )
        self.ls1 = (
            LayerScale(dim, init_values=init_values, device=device)
            if init_values
            else nn.Identity()
        )
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * ffn_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
            device=device,
        )
        self.ls2 = (
            LayerScale(dim, init_values=init_values, device=device)
            if init_values
            else nn.Identity()
        )
        # Fraction of the batch that skips each residual branch in training.
        self.sample_drop_ratio = drop_path
    @staticmethod
    def _maybe_index_rope(
        rope: tuple[Tensor, Tensor] | None, indices: Tensor
    ) -> tuple[Tensor, Tensor] | None:
        """Select the rope entries of the samples in ``indices``, if rope is per-sample.

        Returns None for a None rope; a 4-D ``(sin, cos)`` pair is indexed along
        its first dimension, any other rank is returned unchanged.
        """
        if rope is None:
            return None
        sin, cos = rope
        assert sin.ndim == cos.ndim
        if sin.ndim == 4:
            # If the rope embedding has a batch dimension (is different for each batch element), index into it
            return sin[indices], cos[indices]  # [batch, heads, patches, embed_dim]
        else:
            # No batch dimension, do not index
            return sin, cos  # [heads, patches, embed_dim] or [patches, embed_dim]
    def _forward(self, x: Tensor, rope=None) -> Tensor:
        """
        This is the reference implementation for a single tensor, matching what is done below for a list.
        We call the list op on [x] instead of this function.
        """
        b, _, _ = x.shape
        # Keep at least one sample; the kept fraction is 1 - sample_drop_ratio.
        sample_subset_size = max(int(b * (1 - self.sample_drop_ratio)), 1)
        # Residuals of the kept samples are scaled by batch / subset size.
        residual_scale_factor = b / sample_subset_size
        if self.training and self.sample_drop_ratio > 0.0:
            indices_1 = (torch.randperm(b, device=x.device))[:sample_subset_size]
            x_subset_1 = x[indices_1]
            rope_subset = self._maybe_index_rope(rope, indices_1)
            residual_1 = self.attn(self.norm1(x_subset_1), rope=rope_subset)
            # Add the scaled attention residual back onto the selected rows only.
            x_attn = torch.index_add(
                x,
                dim=0,
                source=self.ls1(residual_1),
                index=indices_1,
                alpha=residual_scale_factor,
            )
            # The FFN branch draws its own, independent subset.
            indices_2 = (torch.randperm(b, device=x.device))[:sample_subset_size]
            x_subset_2 = x_attn[indices_2]
            residual_2 = self.mlp(self.norm2(x_subset_2))
            x_ffn = torch.index_add(
                x_attn,
                dim=0,
                source=self.ls2(residual_2),
                index=indices_2,
                alpha=residual_scale_factor,
            )
        else:
            # Eval mode or no sample dropping: plain residual connections.
            x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope))
            x_ffn = x_attn + self.ls2(self.mlp(self.norm2(x_attn)))
        return x_ffn
    def _forward_list(self, x_list: List[Tensor], rope_list=None) -> List[Tensor]:
        """
        This list operator concatenates the tokens from the list of inputs together to save
        on the elementwise operations. Torch-compile memory-planning allows hiding the overhead
        related to concat ops.

        ``rope_list`` is zipped with ``x_list`` in the non-dropping branch, so it must be a
        list (one entry per input, entries may be None) there; ``forward`` always passes one.
        """
        b_list = [x.shape[0] for x in x_list]
        # Per-input subset sizes and residual scale factors, as in ``_forward``.
        sample_subset_sizes = [
            max(int(b * (1 - self.sample_drop_ratio)), 1) for b in b_list
        ]
        residual_scale_factors = [
            b / sample_subset_size
            for b, sample_subset_size in zip(b_list, sample_subset_sizes)
        ]
        if self.training and self.sample_drop_ratio > 0.0:
            # One random subset per input for the attention branch.
            indices_1_list = [
                (torch.randperm(b, device=x.device))[:sample_subset_size]
                for x, b, sample_subset_size in zip(x_list, b_list, sample_subset_sizes)
            ]
            x_subset_1_list = [
                x[indices_1] for x, indices_1 in zip(x_list, indices_1_list)
            ]
            if rope_list is not None:
                rope_subset_list = [
                    self._maybe_index_rope(rope, indices_1)
                    for rope, indices_1 in zip(rope_list, indices_1_list)
                ]
            else:
                rope_subset_list = rope_list
            # norm1 runs once on the concatenation of all subsets.
            flattened, shapes, num_tokens = cat_keep_shapes(x_subset_1_list)
            norm1 = uncat_with_shapes(self.norm1(flattened), shapes, num_tokens)
            residual_1_list = self.attn.forward_list(norm1, rope_list=rope_subset_list)
            x_attn_list = [
                torch.index_add(
                    x,
                    dim=0,
                    source=self.ls1(residual_1),
                    index=indices_1,
                    alpha=residual_scale_factor,
                )
                for x, residual_1, indices_1, residual_scale_factor in zip(
                    x_list, residual_1_list, indices_1_list, residual_scale_factors
                )
            ]
            # Fresh, independent subsets for the FFN branch.
            indices_2_list = [
                (torch.randperm(b, device=x.device))[:sample_subset_size]
                for x, b, sample_subset_size in zip(x_list, b_list, sample_subset_sizes)
            ]
            x_subset_2_list = [
                x[indices_2] for x, indices_2 in zip(x_attn_list, indices_2_list)
            ]
            flattened, shapes, num_tokens = cat_keep_shapes(x_subset_2_list)
            norm2_flat = self.norm2(flattened)
            norm2_list = uncat_with_shapes(norm2_flat, shapes, num_tokens)
            residual_2_list = self.mlp.forward_list(norm2_list)
            x_ffn = [
                torch.index_add(
                    x_attn,
                    dim=0,
                    source=self.ls2(residual_2),
                    index=indices_2,
                    alpha=residual_scale_factor,
                )
                for x_attn, residual_2, indices_2, residual_scale_factor in zip(
                    x_attn_list, residual_2_list, indices_2_list, residual_scale_factors
                )
            ]
        else:
            # Eval mode or no sample dropping: each input is processed on its own.
            x_out = []
            for x, rope in zip(x_list, rope_list):
                x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope))
                x_ffn = x_attn + self.ls2(self.mlp(self.norm2(x_attn)))
                x_out.append(x_ffn)
            x_ffn = x_out
        return x_ffn
    def forward(self, x_or_x_list, rope_or_rope_list=None) -> List[Tensor]:
        """Apply the block to a tensor or to a list of tensors.

        A tensor input is wrapped into a one-element list, run through the list
        implementation, and a single tensor is returned (despite the annotation).
        A list input returns a list; a missing rope becomes one None per input.
        Any other input type raises ``AssertionError``.
        """
        if isinstance(x_or_x_list, Tensor):
            # for reference:
            # return self._forward(x_or_x_list, rope=rope_or_rope_list)
            # in order to match implementations we call the list op:
            return self._forward_list([x_or_x_list], rope_list=[rope_or_rope_list])[0]
        elif isinstance(x_or_x_list, list):
            if rope_or_rope_list is None:
                rope_or_rope_list = [None for x in x_or_x_list]
            # return [self._forward(x, rope=rope) for x, rope in zip(x_or_x_list, rope_or_rope_list)]
            return self._forward_list(x_or_x_list, rope_list=rope_or_rope_list)
        else:
            raise AssertionError
class CausalSelfAttentionBlock(nn.Module):
    """Pre-norm transformer block built from ``CausalSelfAttention`` and ``Mlp``.

    Args:
        dim: Embedding size.
        num_heads: Number of attention heads.
        ffn_ratio: FFN hidden size as a multiple of ``dim``.
        ls_init_value: LayerScale initial value; when falsy both scaling
            layers are ``nn.Identity``.
        is_causal: Passed to the attention module on every forward call.
        act_layer: Activation factory for the FFN.
        norm_layer: Factory called with ``dim`` for both normalisation layers.
        dropout_prob: Dropout probability used by attention and FFN.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        ffn_ratio: float = 4.0,
        ls_init_value: Optional[float] = None,
        is_causal: bool = True,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        dropout_prob: float = 0.0,
    ):
        super().__init__()
        self.dim = dim
        self.is_causal = is_causal

        def _make_layer_scale() -> nn.Module:
            if ls_init_value:
                return LayerScale(dim, init_values=ls_init_value)
            return nn.Identity()

        # Attention branch.
        self.ls1 = _make_layer_scale()
        self.attention_norm = norm_layer(dim)
        self.attention = CausalSelfAttention(
            dim, num_heads, attn_drop=dropout_prob, proj_drop=dropout_prob
        )

        # Feed-forward branch.
        self.ffn_norm = norm_layer(dim)
        self.feed_forward = Mlp(
            in_features=dim,
            hidden_features=int(dim * ffn_ratio),
            drop=dropout_prob,
            act_layer=act_layer,
        )
        self.ls2 = _make_layer_scale()

    def init_weights(
        self,
        init_attn_std: float | None = None,
        init_proj_std: float | None = None,
        init_fc_std: float | None = None,
        factor: float = 1.0,
    ) -> None:
        """Initialise the attention and FFN weights and reset both norms.

        Args:
            init_attn_std: Std for the QKV weight; ``dim ** -0.5`` when falsy.
            init_proj_std: Std for the attention output projection and for
                ``fc2``; ``init_attn_std * factor`` when falsy.
            init_fc_std: Std for ``fc1``; ``(2 * dim) ** -0.5`` when falsy.
            factor: Multiplier used only for the ``init_proj_std`` fallback.
        """
        if not init_attn_std:
            init_attn_std = self.dim**-0.5
        if not init_proj_std:
            init_proj_std = init_attn_std * factor
        if not init_fc_std:
            init_fc_std = (2 * self.dim) ** -0.5

        self.attention.init_weights(init_attn_std, init_proj_std)
        self.attention_norm.reset_parameters()

        ffn = self.feed_forward
        nn.init.normal_(ffn.fc1.weight, std=init_fc_std)
        nn.init.normal_(ffn.fc2.weight, std=init_proj_std)
        self.ffn_norm.reset_parameters()

    def forward(
        self,
        x: torch.Tensor,
    ):
        """Apply the attention branch, then the FFN branch, each as a residual."""
        attn_out = self.attention(self.attention_norm(x), self.is_causal)
        x_attn = x + self.ls1(attn_out)
        ffn_out = self.feed_forward(self.ffn_norm(x_attn))
        return x_attn + self.ls2(ffn_out)

hf_src/layers/rope_position_encoding.py ADDED Viewed

	@@ -0,0 +1,184 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+import math
+from typing import Literal
+import numpy as np
+import torch
+from torch import Tensor, nn
+# RoPE positional embedding with no mixing of coordinates (axial) and no learnable weights
+# Supports two parametrizations of the rope parameters: either using `base` or `min_period` and `max_period`.
+class RopePositionEmbedding(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        *,
+        num_heads: int,
+        base: float | None = 100.0,
+        min_period: float | None = None,
+        max_period: float | None = None,
+        normalize_coords: Literal["min", "max", "separate"] = "separate",
+        shift_coords: float | None = None,
+        jitter_coords: float | None = None,
+        rescale_coords: float | None = None,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+    ):
+        super().__init__()
+        assert embed_dim % (4 * num_heads) == 0
+        both_periods = min_period is not None and max_period is not None
+        if (base is None and not both_periods) or (base is not None and both_periods):
+            raise ValueError(
+                "Either `base` or `min_period`+`max_period` must be provided."
+            )
+        D_head = embed_dim // num_heads
+        self.base = base
+        self.min_period = min_period
+        self.max_period = max_period
+        self.D_head = D_head
+        self.normalize_coords = normalize_coords
+        self.shift_coords = shift_coords
+        self.jitter_coords = jitter_coords
+        self.rescale_coords = rescale_coords
+        # Needs persistent=True because we do teacher.load_state_dict(student.state_dict()) to initialize the teacher
+        self.dtype = dtype  # Don't rely on self.periods.dtype
+        self.register_buffer(
+            "periods",
+            torch.empty(D_head // 4, device=device, dtype=dtype),
+            persistent=True,
+        )
+        self._init_weights()
+    def forward(self, *, H: int, W: int) -> tuple[Tensor, Tensor]:
+        device = self.periods.device
+        dtype = self.dtype
+        dd = {"device": device, "dtype": dtype}
+        # Prepare coords in range [-1, +1]
+        if self.normalize_coords == "max":
+            max_HW = max(H, W)
+            coords_h = torch.arange(0.5, H, **dd) / max_HW  # [H]
+            coords_w = torch.arange(0.5, W, **dd) / max_HW  # [W]
+        elif self.normalize_coords == "min":
+            min_HW = min(H, W)
+            coords_h = torch.arange(0.5, H, **dd) / min_HW  # [H]
+            coords_w = torch.arange(0.5, W, **dd) / min_HW  # [W]
+        elif self.normalize_coords == "separate":
+            coords_h = torch.arange(0.5, H, **dd) / H  # [H]
+            coords_w = torch.arange(0.5, W, **dd) / W  # [W]
+        else:
+            raise ValueError(f"Unknown normalize_coords: {self.normalize_coords}")
+        coords = torch.stack(
+            torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1
+        )  # [H, W, 2]
+        coords = coords.flatten(0, 1)  # [HW, 2]
+        coords = 2.0 * coords - 1.0  # Shift range [0, 1] to [-1, +1]
+        # Shift coords by adding a uniform value in [-shift, shift]
+        if self.training and self.shift_coords is not None:
+            shift_hw = torch.empty(2, **dd).uniform_(
+                -self.shift_coords, self.shift_coords
+            )
+            coords += shift_hw[None, :]
+        # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter]
+        if self.training and self.jitter_coords is not None:
+            jitter_max = np.log(self.jitter_coords)
+            jitter_min = -jitter_max
+            jitter_hw = torch.empty(2, **dd).uniform_(jitter_min, jitter_max).exp()
+            coords *= jitter_hw[None, :]
+        # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale]
+        if self.training and self.rescale_coords is not None:
+            rescale_max = np.log(self.rescale_coords)
+            rescale_min = -rescale_max
+            rescale_hw = torch.empty(1, **dd).uniform_(rescale_min, rescale_max).exp()
+            coords *= rescale_hw
+        # Prepare angles and sin/cos
+        angles = (
+            2 * math.pi * coords[:, :, None] / self.periods[None, None, :]
+        )  # [HW, 2, D//4]
+        angles = angles.flatten(1, 2)  # [HW, D//2]
+        angles = angles.tile(2)  # [HW, D]
+        cos = torch.cos(angles)  # [HW, D]
+        sin = torch.sin(angles)  # [HW, D]
+        return sin, cos  # 2 * [HW, D]
+    def _init_weights(self):
+        device = self.periods.device
+        dtype = self.dtype
+        if self.base is not None:
+            periods = self.base ** (
+                2
+                * torch.arange(self.D_head // 4, device=device, dtype=dtype)
+                / (self.D_head // 2)
+            )  # [D//4]
+        else:
+            base = self.max_period / self.min_period
+            exponents = torch.linspace(
+                0, 1, self.D_head // 4, device=device, dtype=dtype
+            )  # [D//4] range [0, 1]
+            periods = base**exponents  # range [1, max_period / min_period]
+            periods = periods / base  # range [min_period / max_period, 1]
+            periods = periods * self.max_period  # range [min_period, max_period]
+        self.periods.data = periods
if __name__ == "__main__":
    # Visual sanity check: plot one sine component of the embedding along the
    # height axis for two bases and two grid sizes.
    import torch
    import numpy as np
    import matplotlib.pyplot as plt

    def get_rope_values(H, W, embed_dim, num_heads, base):
        """Return an [H, W] array of sin(2*pi*h/period) for the period at index 3."""
        head_dim = embed_dim // num_heads
        quarter = head_dim // 4
        half = head_dim // 2
        print(quarter, half, quarter / half)

        # Same period formula as the `base` parametrisation; the component at
        # index 3 (the fourth period) is the one being plotted.
        all_periods = base ** (2 * torch.arange(quarter) / half)
        period = all_periods[3]

        # Patch-centre coordinates, normalised per axis and mapped to [-1, 1].
        rows = torch.arange(0.5, H) / H
        cols = torch.arange(0.5, W) / W
        grid_h, grid_w = torch.meshgrid(rows, cols, indexing="ij")
        grid_h = 2.0 * grid_h - 1.0
        grid_w = 2.0 * grid_w - 1.0

        # Only the height coordinate enters the plotted value.
        return torch.sin(2 * np.pi * grid_h / period).numpy()

    # Settings
    embed_dim = 768
    num_heads = 12
    bases = [100, 10000]
    sizes = [14, 28]

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # One row of panels per base, one column per grid size.
    for axes_row, base in zip(axes, bases):
        for ax, size in zip(axes_row, sizes):
            vals = get_rope_values(size, size, embed_dim, num_heads, base)
            im = ax.imshow(vals, cmap="RdBu", extent=[-1, 1, -1, 1])
            ax.set_title(f"Base: {base} | Grid: {size}x{size}")
            ax.set_xlabel("Width (Normalized)")
            ax.set_ylabel("Height (Normalized)")
            plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    plt.tight_layout()
    plt.show()

hf_src/layers/sparse_linear.py ADDED Viewed

	@@ -0,0 +1,94 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+import logging
+from typing import Callable
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import xformers.ops as xops
+from hf_src.utils import named_apply, named_replace
class LinearW24(torch.nn.Linear):
    """``nn.Linear`` that can switch to a 2:4-sparsified weight at run time.

    While ``sparsity_enabled`` is False (the initial state) the layer behaves
    exactly like ``nn.Linear``. When it is True, the weight is passed through
    ``xops.sparsify24`` on every forward call.
    """

    ALGO = "largest_abs_values_greedy"

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.sparsity_enabled = False

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the (optionally sparsified) linear map to ``input``."""
        if not self.sparsity_enabled:
            return super().forward(input)

        leading_shape = input.shape[:-1]
        rows = input.flatten(end_dim=-2)
        num_rows = rows.shape[0]
        if num_rows % 8 != 0:
            # Pad the row count up to the next multiple of 8; the extra rows
            # are cut off again below.
            # NOTE: This should be torch-compiled away
            rows = F.pad(rows, [0, 0, 0, -num_rows % 8])

        w_sparse = xops.sparsify24(
            self.weight,
            algo=self.ALGO,
            gradient="ste",
            backend="cusparselt",
        )
        out = F.linear(rows, w_sparse, self.bias)
        return out[:num_rows].unflatten(dim=0, sizes=leading_shape)
+def replace_linears_with_sparse_linear(
+    root_module: nn.Module, *, filter_fn: Callable[[str], bool]
+) -> nn.Module:
+    """Swap selected ``nn.Linear`` submodules for ``LinearW24`` instances.
+    Args:
+        root_module: module tree handed to ``named_replace``.
+        filter_fn: called with the dotted submodule name; only linears for
+            which it returns True are replaced.
+    Returns:
+        Whatever ``named_replace`` returns for ``root_module``.
+    Raises:
+        AssertionError: if a matching module is a subclass of ``nn.Linear``,
+            or if no layer at all was replaced.
+    """
+    replaced = 0
+    # ``named_replace`` invokes the callback with ``module=`` / ``name=``
+    # keywords, so these parameter names must stay as they are.
+    def _to_sparse(module: nn.Module, name: str) -> nn.Module:
+        nonlocal replaced
+        if not (isinstance(module, nn.Linear) and filter_fn(name)):
+            return module
+        assert type(module) is nn.Linear, "Subtypes not supported"
+        sparse = LinearW24(
+            in_features=module.in_features,
+            out_features=module.out_features,
+            bias=module.bias is not None,
+            dtype=module.weight.dtype,
+            device=module.weight.device,
+        )
+        # Reuse the original parameter objects rather than copying values.
+        sparse.weight = module.weight
+        sparse.bias = module.bias
+        replaced += 1
+        return sparse
+    result = named_replace(_to_sparse, root_module)
+    assert replaced > 0, "2:4 sparsity: no layer found to sparsify"
+    return result
+def update_24sparsity(root_module: nn.Module, enabled: bool) -> int:
+    """Set ``sparsity_enabled`` on every ``LinearW24`` visited under ``root_module``.
+    After toggling, the dynamo code caches and the inductor cudagraph trees
+    are reset so compiled code is rebuilt.
+    Returns:
+        Number of ``LinearW24`` modules whose flag was set.
+    """
+    toggled = 0
+    # ``named_apply`` calls the visitor with ``module=`` / ``name=`` keywords.
+    def _toggle(module: nn.Module, name: str) -> nn.Module:
+        nonlocal toggled
+        if isinstance(module, LinearW24):
+            toggled += 1
+            module.sparsity_enabled = enabled
+        return module
+    named_apply(_toggle, root_module)
+    # Force re-compile everything
+    torch._dynamo.reset_code_caches()
+    from torch._inductor.cudagraph_trees import reset_cudagraph_trees
+    reset_cudagraph_trees()
+    return toggled

hf_src/layers/swiglu_ffn.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import os
+from typing import Callable, Optional
+from torch import Tensor, nn
+import torch.nn.functional as F
+class SwiGLUFFN(nn.Module):
+    """SwiGLU feed-forward layer: ``w3(silu(x1) * x2)`` where ``x1, x2`` are the two halves of ``w12(x)``.
+    ``act_layer`` and ``drop`` are accepted for signature compatibility but
+    are not used by this implementation.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        # Falsy widths (None or 0) fall back to the input width.
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        # One fused projection produces both the gate and the value halves.
+        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+    def forward(self, x: Tensor) -> Tensor:
+        """Project, gate with SiLU, and project back."""
+        gate, value = self.w12(x).chunk(2, dim=-1)
+        return self.w3(F.silu(gate) * value)
+# xFormers is used unless the XFORMERS_DISABLED environment variable is set
+# (to any value) or the package cannot be imported.
+XFORMERS_ENABLED = "XFORMERS_DISABLED" not in os.environ
+# Default to the pure-PyTorch implementation defined above.
+SwiGLU = SwiGLUFFN
+XFORMERS_AVAILABLE = False
+if XFORMERS_ENABLED:
+    try:
+        from xformers.ops import SwiGLU
+        XFORMERS_AVAILABLE = True
+    except ImportError:
+        pass
+class SwiGLUFFNFused(SwiGLU):
+    """``SwiGLU`` variant whose hidden width is scaled by 2/3 and rounded up to a multiple of 8.
+    ``act_layer`` and ``drop`` are accepted but not forwarded to the parent class.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        if not out_features:
+            out_features = in_features
+        requested_hidden = hidden_features or in_features
+        # Scale by 2/3, then round up to the next multiple of 8.
+        scaled_hidden = int(requested_hidden * 2 / 3)
+        hidden_features = (scaled_hidden + 7) // 8 * 8
+        super().__init__(
+            in_features=in_features,
+            hidden_features=hidden_features,
+            out_features=out_features,
+            bias=bias,
+        )

hf_src/model/__init__.py ADDED Viewed

File without changes

hf_src/model/image/__init__.py ADDED Viewed

File without changes

hf_src/model/image/vitv2/__init__.py ADDED Viewed

File without changes

hf_src/model/image/vitv2/transformer.py ADDED Viewed

	@@ -0,0 +1,475 @@

+# Adapted from: https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py
+# References:
+#   https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import math
+from functools import partial
+from typing import Sequence, Tuple, Union, Callable
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn.init import trunc_normal_
+from torch.nn.functional import interpolate
+from hf_src.layers import (
+    Mlp,
+    PatchEmbed,
+    SwiGLUFFNFused,
+    MemEffAttention,
+    NestedTensorBlock as Block,
+    LayerScale,
+    RMSNorm,
+)
+def named_apply(
+    fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False
+) -> nn.Module:
+    """Recursively call ``fn(module=..., name=...)`` over the module tree.
+    Args:
+        fn: callback invoked with keyword arguments ``module`` and ``name``.
+        module: root of the tree to walk.
+        name: dotted prefix used to build the child names.
+        depth_first: if True a module is visited after its children,
+            otherwise before them.
+        include_root: whether ``fn`` is also applied to ``module`` itself;
+            descendants are always visited.
+    Returns:
+        The ``module`` that was passed in.
+    """
+    visit_before_children = include_root and not depth_first
+    visit_after_children = include_root and depth_first
+    if visit_before_children:
+        fn(module=module, name=name)
+    for child_key, child in module.named_children():
+        qualified = f"{name}.{child_key}" if name else child_key
+        named_apply(
+            fn=fn,
+            module=child,
+            name=qualified,
+            depth_first=depth_first,
+            include_root=True,
+        )
+    if visit_after_children:
+        fn(module=module, name=name)
+    return module
+class BlockChunk(nn.ModuleList):
+    """Sequentially applied list of blocks whose last member may also return attention."""
+    def forward(self, x, return_attention=False):
+        """Run ``x`` through every member in order.
+        Only the final member receives ``return_attention``; its return value
+        is returned as is. An empty chunk returns ``x`` unchanged.
+        """
+        # Adaptation for returning attentions
+        members = list(self)
+        if not members:
+            return x
+        *leading, final = members
+        for blk in leading:
+            x = blk(x)
+        return final(x, return_attention=return_attention)
+class ViTv2(nn.Module):
+    def __init__(
+        self,
+        *,
+        img_size=518,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        ffn_bias=True,
+        proj_bias=True,
+        drop_path_rate=0.0,
+        drop_path_uniform=True,
+        init_values=None,  # for layerscale: None or 0 => no layerscale
+        embed_layer=PatchEmbed,
+        act_layer=nn.GELU,
+        block_fn=Block,
+        ffn_layer="mlp",
+        block_chunks=0,
+        num_register_tokens=0,
+        interpolate_antialias=False,
+        interpolate_offset=0.1,
+        num_classes=None,
+        **ignored_kwargs,
+    ):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            proj_bias (bool): enable bias for proj in attn if True
+            ffn_bias (bool): enable bias for ffn if True
+            drop_path_rate (float): stochastic depth rate
+            drop_path_uniform (bool): apply uniform drop rate across blocks
+            init_values (float): layer-scale init values
+            embed_layer (nn.Module): patch embedding layer
+            act_layer (nn.Module): MLP activation layer
+            block_fn (nn.Module): transformer block class
+            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
+            interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
+            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
+            num_classes: (int, optional) when given, ``self.head`` is a linear layer from
+                ``embed_dim`` to ``num_classes``; otherwise it is ``nn.Identity``
+            **ignored_kwargs: extra keyword arguments, accepted and discarded
+        Raises:
+            NotImplementedError: if ``ffn_layer`` is not one of the supported names.
+        """
+        # nn.Module.__init__ raises TypeError on unexpected keyword arguments,
+        # so the extra kwargs must be dropped here rather than forwarded.
+        super().__init__()
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.img_size = img_size
+        self.num_features = self.embed_dim = embed_dim
+        self.num_tokens = 1
+        self.n_blocks = depth
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.num_register_tokens = num_register_tokens
+        self.interpolate_antialias = interpolate_antialias
+        self.interpolate_offset = interpolate_offset
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+        num_patches = self.patch_embed.num_patches
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        # One positional embedding per patch plus one for the class token.
+        self.pos_embed = nn.Parameter(
+            torch.zeros(1, num_patches + self.num_tokens, embed_dim)
+        )
+        assert num_register_tokens >= 0
+        self.register_tokens = (
+            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim))
+            if num_register_tokens
+            else None
+        )
+        if drop_path_uniform is True:
+            dpr = [drop_path_rate] * depth
+        else:
+            dpr = [
+                x.item() for x in torch.linspace(0, drop_path_rate, depth)
+            ]  # stochastic depth decay rule
+        # Resolve the string name to the FFN factory handed to each block.
+        if ffn_layer == "mlp":
+            ffn_layer = Mlp
+        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+            ffn_layer = SwiGLUFFNFused
+        elif ffn_layer == "identity":
+            def f(*args, **kwargs):
+                return nn.Identity()
+            ffn_layer = f
+        else:
+            raise NotImplementedError
+        blocks_list = [
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                ffn_layer=ffn_layer,
+                init_values=init_values,
+            )
+            for i in range(depth)
+        ]
+        if block_chunks > 0:
+            self.chunked_blocks = True
+            chunked_blocks = []
+            chunksize = depth // block_chunks
+            for i in range(0, depth, chunksize):
+                # this is to keep the block index consistent if we chunk the block list
+                chunked_blocks.append(
+                    [nn.Identity()] * i + blocks_list[i : i + chunksize]
+                )
+            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+        else:
+            self.chunked_blocks = False
+            self.blocks = nn.ModuleList(blocks_list)
+        # No learnable mask token is created by this constructor.
+        self.mask_token = None
+        self.norm = norm_layer(embed_dim)
+        # When None, patch tokens are normalised with ``self.norm``.
+        self.norm_patch = None
+        self.head = (
+            nn.Identity() if num_classes is None else nn.Linear(embed_dim, num_classes)
+        )
+        # Initialize the model's weights
+        self.init_weights()
+    def init_weights(self):
+        """Initialise the embeddings and tokens, then every submodule via ``init_weights_vit``."""
+        trunc_normal_(self.pos_embed, std=0.02)
+        # Class token first, then the optional register tokens, both with a tiny std.
+        for tokens in (self.cls_token, self.register_tokens):
+            if tokens is not None:
+                nn.init.normal_(tokens, std=1e-6)
+        if self.mask_token is not None:
+            nn.init.zeros_(self.mask_token)
+        named_apply(init_weights_vit, self)
+    def interpolate_pos_encoding(self, x, w, h):
+        """Return positional embeddings matching the token grid of the current input.
+        Args:
+            x: token tensor with the class token already prepended; its second
+                dimension is ``1 + number of patches``.
+            w, h: the two trailing sizes of the input image batch, in the order
+                the caller unpacks them from ``x.shape``.
+        Returns:
+            ``self.pos_embed`` unchanged when the patch count matches and
+            ``w == h``; otherwise the class embedding concatenated with the
+            bicubically resized patch embeddings, cast to the dtype of ``x``.
+        """
+        previous_dtype = x.dtype
+        npatch = x.shape[1] - 1
+        N = self.pos_embed.shape[1] - 1
+        if npatch == N and w == h:
+            return self.pos_embed
+        # Interpolate in float32 and cast back at the end.
+        pos_embed = self.pos_embed.float()
+        class_pos_embed = pos_embed[:, 0]
+        patch_pos_embed = pos_embed[:, 1:]
+        dim = x.shape[-1]
+        w0 = w // self.patch_size
+        h0 = h // self.patch_size
+        # we add a small number to avoid floating point error in the interpolation
+        # see discussion at https://github.com/facebookresearch/dino/issues/8
+        w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
+        # The stored patch embeddings are treated as a square sqrt(N) x sqrt(N) grid.
+        sqrt_N = math.sqrt(N)
+        sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
+        patch_pos_embed = interpolate(
+            patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(
+                0, 3, 1, 2
+            ),
+            scale_factor=(sx, sy),
+            mode="bicubic",
+            # NOTE(review): ``self.interpolate_antialias`` is stored by the
+            # constructor but not passed here, so the flag currently has no
+            # effect — confirm whether this is intentional.
+            # antialias=self.interpolate_antialias,
+        )
+        # The resized grid must have exactly the expected number of rows/columns.
+        assert int(w0) == patch_pos_embed.shape[-2]
+        assert int(h0) == patch_pos_embed.shape[-1]
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(
+            previous_dtype
+        )
+    def prepare_tokens_with_masks(self, x, masks=None):
+        """Turn an image batch into the token sequence consumed by the blocks.
+        Steps: patch-embed, optionally replace masked patches with the mask
+        token, prepend the class token, add positional embeddings, then insert
+        the register tokens (if any) right after the class token.
+        Args:
+            x: 4-D image batch; its two trailing sizes are forwarded to
+                ``interpolate_pos_encoding``.
+            masks: optional boolean tensor selecting the patch tokens to replace.
+        Raises:
+            ValueError: if ``masks`` is given but ``self.mask_token`` is None.
+        """
+        _, _, w, h = x.shape
+        x = self.patch_embed(x)
+        if masks is not None:
+            if self.mask_token is None:
+                # Fail with a readable message instead of an AttributeError on None.
+                raise ValueError(
+                    "masks were provided but this model has no mask_token"
+                )
+            x = torch.where(
+                masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x
+            )
+        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+        x = x + self.interpolate_pos_encoding(x, w, h)
+        if self.register_tokens is not None:
+            # Registers receive no positional embedding: they are inserted
+            # after the embeddings have been added.
+            x = torch.cat(
+                (
+                    x[:, :1],
+                    self.register_tokens.expand(x.shape[0], -1, -1),
+                    x[:, 1:],
+                ),
+                dim=1,
+            )
+        return x
+    def forward_features_list(self, x_list, masks_list):
+        """Run a list of image batches through the model and return one feature dict per entry.
+        Each dict holds the normalised class token (``latent``), the normalised
+        patch tokens (``patch_latent``) and the un-normalised class token
+        (``raw_latent``).
+        """
+        tokens_list = [
+            self.prepare_tokens_with_masks(images, masks)
+            for images, masks in zip(x_list, masks_list)
+        ]
+        # Each block is called with the whole list at once.
+        for blk in self.blocks:
+            tokens_list = blk(tokens_list)
+        num_prefix = self.num_register_tokens + 1
+        results = []
+        for tokens, _ in zip(tokens_list, masks_list):
+            prefix_tokens = self.norm(tokens[:, :num_prefix])
+            patch_part = tokens[:, num_prefix:]
+            if self.norm_patch is None:
+                patch_tokens = self.norm(patch_part)
+            else:
+                patch_tokens = self.norm_patch(patch_part)
+            features = {
+                "latent": prefix_tokens[:, 0],
+                "patch_latent": patch_tokens,
+                "raw_latent": tokens[:, 0],
+            }
+            results.append(features)
+        return results
+    def forward_features(self, x, masks=None, last_self_attention=False):
+        """Compute class/patch features for one image batch (or a list of batches).
+        Args:
+            x: image batch tensor, or a list/tuple of such tensors, in which
+                case the call is delegated to ``forward_features_list``.
+            masks: optional patch masks; for list input, a list of the same
+                length (``None`` means no masking for any entry).
+            last_self_attention: if True the last block is asked to return its
+                attention as well.
+        Returns:
+            For tensor input, a dict with ``latent``, ``patch_latent``,
+            ``raw_latent``, ``last_self_attention`` (None unless requested) and
+            ``logits``. For list input, whatever ``forward_features_list`` returns.
+        """
+        if isinstance(x, (list, tuple)):
+            if masks is None:
+                # zip() over None would raise; treat "no masks" as one None per input.
+                masks = [None] * len(x)
+            return self.forward_features_list(x, masks)
+        x = self.prepare_tokens_with_masks(x, masks)
+        for i, blk in enumerate(self.blocks):
+            if i < len(self.blocks) - 1:
+                x = blk(x)
+            else:
+                x = blk(x, return_attention=last_self_attention)
+        attn = None
+        if last_self_attention:
+            x, attn = x
+            # Attention is selected from the cls token to the patch tokens only
+            # Thus, we ignore the cls from the patch tokens (i.e., start from 1)
+            attn = attn[:, :, 0, self.num_register_tokens + 1 :]
+        cls_tokens = self.norm(x[:, : self.num_register_tokens + 1])
+        if self.norm_patch is None:
+            patch_tokens = self.norm(x[:, self.num_register_tokens + 1 :])
+        else:
+            patch_tokens = self.norm_patch(x[:, self.num_register_tokens + 1 :])
+        return {
+            "latent": cls_tokens[:, 0],
+            "patch_latent": patch_tokens,
+            "raw_latent": x[:, 0],
+            "last_self_attention": attn,
+            "logits": self.head(cls_tokens[:, 0]),
+        }
+    def forward_head(self, x):
+        """Apply ``self.projection_head`` and, if ``self.l2_norm`` is truthy, L2-normalise along dim 1.
+        NOTE(review): neither ``projection_head`` nor ``l2_norm`` is assigned in
+        this class's constructor — presumably they are set by a subclass or
+        externally; confirm before relying on this method.
+        """
+        # Projection with l2-norm bottleneck
+        projected = self.projection_head(x)
+        if not self.l2_norm:
+            return projected
+        return nn.functional.normalize(projected, dim=1, p=2)
+    def _get_intermediate_layers_not_chunked(self, x, n=1):
+        """Collect the outputs of selected blocks when ``self.blocks`` is a flat list.
+        ``n`` as an int selects the last ``n`` blocks; otherwise it is used as
+        a container of block indices.
+        """
+        x = self.prepare_tokens_with_masks(x)
+        num_blocks = len(self.blocks)
+        if isinstance(n, int):
+            wanted = range(num_blocks - n, num_blocks)
+        else:
+            wanted = n
+        collected = []
+        for idx, blk in enumerate(self.blocks):
+            x = blk(x)
+            if idx in wanted:
+                collected.append(x)
+        assert len(collected) == len(
+            wanted
+        ), f"only {len(collected)} / {len(wanted)} blocks found"
+        return collected
+    def _get_intermediate_layers_chunked(self, x, n=1):
+        """Collect the outputs of selected blocks when ``self.blocks`` holds ``BlockChunk`` groups.
+        ``n`` as an int selects the last ``n`` blocks; otherwise it is used as
+        a container of block indices.
+        """
+        x = self.prepare_tokens_with_masks(x)
+        # The constructor left-pads every chunk with nn.Identity so that block
+        # indices stay global; the length of the last chunk is therefore used
+        # as the total block count.
+        output, i, total_block_len = [], 0, len(self.blocks[-1])
+        # If n is an int, take the n last blocks. If it's a list, take them
+        blocks_to_take = (
+            range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        )
+        for block_chunk in self.blocks:
+            # ``i`` is the running global block index; slicing from it skips
+            # the nn.Identity() padding at the front of this chunk.
+            for blk in block_chunk[i:]:  # Passing the nn.Identity()
+                x = blk(x)
+                if i in blocks_to_take:
+                    output.append(x)
+                i += 1
+        assert len(output) == len(
+            blocks_to_take
+        ), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def get_intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
+        reshape: bool = False,
+        return_class_token: bool = False,
+        norm=True,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+        """Return patch tokens (and optionally class tokens) from selected blocks.
+        Args:
+            x: image batch.
+            n: number of last blocks to take, or the block indices to take.
+            reshape: if True, patch tokens are reshaped to
+                ``(B, C, w // patch_size, h // patch_size)``.
+            return_class_token: if True, each entry is a
+                ``(patch_tokens, class_token)`` pair.
+            norm: if True the final norm layers are applied to the tokens.
+        Returns:
+            A tuple with one entry per selected block.
+        """
+        if self.chunked_blocks:
+            raw_outputs = self._get_intermediate_layers_chunked(x, n)
+        else:
+            raw_outputs = self._get_intermediate_layers_not_chunked(x, n)
+        # Class token plus registers sit in front of the patch tokens.
+        num_prefix = 1 + self.num_register_tokens
+        if norm:
+            patch_norm = self.norm if self.norm_patch is None else self.norm_patch
+            class_tokens = [self.norm(out[:, :num_prefix])[:, 0] for out in raw_outputs]
+            outputs = [patch_norm(out[:, num_prefix:]) for out in raw_outputs]
+        else:
+            class_tokens = [out[:, 0] for out in raw_outputs]
+            outputs = [out[:, num_prefix:] for out in raw_outputs]
+        if reshape:
+            B, _, w, h = x.shape
+            grid_w, grid_h = w // self.patch_size, h // self.patch_size
+            outputs = [
+                out.reshape(B, grid_w, grid_h, -1).permute(0, 3, 1, 2).contiguous()
+                for out in outputs
+            ]
+        if return_class_token:
+            return tuple(zip(outputs, class_tokens))
+        return tuple(outputs)
+    def forward(self, xs, masks=None, last_self_attention=False, **kwargs):
+        """Dispatch to the single-batch or list feature path; extra ``kwargs`` are ignored.
+        A list or tuple input goes to ``forward_features_list`` (with one
+        ``None`` mask per entry when ``masks`` is None); anything else goes to
+        ``forward_features``.
+        """
+        if isinstance(xs, (list, tuple)):
+            per_input_masks = [None] * len(xs) if masks is None else masks
+            return self.forward_features_list(xs, per_input_masks)
+        return self.forward_features(xs, masks, last_self_attention)
+    def forward_backbone(self, x, last_self_attention=False):
+        """Return the normalised class token stacked in front of the normalised patch tokens.
+        When ``last_self_attention`` is True, a ``(tokens, attention)`` pair is
+        returned instead.
+        """
+        features = self.forward_features(x, last_self_attention=last_self_attention)
+        # Combine the cls token and the patch tokens
+        tokens = torch.cat(
+            (features["latent"].unsqueeze(1), features["patch_latent"]), dim=1
+        )
+        if not last_self_attention:
+            return tokens
+        return tokens, features["last_self_attention"]
+    def get_last_selfattention(self, x, masks=None):
+        """
+        Return the attention produced by the last block for one image batch.
+        Adapted from https://gitlab.com/ziegleto-machine-learning/dino/-/tree/main/
+        Raises:
+            NotImplementedError: if ``x`` is a list.
+        """
+        if isinstance(x, list):
+            raise NotImplementedError("Not implemented for list of inputs")
+        x = self.prepare_tokens_with_masks(x, masks)
+        # Run through model, at the last block just return the attention.
+        final_idx = len(self.blocks) - 1
+        for idx, blk in enumerate(self.blocks):
+            if idx == final_idx:
+                _, attn = blk(x, return_attention=True)
+                return attn
+            x = blk(x)
+def init_weights_vit(module: nn.Module, name: str = ""):
+    """Per-module weight initialiser intended for use with ``named_apply``.
+    Linear layers get truncated-normal weights (std 0.02) and zero biases; if a
+    non-None ``bias_mask`` attribute is present it is set to 1 everywhere
+    except the middle third of the output features. LayerNorm, LayerScale,
+    PatchEmbed and RMSNorm modules are reset through their own
+    ``reset_parameters``.
+    """
+    if isinstance(module, nn.Linear):
+        torch.nn.init.trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+        bias_mask = getattr(module, "bias_mask", None)
+        if bias_mask is not None:
+            out_dim = module.out_features
+            bias_mask.fill_(1)
+            bias_mask[out_dim // 3 : 2 * out_dim // 3].fill_(0)
+    # Checked one type at a time so a module matching several types is reset
+    # once per match.
+    for resettable in (nn.LayerNorm, LayerScale, PatchEmbed, RMSNorm):
+        if isinstance(module, resettable):
+            module.reset_parameters()

hf_src/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+from .dtype import as_torch_dtype
+from .utils import (
+    cat_keep_shapes,
+    count_parameters,
+    fix_random_seeds,
+    get_conda_env,
+    get_sha,
+    named_apply,
+    named_replace,
+    uncat_with_shapes,
+)

hf_src/utils/download.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import os
+import requests
+import hashlib
+from tqdm import tqdm
+def check_sha1(filename, sha1_hash):
+    """Check whether the sha1 hash of the file content matches the expected hash.
+    Parameters
+    ----------
+    filename : str
+        Path to the file.
+    sha1_hash : str
+        Expected sha1 hash in hexadecimal digits.
+    Returns
+    -------
+    bool
+        Whether the file content matches the expected hash.
+    """
+    digest = hashlib.sha1()
+    with open(filename, "rb") as stream:
+        # Hash in 1 MiB blocks so the whole file is never held in memory.
+        for block in iter(lambda: stream.read(1048576), b""):
+            digest.update(block)
+    return digest.hexdigest() == sha1_hash
+def download(url, path=None, overwrite=False, sha1_hash=None):
+    """
+    https://github.com/junfu1115/DANet/blob/master/encoding/utils/files.py
+    Download a given URL
+    Parameters
+    ----------
+    url : str
+        URL to download
+    path : str, optional
+        Destination path to store downloaded file. By default stores to the
+        current directory with same name as in url.
+    overwrite : bool, optional
+        Whether to overwrite destination file if already exists.
+    sha1_hash : str, optional
+        Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
+        but doesn't match.
+    Returns
+    -------
+    str
+        The file path of the downloaded file.
+    Raises
+    ------
+    RuntimeError
+        If the server does not answer with HTTP 200.
+    UserWarning
+        If the downloaded content does not match ``sha1_hash``.
+    """
+    if path is None:
+        fname = url.split("/")[-1]
+    else:
+        path = os.path.expanduser(path)
+        if os.path.isdir(path):
+            fname = os.path.join(path, url.split("/")[-1])
+        else:
+            fname = path
+    if (
+        overwrite
+        or not os.path.exists(fname)
+        or (sha1_hash and not check_sha1(fname, sha1_hash))
+    ):
+        dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
+        # exist_ok avoids a race between the existence check and the creation.
+        os.makedirs(dirname, exist_ok=True)
+        print("Downloading %s from %s..." % (fname, url))
+        # Stream into a temporary sibling file and move it into place only
+        # after the body was fully written, so an interrupted download never
+        # leaves a truncated file that a later call would accept as valid.
+        tmp_name = fname + ".part"
+        try:
+            # The context manager releases the connection even on errors.
+            with requests.get(url, stream=True) as r:
+                if r.status_code != 200:
+                    raise RuntimeError("Failed downloading url %s" % url)
+                total_length = r.headers.get("content-length")
+                with open(tmp_name, "wb") as f:
+                    if total_length is None:  # no content length header
+                        for chunk in r.iter_content(chunk_size=1024):
+                            if chunk:  # filter out keep-alive new chunks
+                                f.write(chunk)
+                    else:
+                        total_length = int(total_length)
+                        for chunk in tqdm(
+                            r.iter_content(chunk_size=1024),
+                            total=int(total_length / 1024.0 + 0.5),
+                            unit="KB",
+                            unit_scale=False,
+                            dynamic_ncols=True,
+                        ):
+                            f.write(chunk)
+            os.replace(tmp_name, fname)
+        finally:
+            # Only present here when the download failed part-way.
+            if os.path.exists(tmp_name):
+                os.remove(tmp_name)
+        if sha1_hash and not check_sha1(fname, sha1_hash):
+            raise UserWarning(
+                "File {} is downloaded but the content hash does not match. "
+                "The repo may be outdated or download may be incomplete. "
+                'If the "repo_url" is overridden, consider switching to '
+                "the default repo.".format(fname)
+            )
+    return fname

hf_src/utils/dtype.py ADDED Viewed

	@@ -0,0 +1,37 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+from typing import Dict, Union
+import numpy as np
+import torch
+# Anything ``as_torch_dtype`` accepts: a dtype name, a numpy dtype or a torch dtype.
+TypeSpec = Union[str, np.dtype, torch.dtype]
+# Lookup table from numpy dtypes to their torch counterparts.
+_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
+    np.dtype("bool"): torch.bool,
+    np.dtype("uint8"): torch.uint8,
+    np.dtype("int8"): torch.int8,
+    np.dtype("int16"): torch.int16,
+    np.dtype("int32"): torch.int32,
+    np.dtype("int64"): torch.int64,
+    np.dtype("float16"): torch.float16,
+    np.dtype("float32"): torch.float32,
+    np.dtype("float64"): torch.float64,
+    np.dtype("complex64"): torch.complex64,
+    np.dtype("complex128"): torch.complex128,
+}
+def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
+    """Convert a dtype specification to a ``torch.dtype``.
+    Args:
+        dtype: a ``torch.dtype`` (returned unchanged), a string understood by
+            ``np.dtype``, or a ``np.dtype``.
+    Returns:
+        The matching torch dtype.
+    Raises:
+        AssertionError: if ``dtype`` is none of the accepted kinds.
+        KeyError: if the numpy dtype has no entry in the lookup table.
+    """
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    if isinstance(dtype, str):
+        dtype = np.dtype(dtype)
+    assert isinstance(
+        dtype, np.dtype
+    ), f"Expected an instance of numpy dtype, got {type(dtype)}"
+    return _NUMPY_TO_TORCH_DTYPE[dtype]

hf_src/utils/masking.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import random
+import torch
+import numpy as np
+def complete_mask_randomly_np(mask, num_masking_patches, rng):
+    """Top up ``mask`` in place with random extra entries until it has ``num_masking_patches`` True values.
+    Args:
+        mask: boolean numpy array, modified in place (any memory layout).
+        num_masking_patches: desired total number of True entries.
+        rng: numpy random generator providing ``choice``.
+    Returns:
+        The same ``mask`` object. It is returned untouched when it already
+        holds at least ``num_masking_patches`` True entries.
+    """
+    missing = num_masking_patches - mask.sum()
+    if missing <= 0:
+        return mask
+    # Flat (C-order) indices of the entries that are still unmasked.
+    available = np.flatnonzero(~mask)
+    chosen = rng.choice(available, size=missing, replace=False)
+    # ``mask.flat`` always writes through to ``mask``; ``reshape(-1)`` would
+    # return a copy for non-contiguous arrays and the update would be lost.
+    mask.flat[chosen] = True
+    return mask
+class IBotMasker:
+    """Block-wise random mask generator over an ``h x w`` patch grid.
+    Rectangles with a random area and aspect ratio are masked repeatedly;
+    whatever is still missing afterwards is filled with individually chosen
+    random patches.
+    """
+    def __init__(
+        self,
+        input_size,
+        num_masking_patches=None,
+        min_num_patches=0,
+        max_num_patches=None,
+        min_aspect=0.3,
+        max_aspect=3.33,
+        max_tries=10,
+    ):
+        # An int means a square grid.
+        grid = (input_size, input_size) if isinstance(input_size, int) else input_size
+        self.h, self.w = grid
+        self.num_patches = self.h * self.w
+        self.min_num_patches = min_num_patches
+        # Stored for reference; ``__call__`` takes its own target count.
+        self.num_masking_patches = num_masking_patches
+        self.max_num_patches = max_num_patches or num_masking_patches
+        # Aspect ratios are sampled uniformly in log space.
+        self.log_min_aspect = np.log(min_aspect)
+        self.log_max_aspect = np.log(max_aspect or 1 / min_aspect)
+        self.max_tries = max_tries
+    def __call__(self, num_masking_patches, starting_mask=None, rng=None):
+        """Return a boolean ``(h, w)`` mask aiming for ``num_masking_patches`` True entries.
+        ``starting_mask`` is copied, never modified. ``rng`` defaults to a
+        fresh ``np.random.default_rng()``.
+        """
+        rng = np.random.default_rng() if rng is None else rng
+        if starting_mask is not None:
+            mask = starting_mask.copy()
+        else:
+            mask = np.zeros((self.h, self.w), dtype=np.bool_)
+        mask_count = mask.sum()
+        while mask_count < num_masking_patches:
+            budget = num_masking_patches - mask_count
+            if self.max_num_patches is not None:
+                budget = min(budget, self.max_num_patches)
+            added = self._mask(mask, budget, rng)
+            if added == 0:
+                # No rectangle could be placed; fall back to random filling.
+                break
+            mask_count += added
+        return complete_mask_randomly_np(mask, num_masking_patches, rng)
+    def _mask(self, mask, max_mask_patches, rng):
+        """Try to mask one rectangle in place; return the number of newly masked patches (0 on failure)."""
+        for _ in range(self.max_tries):
+            target_area = rng.uniform(self.min_num_patches, max_mask_patches)
+            log_aspect = rng.uniform(self.log_min_aspect, self.log_max_aspect)
+            aspect = np.exp(log_aspect)
+            block_h = int(round(np.sqrt(target_area * aspect)))
+            block_w = int(round(np.sqrt(target_area / aspect)))
+            # The rectangle must be non-empty and strictly smaller than the grid.
+            fits = 0 < block_h < self.h and 0 < block_w < self.w
+            if not fits:
+                continue
+            top = rng.integers(0, self.h - block_h + 1)
+            left = rng.integers(0, self.w - block_w + 1)
+            window = mask[top : top + block_h, left : left + block_w]
+            newly = (~window).sum()
+            if 0 < newly <= max_mask_patches:
+                # ``window`` is a view, so this writes into ``mask``.
+                window[:] = True
+                return newly
+        return 0
+def generate_masks(
+    mask_generator, number_of_samples, mask_prob=0.5, per_sample_range=(0.1, 0.5)
+):
+    """Build a shuffled batch of flattened boolean masks.
+    ``int(number_of_samples * mask_prob)`` samples are masked, with ratios
+    spread linearly over ``per_sample_range``; the remaining samples request
+    zero masked patches.
+    Returns:
+        A torch tensor with one flattened mask per sample.
+    """
+    num_masks = int(number_of_samples * mask_prob)
+    num_tokens = mask_generator.num_patches
+    ratios = np.linspace(*per_sample_range, num=num_masks)
+    masks = []
+    for idx in range(number_of_samples):
+        count = int(ratios[idx] * num_tokens) if idx < num_masks else 0
+        masks.append(mask_generator(num_masking_patches=count))
+    # Shuffle so masked and unmasked samples are interleaved at random.
+    random.shuffle(masks)
+    stacked = np.stack(masks, dtype=bool)
+    return torch.from_numpy(stacked).flatten(1, -1)

hf_src/utils/seedlet_masking.py ADDED Viewed

File without changes

hf_src/utils/utils.py ADDED Viewed

	@@ -0,0 +1,136 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This software may be used and distributed in accordance with
+# the terms of the DINOv3 License Agreement.
+import logging
+import os
+import random
+import subprocess
+from typing import Callable, List, Optional, Tuple
+import numpy as np
+import torch
+from torch import Tensor, nn
+logger = logging.getLogger("dinov3")
+def cat_keep_shapes(x_list: List[Tensor]) -> Tuple[Tensor, List[Tuple[int]], List[int]]:
+    """Flatten each tensor to 2-D (keeping the last dim) and concatenate along dim 0.
+    Returns:
+        The concatenated tensor, the original shapes, and the number of rows
+        each input contributed.
+    """
+    shapes, num_tokens, rows = [], [], []
+    for x in x_list:
+        shapes.append(x.shape)
+        # Element count with the last dimension dropped.
+        num_tokens.append(x.select(dim=-1, index=0).numel())
+        rows.append(x.flatten(0, -2))
+    return torch.cat(rows), shapes, num_tokens
+def uncat_with_shapes(
+    flattened: Tensor, shapes: List[Tuple[int]], num_tokens: List[int]
+) -> List[Tensor]:
+    """Split ``flattened`` along dim 0 and restore the original leading shapes.
+    The last dimension of every output is taken from ``flattened``, so it may
+    differ from the last dimension recorded in ``shapes``.
+    """
+    last_dim = torch.Size([flattened.shape[-1]])
+    pieces = torch.split_with_sizes(flattened, num_tokens, dim=0)
+    return [
+        piece.reshape(shape[:-1] + last_dim) for piece, shape in zip(pieces, shapes)
+    ]
+def named_replace(
+    fn: Callable,
+    module: nn.Module,
+    name: str = "",
+    depth_first: bool = True,
+    include_root: bool = False,
+) -> nn.Module:
+    """Walk the module tree and replace each visited module with ``fn(module=..., name=...)``.
+    Args:
+        fn: callback returning the module that should take the place of the
+            one it receives (possibly the same object).
+        module: root of the tree.
+        name: dotted prefix used to build the child names.
+        depth_first: if True a module is handed to ``fn`` after its children,
+            otherwise before them.
+        include_root: whether ``fn`` is also applied to ``module`` itself.
+    Returns:
+        The (possibly replaced) root module.
+    """
+    pre_order = include_root and not depth_first
+    post_order = include_root and depth_first
+    if pre_order:
+        module = fn(module=module, name=name)
+    # Snapshot the children first: they are re-assigned while iterating.
+    for attr_name, child in list(module.named_children()):
+        qualified = f"{name}.{attr_name}" if name else attr_name
+        replacement = named_replace(
+            fn=fn,
+            module=child,
+            name=qualified,
+            depth_first=depth_first,
+            include_root=True,
+        )
+        setattr(module, attr_name, replacement)
+    if post_order:
+        module = fn(module=module, name=name)
+    return module
+def named_apply(
+    fn: Callable,
+    module: nn.Module,
+    name: str = "",
+    depth_first: bool = True,
+    include_root: bool = False,
+) -> nn.Module:
+    """Call ``fn(module=..., name=...)`` on every descendant of ``module``.
+    The root itself is only visited when ``include_root`` is True. With
+    ``depth_first`` a module is visited after its children, otherwise before
+    them. Returns ``module`` unchanged.
+    """
+    def _visit_root():
+        if include_root:
+            fn(module=module, name=name)
+    if not depth_first:
+        _visit_root()
+    prefix = f"{name}." if name else ""
+    for key, child in module.named_children():
+        named_apply(
+            fn=fn,
+            module=child,
+            name=prefix + key,
+            depth_first=depth_first,
+            include_root=True,
+        )
+    if depth_first:
+        _visit_root()
+    return module
+def fix_random_seeds(seed: int = 31):
+    """
+    Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) random generators.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+def get_sha() -> str:
+    """Describe the git state of the checkout containing this file.
+    Returns a string of the form ``"sha: ..., status: ..., branch: ..."``.
+    Collection is best-effort: if any git call fails, the fields not yet
+    gathered keep their defaults (``N/A`` / ``clean``).
+    """
+    repo_dir = os.path.dirname(os.path.abspath(__file__))
+    def _git(*args):
+        raw = subprocess.check_output(["git", *args], cwd=repo_dir)
+        return raw.decode("ascii").strip()
+    sha, diff, branch = "N/A", "clean", "N/A"
+    try:
+        sha = _git("rev-parse", "HEAD")
+        # Output is discarded; the call is kept from the original sequence.
+        subprocess.check_output(["git", "diff"], cwd=repo_dir)
+        diff = "has uncommited changes" if _git("diff-index", "HEAD") else "clean"
+        branch = _git("rev-parse", "--abbrev-ref", "HEAD")
+    except Exception:
+        # Deliberately silent: not being inside a git checkout is not an error.
+        pass
+    return f"sha: {sha}, status: {diff}, branch: {branch}"
+def get_conda_env() -> Tuple[Optional[str], Optional[str]]:
+    """Return ``(CONDA_DEFAULT_ENV, CONDA_PREFIX)`` from the environment, each None when unset."""
+    env = os.environ
+    return env.get("CONDA_DEFAULT_ENV"), env.get("CONDA_PREFIX")
+def count_parameters(module: nn.Module) -> int:
+    """Return the total number of elements over all parameters of ``module``."""
+    return sum(param.nelement() for param in module.parameters())
+def has_batchnorms(model: nn.Module) -> bool:
+    """Return True if ``model`` or any of its submodules is a (Sync)BatchNorm layer."""
+    batchnorm_types = (
+        nn.BatchNorm1d,
+        nn.BatchNorm2d,
+        nn.BatchNorm3d,
+        nn.SyncBatchNorm,
+    )
+    return any(isinstance(sub, batchnorm_types) for sub in model.modules())

modelling_vitv2.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import Union
+import torch
+from transformers import PreTrainedModel
+from configuration_vitv2 import ViTv2Config
+from hf_src.model.image.vitv2.transformer import ViTv2
+class ViTv2PretrainedModel(PreTrainedModel):
+    """``PreTrainedModel`` wrapper that builds a ``ViTv2`` backbone from a ``ViTv2Config``."""
+    config_class = ViTv2Config
+    def __init__(self, config: ViTv2Config):
+        super().__init__(config)
+        # Config fields forwarded one-to-one as ViTv2 keyword arguments.
+        backbone_fields = (
+            "img_size",
+            "patch_size",
+            "embed_dim",
+            "depth",
+            "num_heads",
+            "mlp_ratio",
+            "init_values",
+            "num_register_tokens",
+        )
+        backbone_kwargs = {field: getattr(config, field) for field in backbone_fields}
+        self.backbone = ViTv2(**backbone_kwargs)
+        self.post_init()
+    def forward(self, *args, **kwargs) -> dict[str, Union[torch.Tensor, None]]:
+        """Forward all arguments to the backbone and return its output."""
+        return self.backbone(*args, **kwargs)

requirements.txt CHANGED Viewed

@@ -5,3 +5,5 @@ transformers>=4.38.0
 scikit-learn>=1.3.0
 Pillow>=9.0.0
 numpy>=1.24.0

 scikit-learn>=1.3.0
 Pillow>=9.0.0
 numpy>=1.24.0
+einops
+opt_einsum