Upload folder using huggingface_hub
- base.py +434 -0
- config.json +17 -0
- configuration_fisher.py +22 -0
- images.py +215 -0
- model.safetensors +3 -0
- modeling_fisher.py +254 -0
- modules.py +273 -0
base.py
ADDED
@@ -0,0 +1,434 @@
import logging
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import namedtuple
from dataclasses import dataclass
from functools import partial
from omegaconf import MISSING, II
from typing import Optional, Callable
from enum import Enum, auto


logger = logging.getLogger(__name__)


class Modality(Enum):
    AUDIO = auto()
    IMAGE = auto()
    TEXT = auto()


@dataclass
class D2vModalityConfig:
    type: Modality = MISSING
    prenet_depth: int = 0
    prenet_layerdrop: float = 0.0
    prenet_dropout: float = 0.0
    start_drop_path_rate: float = 0.0
    end_drop_path_rate: float = 0.0

    num_extra_tokens: int = 1
    init_extra_token_zero: bool = False

    mask_noise_std: float = 0.01
    mask_prob_min: Optional[float] = None
    mask_prob: float = 0.8
    inverse_mask: bool = True
    mask_prob_adjust: float = 0.07
    keep_masked_pct: float = 0.0
    flexible_mask: bool = False

    mask_length: int = 5
    add_masks: bool = False
    remove_masks: bool = False
    mask_dropout: float = 0.0
    encoder_zero_mask: bool = True

    mask_channel_prob: float = 0.0
    mask_channel_length: int = 64

    ema_local_encoder: bool = True  # used in data2vec_multi
    ema_local_decoder: bool = False
    local_grad_mult: float = 1.0
    flatten: str = 'freq'
    max_length: int = 128
    max_freq: int = 50

    use_alibi_encoder: bool = False
    alibi_scale: float = 1.0
    learned_alibi: bool = False
    alibi_max_pos: Optional[int] = None
    learned_alibi_scale: bool = False
    learned_alibi_scale_per_head: bool = False
    learned_alibi_scale_per_layer: bool = False

    num_alibi_heads: int = II("model.num_heads")
    model_depth: int = II("model.depth")


MaskInfo = namedtuple("MaskInfo", ["x_unmasked", "mask", "ids_restore", "ids_keep"])


class ModalitySpecificEncoder(nn.Module):
    def __init__(
        self,
        modality_cfg: D2vModalityConfig,
        embed_dim: int,
        local_encoder: nn.Module,
        project_features: nn.Module,
        fixed_positional_encoder: Optional[nn.Module],
        relative_positional_encoder: Optional[nn.Module],  # None
        context_encoder: nn.Module,
        decoder: Optional[nn.Module],
        get_alibi_bias: Optional[Callable[[int, int, str, str], torch.Tensor]],
    ):
        super().__init__()

        self.modality_cfg = modality_cfg
        self.local_encoder = local_encoder
        self.project_features = project_features
        self.fixed_positional_encoder = fixed_positional_encoder
        self.relative_positional_encoder = relative_positional_encoder
        self.context_encoder = context_encoder

        self.decoder = decoder
        self.get_alibi_bias = get_alibi_bias if modality_cfg.use_alibi_encoder else None

        self.local_grad_mult = self.modality_cfg.local_grad_mult

        self.extra_tokens = None
        if modality_cfg.num_extra_tokens > 0:
            self.extra_tokens = nn.Parameter(
                torch.zeros(1, modality_cfg.num_extra_tokens, embed_dim)
            )
            if not modality_cfg.init_extra_token_zero:
                nn.init.normal_(self.extra_tokens)
            elif self.extra_tokens.size(1) > 1:
                nn.init.normal_(self.extra_tokens[:, 1:])

        self.alibi_scale = None
        if self.get_alibi_bias is not None:
            self.alibi_scale = nn.Parameter(
                torch.full(
                    (
                        (modality_cfg.prenet_depth + modality_cfg.model_depth)
                        if modality_cfg.learned_alibi_scale_per_layer
                        else 1,
                        1,
                        self.modality_cfg.num_alibi_heads
                        if modality_cfg.learned_alibi_scale_per_head
                        else 1,
                        1,
                        1,
                    ),
                    modality_cfg.alibi_scale,
                    dtype=torch.float,
                ),
                requires_grad=modality_cfg.learned_alibi_scale,
            )

        if modality_cfg.learned_alibi and self.get_alibi_bias is not None:
            assert modality_cfg.alibi_max_pos is not None
            alibi_bias = self.get_alibi_bias(
                batch_size=1,
                time_steps=modality_cfg.alibi_max_pos,
                heads=modality_cfg.num_alibi_heads,
                scale=1.0,
                dtype=torch.float,
                device="cpu",
            )
            self.alibi_bias = nn.Parameter(alibi_bias)
            self.get_alibi_bias = partial(
                _learned_alibi_bias, alibi_bias=self.alibi_bias
            )

    def upgrade_state_dict_named(self, state_dict, name):
        k = f"{name}.alibi_scale"
        if k in state_dict and state_dict[k].dim() == 4:
            state_dict[k] = state_dict[k].unsqueeze(0)

        return state_dict

    def convert_padding_mask(self, x, padding_mask):
        return padding_mask

    def local_features(self, features):
        x = self.local_encoder(features)
        x = self.project_features(x)  # nn.Identity()
        return x

    def contextualized_features(
        self,
        x,
        padding_mask,
        mask,  # True
        remove_masked,  # train: True; infer: False
        clone_batch: int = 1,
        mask_seeds: Optional[torch.Tensor] = None,
        precomputed_mask=None,
    ):

        if padding_mask is not None:
            padding_mask = self.convert_padding_mask(x, padding_mask)  # [b,t,f] => [b,seq]

        local_features = x
        if mask and clone_batch == 1:
            local_features = local_features.clone()

        orig_B, orig_T, _ = x.shape
        pre_mask_B = orig_B
        # masking utilities are stripped in this inference port, so mask_info stays None
        mask_info = None

        x_pos = None
        # x: [B, seq_len, embed_dim]
        if self.fixed_positional_encoder is not None:  # models.modules.FixedPositionalEncoder
            x = x + self.fixed_positional_encoder(x, padding_mask)[:, :x.size(1), :]

        if self.relative_positional_encoder is not None:
            x_pos = self.relative_positional_encoder(x)

        masked_padding_mask = padding_mask

        alibi_bias = None
        alibi_scale = self.alibi_scale

        if self.get_alibi_bias is not None:
            alibi_bias = self.get_alibi_bias(
                batch_size=pre_mask_B,
                time_steps=orig_T,
                heads=self.modality_cfg.num_alibi_heads,
                dtype=torch.float32,
                device=x.device,
            )

            if alibi_scale is not None:
                alibi_scale = alibi_scale.clamp_min(0)
                if alibi_scale.size(0) == 1:
                    alibi_bias = alibi_bias * alibi_scale.squeeze(0).type_as(alibi_bias)
                    alibi_scale = None

            if clone_batch > 1:
                alibi_bias = alibi_bias.repeat_interleave(clone_batch, 0)

            if mask_info is not None and remove_masked:
                alibi_bias = masked_alibi(alibi_bias, mask_info)

        if self.extra_tokens is not None:
            num = self.extra_tokens.size(1)
            x = torch.cat([self.extra_tokens.expand(x.size(0), -1, -1), x], dim=1)
            if masked_padding_mask is not None:
                # B x T
                masked_padding_mask = F.pad(masked_padding_mask, (num, 0))
            if alibi_bias is not None:
                # B x H x T x T
                alibi_bias = F.pad(alibi_bias, (num, 0, num, 0))

        x = self.context_encoder(
            x,
            masked_padding_mask,
            alibi_bias,
            alibi_scale[: self.modality_cfg.prenet_depth]
            if alibi_scale is not None
            else None,
        )

        return {
            "x": x,
            "local_features": local_features,
            "padding_mask": masked_padding_mask,
            "alibi_bias": alibi_bias,
            "alibi_scale": alibi_scale[self.modality_cfg.prenet_depth :]
            if alibi_scale is not None and alibi_scale.size(0) > 1
            else alibi_scale,
            "encoder_mask": mask_info,
        }

    def forward(
        self,
        features,
        padding_mask,
        mask: bool,
        remove_masked: bool,
        clone_batch: int = 1,
        mask_seeds: Optional[torch.Tensor] = None,
        precomputed_mask=None,
    ):
        x = self.local_features(features)  # patch embed
        # x: [bs, time*freq, embed_dim], e.g. [12, 512, 768]
        out = self.contextualized_features(
            x,
            padding_mask,
            mask,
            remove_masked,
            clone_batch,
            mask_seeds,
            precomputed_mask,
        )  # add mask, discard masked tokens, context encoder (only layer norm)
        return out

    def reset_parameters(self):
        pass

    def remove_pretraining_modules(self, keep_decoder=False):
        if not keep_decoder:
            self.decoder = None


def get_annealed_rate(start, end, curr_step, total_steps):
    if curr_step >= total_steps:
        return end
    r = end - start
    pct_remaining = 1 - curr_step / total_steps
    return end - r * pct_remaining


def get_alibi(
    max_positions: int,
    attention_heads: int,
    dims: int = 1,
    distance: str = "manhattan",
):
    def get_slopes(n):
        def get_slopes_power_of_2(n):
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            ratio = start
            return [start * ratio**i for i in range(n)]

        # In the paper, we only train models that have 2^a heads for some
        # a. This function has some good properties that only occur when
        # the input is a power of 2. To maintain that even when the number
        # of heads is not a power of 2, we use this workaround.
        if math.log2(n).is_integer():
            return get_slopes_power_of_2(n)
        else:
            closest_power_of_2 = 2 ** math.floor(math.log2(n))
            return (
                get_slopes_power_of_2(closest_power_of_2)
                + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
            )

    maxpos = max_positions
    attn_heads = attention_heads
    slopes = torch.Tensor(get_slopes(attn_heads))

    if dims == 1:
        # prepare the ALiBi position linear bias. Note that wav2vec2 is a
        # non-autoregressive model, so we want a symmetric mask with 0 on the
        # diagonal and otherwise linearly decreasing values
        pos_bias = (
            torch.abs(
                torch.arange(maxpos).unsqueeze(0) - torch.arange(maxpos).unsqueeze(1)
            )
            * -1
        )
    elif dims == 2:
        if distance == "manhattan":
            df = lambda x1, y1, x2, y2: abs(x1 - x2) + abs(y1 - y2)
        elif distance == "euclidean":
            df = lambda x1, y1, x2, y2: math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

        n = math.sqrt(max_positions)
        assert n.is_integer(), n
        n = int(n)

        pos_bias = torch.zeros((max_positions, max_positions))

        for i in range(n):
            for j in range(n):
                for k in range(n):
                    for l in range(n):
                        new_x = i * n + j
                        new_y = k * n + l
                        pos_bias[new_x, new_y] = -df(i, j, k, l)

    else:
        raise Exception(f"unsupported number of alibi dims: {dims}")

    alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand(
        attn_heads, -1, -1
    )

    return alibi_bias


def get_alibi_bias(
    alibi_biases,
    batch_size,
    time_steps,
    heads,
    dtype,
    device,
    dims=1,
    distance="manhattan",
):
    cache_key = f"{dims}_{heads}_{distance}"

    buffered = alibi_biases.get(cache_key, None)

    target_size = heads * batch_size
    if (
        buffered is None
        or buffered.size(0) < target_size
        or buffered.size(1) < time_steps
        or buffered.dtype != dtype
        or buffered.device != device
    ):
        bt = max(time_steps, buffered.size(1) if buffered is not None else 0)
        bn = max(target_size, buffered.size(0) if buffered is not None else 0) // heads

        buffered = (
            get_alibi(bt, heads, dims=dims, distance=distance)
            .to(dtype=dtype, device=device)
            .repeat(bn, 1, 1)
        )

        alibi_biases[cache_key] = buffered

    b = buffered[:target_size, :time_steps, :time_steps]
    b = b.view(batch_size, heads, time_steps, time_steps)
    return b


def _learned_alibi_bias(
    alibi_bias,
    batch_size,
    time_steps,
    heads,
    scale,
    dtype,
    device,
):
    assert alibi_bias.size(1) == heads, alibi_bias.shape
    assert alibi_bias.dtype == dtype, alibi_bias.dtype
    assert alibi_bias.device == device, alibi_bias.device

    if alibi_bias.size(-1) < time_steps:
        psz = math.ceil((time_steps - alibi_bias.size(-1)) / 2)
        alibi_bias = F.pad(alibi_bias, (psz, psz, psz, psz), mode="replicate")

    alibi_bias = alibi_bias.expand(batch_size, -1, -1, -1) * scale
    return alibi_bias[..., :time_steps, :time_steps]


def masked_alibi(alibi_bias, mask_info):
    H = alibi_bias.size(1)

    orig_bias = alibi_bias

    index = mask_info.ids_keep.unsqueeze(1)[..., 0].unsqueeze(-1)
    alibi_bias = torch.gather(
        orig_bias,
        dim=-2,
        index=index.expand(-1, H, -1, mask_info.ids_restore.size(1)),
    )
    alibi_bias = torch.gather(
        alibi_bias,
        dim=-1,
        index=index.transpose(-1, -2).expand(-1, H, alibi_bias.size(-2), -1),
    )

    return alibi_bias
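
As a usage note (not part of the uploaded files): the cached ALiBi helper above can be exercised on its own. A minimal sketch, assuming the folder is importable so that `get_alibi_bias` from base.py is in scope; with dims=2 the number of time steps must be a perfect square, since the bias is built over an n x n patch grid:

import torch

cache = {}  # the alibi_biases dict, keyed by f"{dims}_{heads}_{distance}"
bias = get_alibi_bias(
    cache,
    batch_size=2,
    time_steps=16,   # 16 = 4 x 4 patches; dims=2 asserts a perfect square
    heads=3,
    dtype=torch.float32,
    device=torch.device("cpu"),
    dims=2,
    distance="manhattan",
)
print(bias.shape)  # torch.Size([2, 3, 16, 16]); the cache is reused on later calls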
config.json
ADDED
@@ -0,0 +1,17 @@
{
  "architectures": [
    "FISHERModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_fisher.FISHERConfig",
    "AutoModel": "modeling_fisher.FISHERModel"
  },
  "band_width": 100,
  "depth": 12,
  "embed_dim": 192,
  "max_band_per_sample": 64,
  "model_type": "fisher",
  "num_heads": 3,
  "torch_dtype": "float32",
  "transformers_version": "4.53.3"
}
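
Usage note (not part of the upload): the `auto_map` entries above are what let the Transformers Auto classes resolve the custom code in this repo, so loading needs `trust_remote_code=True`. A minimal sketch, with the repo id as a placeholder for wherever this folder is hosted:

from transformers import AutoConfig, AutoModel

repo_id = "<namespace>/<repo>"  # placeholder: the Hub repo this folder was uploaded to
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)  # -> FISHERConfig
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)    # -> FISHERModel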
configuration_fisher.py
ADDED
@@ -0,0 +1,22 @@
from transformers import PretrainedConfig


class FISHERConfig(PretrainedConfig):
    model_type = "fisher"

    def __init__(
        self,
        band_width=100,
        embed_dim=192,
        num_heads=3,
        max_band_per_sample=64,
        depth=12,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.band_width = band_width
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.max_band_per_sample = max_band_per_sample
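
Quick sanity sketch (not part of the upload): the constructor defaults mirror config.json, and any field can be overridden through keyword arguments as usual for a `PretrainedConfig`:

from configuration_fisher import FISHERConfig  # plain import works; this file has no relative imports

cfg = FISHERConfig()
assert cfg.embed_dim == 192 and cfg.depth == 12 and cfg.band_width == 100
cfg_small = FISHERConfig(depth=6)  # override any field; extra kwargs go to PretrainedConfig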
images.py
ADDED
@@ -0,0 +1,215 @@
import torch
import torch.nn as nn
import numpy as np

from functools import partial
from dataclasses import dataclass
from typing import Callable, Dict, Optional
from enum import Enum, auto
from einops import rearrange
from omegaconf import II

from .base import (
    D2vModalityConfig,
    ModalitySpecificEncoder,
    get_alibi_bias,
)
from .modules import (
    BlockEncoder,
    FixedPositionalEncoder,
    PatchEmbed_new,
    get_2d_sincos_pos_embed_flexible,
)


class Modality(Enum):
    AUDIO = auto()
    IMAGE = auto()
    TEXT = auto()


@dataclass
class D2vImageConfig(D2vModalityConfig):
    type: Modality = Modality.IMAGE

    in_chans: int = 1
    patch_size: int = 16
    embed_dim: int = II('model.embed_dim')

    alibi_dims: int = 2
    alibi_distance: str = "manhattan"

    fixed_positions: bool = True

    transformer_decoder: bool = False
    enc_dec_transformer: bool = False
    target_length: int = 1024
    max_length: int = 128
    max_freq: int = 50

    flatten: str = 'freq'  # 'time', 'freq'


class ImageEncoder(ModalitySpecificEncoder):
    # forward() implemented in models.base.ModalitySpecificEncoder

    modality_cfg: D2vImageConfig

    def __init__(
        self,
        modality_cfg: D2vImageConfig,
        embed_dim: int,
        make_block: Callable[[float, Optional[int], Optional[int]], nn.ModuleList],
        norm_layer: Callable[[int], nn.LayerNorm],
        layer_norm_first: bool,
        alibi_biases: Dict,
        task=None,
    ):
        self.patch_size = modality_cfg.patch_size
        self.H = modality_cfg.target_length // self.patch_size  # 64

        # convert spec to patch embeddings, using conv2d
        local_encoder = PatchEmbed_new(
            patch_size=modality_cfg.patch_size,  # 16
            in_chans=modality_cfg.in_chans,  # 1
            embed_dim=modality_cfg.embed_dim,  # 768
            stride=modality_cfg.patch_size,  # 16
            flatten=modality_cfg.flatten
        )

        # CNN initialization
        w = local_encoder.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        if modality_cfg.embed_dim != embed_dim:
            local_encoder = nn.Sequential(
                local_encoder,
                nn.Linear(modality_cfg.embed_dim, embed_dim),
            )

        project_features = nn.Identity()

        # note: max_length controls the maximum time length of audio -> "64" for 10s;
        # here we define it as 2 min, you can change it yourself
        max_length = modality_cfg.max_length
        max_freq = modality_cfg.max_freq

        # side_n = int(num_patches ** 0.5)
        # note: we fix the variable-length sequence problem here -> support up to 2 min of audio
        emb = get_2d_sincos_pos_embed_flexible(
            embed_dim,
            (max_length, max_freq),
            cls_token=False,
        )
        pos_embed = torch.from_numpy(emb[:max_length * max_freq, :]).float().unsqueeze(0)

        fixed_positional_encoder = (
            FixedPositionalEncoder(pos_embed) if modality_cfg.fixed_positions else None  # True
        )

        dpr = np.linspace(  # drop_path_rate
            modality_cfg.start_drop_path_rate,
            modality_cfg.end_drop_path_rate,
            modality_cfg.prenet_depth,  # actual: 0
        )

        # actual: only layer norm
        context_encoder = BlockEncoder(
            nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)),
            norm_layer(embed_dim) if not layer_norm_first else None,
            layer_norm_first,
            modality_cfg.prenet_layerdrop,
            modality_cfg.prenet_dropout,
        )

        alibi_bias_fn = partial(
            get_alibi_bias,
            alibi_biases=alibi_biases,
            heads=modality_cfg.num_alibi_heads,
            dims=modality_cfg.alibi_dims,
            distance=modality_cfg.alibi_distance,
        )

        super().__init__(
            modality_cfg=modality_cfg,
            embed_dim=embed_dim,
            local_encoder=local_encoder,  # patch embed
            project_features=project_features,  # nn.Identity()
            fixed_positional_encoder=fixed_positional_encoder,
            relative_positional_encoder=None,
            context_encoder=context_encoder,  # apply mask
            decoder=None,
            get_alibi_bias=alibi_bias_fn,
        )

    def reset_parameters(self):
        super().reset_parameters()

    @torch.no_grad()
    def patchify(self, imgs):
        """
        imgs: (N, 3, H, W); audio: (N, 1, H, W), e.g. 1024/16 = 64 and 128/16 = 8 patches
        x: (N, L, patch_size**2 * C)
        """
        if self.modality_cfg.in_chans == 1:  # actual: this one
            p = self.modality_cfg.patch_size
            h = imgs.shape[2] // p
            w = imgs.shape[3] // p
            # h,w = self.patch_embed.patch_hw
            x = imgs.reshape(shape=(imgs.shape[0], 1, h, p, w, p))
            x = torch.einsum('nchpwq->nhwpqc', x)
            x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 1))

        else:
            p = self.modality_cfg.patch_size
            h = w = imgs.shape[2] // p
            x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
            x = torch.einsum("nchpwq->nhwpqc", x)
            x = x.reshape(shape=(imgs.shape[0], h * w, p ** 2 * 3))

        return x

    @torch.no_grad()
    def unpatchify(self, x):
        """
        x: (N, L, patch_size**2 * C)
        imgs: (N, C, H, W)
        """
        p = self.modality_cfg.patch_size
        h = w = int(x.shape[1] ** 0.5)  # number of patches along the two axes
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, -1))
        x = torch.einsum("nhwpqc->nchpwq", x)
        imgs = x.reshape(shape=(x.shape[0], -1, h * p, h * p))
        return imgs

    def convert_padding_mask(
        self,
        x: torch.Tensor,
        padding_mask: torch.Tensor
    ) -> torch.Tensor:
        '''patchify and serialize padding_mask: [b,t,f] => [b,t_patch,f_patch] => [b,patch_seq]

        Args:
            x (torch.Tensor): input_features
            padding_mask (torch.Tensor): [b,t,f], 1 for padded frames

        Returns:
            torch.Tensor: serialized padding mask. [b,patch_seq]
        '''
        B, T, F = x.shape
        t_extra, f_extra = T % self.patch_size, F % self.patch_size
        # drop the remainder that does not fill a whole patch
        # (guarded: slicing with -0 would return an empty tensor)
        if t_extra:
            padding_mask = padding_mask[:, :-t_extra, :]
        if f_extra:
            padding_mask = padding_mask[:, :, :-f_extra]
        padding_mask = rearrange(
            padding_mask,
            'b (tp p) (fp q) -> b tp fp (p q)',
            p=self.patch_size, q=self.patch_size
        )
        # a patch counts as padded only if every frame inside it is padded
        padding_mask = padding_mask.all(-1)

        if self.modality_cfg.flatten == 'time':
            padding_mask = padding_mask.transpose(-2, -1).flatten(1)
        else:
            padding_mask = padding_mask.flatten(1)
        return padding_mask
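
Side note (not part of the upload): the reshape/einsum sequence in `patchify` is the classic MAE patch serialization, and for the single-channel case it is equivalent to one einops rearrange. A small self-contained check:

import torch
from einops import rearrange

p = 16
imgs = torch.randn(2, 1, 64, 32)          # (N, 1, H, W), H and W divisible by p
h, w = imgs.shape[2] // p, imgs.shape[3] // p

x = imgs.reshape(2, 1, h, p, w, p)        # as in patchify()
x = torch.einsum('nchpwq->nhwpqc', x)
x = x.reshape(2, h * w, p ** 2)

y = rearrange(imgs, 'n c (h p) (w q) -> n (h w) (p q c)', p=p, q=p)
assert torch.equal(x, y)                  # same (N, L, p*p) patch ordering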
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cafbbf94ee1858decdf88f8f4a272e200f638561153d02f93f255995d47ba7c8
size 21567672
modeling_fisher.py
ADDED
@@ -0,0 +1,254 @@
import torch
import numpy as np
import torch.nn as nn

from functools import partial
from einops import rearrange
from typing import Callable, Optional
from dataclasses import dataclass, field, is_dataclass
from transformers import PreTrainedModel

from .configuration_fisher import FISHERConfig
from .base import (
    D2vModalityConfig,
    ModalitySpecificEncoder,
)
from .modules import AltBlock
from .images import (
    D2vImageConfig,
    ImageEncoder,
)


@dataclass
class D2vModalitiesConfig:
    image: D2vImageConfig = field(default_factory=lambda *args: D2vImageConfig())


@dataclass
class Data2VecMultiConfig:
    depth: int = 12

    # band split
    band_width: int = 100

    # standard vision Transformer
    start_drop_path_rate: float = 0.0
    end_drop_path_rate: float = 0.0
    num_heads: int = 12
    norm_eps: float = 1e-6
    norm_affine: bool = True
    encoder_dropout: float = 0.0
    post_mlp_drop: float = 0.0
    attention_dropout: float = 0.0
    activation_dropout: float = 0.0
    dropout_input: float = 0.0
    layerdrop: float = 0.0
    embed_dim: int = 768
    mlp_ratio: float = 4.0
    layer_norm_first: bool = False

    end_of_block_targets: bool = False

    # clone batch for multi-mask strategy
    max_band_per_sample: int = 64

    # normalization for teacher Transformer layer output
    layer_norm_target_layer: bool = False
    batch_norm_target_layer: bool = False
    instance_norm_target_layer: bool = True
    instance_norm_targets: bool = False
    layer_norm_targets: bool = True

    modalities: D2vModalitiesConfig = field(default_factory=lambda *args: D2vModalitiesConfig())


def update_dataclass(instance, data_dict):
    if not data_dict:
        return instance

    for field_name, field_value in data_dict.items():
        if hasattr(instance, field_name):
            current_value = getattr(instance, field_name)
            if is_dataclass(current_value) and isinstance(field_value, dict):
                update_dataclass(current_value, field_value)
            else:
                setattr(instance, field_name, field_value)
    return instance


class FISHER(nn.Module):
    def __init__(self, config: FISHERConfig):
        super().__init__()
        cfg = Data2VecMultiConfig()
        update_dataclass(cfg, config.to_dict())
        cfg.modalities.image.embed_dim = cfg.embed_dim
        self.cfg = cfg

        make_layer_norm = partial(
            nn.LayerNorm, eps=cfg.norm_eps, elementwise_affine=cfg.norm_affine
        )

        def make_block(drop_path, dim=None, heads=None):
            return AltBlock(
                cfg.embed_dim if dim is None else dim,
                cfg.num_heads if heads is None else heads,
                cfg.mlp_ratio,
                qkv_bias=True,
                drop=0.0,
                attn_drop=cfg.attention_dropout,
                mlp_drop=cfg.activation_dropout,
                post_mlp_drop=cfg.post_mlp_drop,
                drop_path=drop_path,
                norm_layer=make_layer_norm,
                layer_norm_first=cfg.layer_norm_first,
                ffn_targets=not cfg.end_of_block_targets,
            )

        self.alibi_biases = {}
        self.modality_encoders = nn.ModuleDict()

        mod_cfg = getattr(cfg.modalities, 'image')
        enc = self.make_modality_encoder(
            mod_cfg,
            cfg.embed_dim,
            make_block,
            make_layer_norm,
            cfg.layer_norm_first,
            self.alibi_biases,
        )
        self.modality_encoders['IMAGE'] = enc

        dpr = np.linspace(cfg.start_drop_path_rate, cfg.end_drop_path_rate, cfg.depth)

        self.blocks = nn.ModuleList([make_block(dpr[i]) for i in range(cfg.depth)])

        self.norm = None
        if cfg.layer_norm_first:
            self.norm = make_layer_norm(cfg.embed_dim)

        # band split
        self.band_width = cfg.band_width
        self.patch_size = cfg.modalities.image.patch_size

    def make_modality_encoder(
        self,
        cfg: D2vModalityConfig,
        embed_dim: int,
        make_block: Callable[[float], nn.ModuleList],
        norm_layer: Callable[[int], nn.LayerNorm],
        layer_norm_first: bool,
        alibi_biases,
        task=None,
    ) -> ModalitySpecificEncoder:
        return ImageEncoder(
            cfg,
            embed_dim,
            make_block,
            norm_layer,
            layer_norm_first,
            alibi_biases,
            task,
        )

    def forward(
        self,
        source: torch.Tensor,
        target=None,
        id=None,
        mode='IMAGE',
        padding_mask: Optional[torch.Tensor] = None,
        mask: bool = True,
        features_only: bool = False,
        force_remove_masked=False,
        precomputed_mask: Optional[torch.Tensor] = None,
    ):
        # band split
        num_band = source.shape[-1] // self.band_width
        source = torch.stack(source.split(self.band_width, dim=-1)[:num_band])  # drop residual
        source = rearrange(source, 'nb B c t f -> (B nb) c t f')
        clone_batch = self.cfg.max_band_per_sample // num_band

        feature_extractor = self.modality_encoders[mode]  # models.images.ImageEncoder

        # extract (unmasked) features using CNN encoder
        extractor_out = feature_extractor(
            source,
            padding_mask,
            mask,
            remove_masked=not features_only or force_remove_masked,  # train: True; infer: False
            clone_batch=clone_batch if not features_only else 1,
            mask_seeds=None,
            precomputed_mask=precomputed_mask,
        )

        # x in shape (batch_size * clone_batch, patch_frame(64) * patch_frequency(8) * unmask_ratio(0.2) + 1 (cls_token), 768 (feature dimension))
        x = extractor_out["x"]
        # encoder_mask is applied at the sub-band level
        encoder_mask = extractor_out["encoder_mask"]  # models.base.MaskInfo, ["x_unmasked", "mask", "ids_restore", "ids_keep"]
        masked_padding_mask = extractor_out["padding_mask"]
        masked_alibi_bias = extractor_out.get("alibi_bias", None)
        alibi_scale = extractor_out.get("alibi_scale", None)

        # standard Transformer (for student encoder)
        layer_results = []
        for i, blk in enumerate(self.blocks):
            ab = masked_alibi_bias
            if ab is not None and alibi_scale is not None:
                scale = (
                    alibi_scale[i]
                    if alibi_scale.size(0) > 1
                    else alibi_scale.squeeze(0)
                )
                ab = ab * scale.type_as(ab)

            x, lr = blk(
                x,
                padding_mask=masked_padding_mask,
                alibi_bias=ab,
            )
            if features_only:
                layer_results.append(lr)

        if self.norm is not None:
            x = self.norm(x)

        # extract features for fine-tuning
        # (the pretraining loss branch is not included in this inference port,
        # so nothing is returned when features_only is False)
        if features_only:
            return {
                "x": x,
                "padding_mask": masked_padding_mask,
                "layer_results": layer_results,
                "mask": encoder_mask,
            }

    def extract_features(
        self, source, mode='IMAGE', padding_mask=None, mask=False
    ):
        num_band = source.shape[-1] // self.band_width
        res = self.forward(
            source,
            mode=mode,
            padding_mask=padding_mask,
            mask=mask,
            features_only=True,
        )
        x = res['x'][:, 0]
        x = rearrange(x, '(B nb) D -> B (nb D)', nb=num_band)
        return x


class FISHERModel(PreTrainedModel):
    config_class = FISHERConfig

    def __init__(self, cfg: FISHERConfig):
        super().__init__(cfg)
        self.cfg = cfg
        self.model = FISHER(cfg)

    def forward(self, *args, **kwargs):
        return self.model(*args, **kwargs)

    def extract_features(self, x):
        return self.model.extract_features(x)
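
Putting it together (not part of the upload), continuing the AutoModel sketch after config.json above: `extract_features` splits the input spectrogram into `band_width`-bin frequency bands, encodes each band separately, and concatenates the per-band CLS embeddings. A minimal sketch with a random placeholder input and the shipped defaults (band_width=100, embed_dim=192):

import torch

spec = torch.randn(2, 1, 1024, 200)  # (B, 1, time_frames, freq_bins); 200 bins -> 2 bands, remainder dropped

model.eval()
with torch.no_grad():
    feats = model.extract_features(spec)

print(feats.shape)  # torch.Size([2, 384]): 2 bands x 192-dim CLS embedding per band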
modules.py
ADDED
@@ -0,0 +1,273 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from timm.models.layers import to_2tuple


class PatchEmbed_new(nn.Module):
    """ Flexible Image to Patch Embedding
    """
    def __init__(
        self,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        stride=16,
        flatten='freq'
    ):
        super().__init__()
        self.flatten = flatten
        patch_size = to_2tuple(patch_size)
        stride = to_2tuple(stride)
        assert flatten in ['time', 'freq']

        self.patch_size = patch_size

        # no padding for conv; patches overlap when stride < patch_size
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride)

    def forward(self, x):
        x = self.proj(x)  # (B,768,64,8)
        if self.flatten == 'freq':
            x = x.flatten(2).transpose(1, 2)  # flatten from dim 2
        else:
            x = x.transpose(-2, -1).flatten(2).transpose(1, 2)
        return x


def get_2d_sincos_pos_embed_flexible(embed_dim, grid_size, cls_token=False):
    """
    grid_size: (height, width) of the grid
    return:
    pos_embed: [grid_size[0]*grid_size[1], embed_dim] or [1+grid_size[0]*grid_size[1], embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size[0], dtype=np.float32)
    grid_w = np.arange(grid_size[1], dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of the dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


class FixedPositionalEncoder(nn.Module):
    def __init__(self, pos_embed: torch.Tensor):
        super().__init__()
        self.positions = pos_embed

    def forward(self, x: torch.Tensor, padding_mask):
        return self.positions.to(x.device)


class BlockEncoder(nn.Module):
    def __init__(self, blocks, norm_layer, layer_norm_first, layerdrop, dropout):
        super().__init__()
        self.blocks = blocks
        self.norm = norm_layer
        self.layer_norm_first = layer_norm_first
        self.layerdrop = layerdrop
        self.dropout = nn.Dropout(dropout, inplace=True)

    def forward(self, x, padding_mask, alibi_bias, alibi_scale):
        if self.norm is not None and not self.layer_norm_first:
            x = self.norm(x)

        x = self.dropout(x)

        for i, blk in enumerate(self.blocks):
            if (
                not self.training
                or self.layerdrop == 0
                or (np.random.random() > self.layerdrop)
            ):
                ab = alibi_bias
                if ab is not None and alibi_scale is not None:
                    scale = (
                        alibi_scale[i]
                        if alibi_scale.size(0) > 1
                        else alibi_scale.squeeze(0)
                    )
                    ab = ab * scale.type_as(ab)
                x, _ = blk(x, padding_mask, ab)

        if self.norm is not None and self.layer_norm_first:
            x = self.norm(x)

        return x


class AltBlock(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        mlp_drop=0.0,
        post_mlp_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        layer_norm_first=True,
        ffn_targets=False,
        cosine_attention=False,
    ):
        super().__init__()

        self.layer_norm_first = layer_norm_first
        self.ffn_targets = ffn_targets

        from timm.models.vision_transformer import DropPath, Mlp

        self.norm1 = norm_layer(dim)
        self.attn = AltAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            cosine_attention=cosine_attention,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop,
        )
        self.post_mlp_dropout = nn.Dropout(post_mlp_drop, inplace=False)

    def forward(self, x, padding_mask=None, alibi_bias=None):
        if self.layer_norm_first:
            x = x + self.drop_path(self.attn(self.norm1(x), padding_mask, alibi_bias))
            r = x = self.mlp(self.norm2(x))
            t = x
            x = r + self.drop_path(self.post_mlp_dropout(x))
            if not self.ffn_targets:
                t = x
        else:
            x = x + self.drop_path(self.attn(x, padding_mask, alibi_bias))
            r = x = self.norm1(x)
            x = self.mlp(x)
            t = x
            x = self.norm2(r + self.drop_path(self.post_mlp_dropout(x)))
            if not self.ffn_targets:
                t = x

        return x, t


class AltAttention(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        cosine_attention=False,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.cosine_attention = cosine_attention

        if cosine_attention:
            self.logit_scale = nn.Parameter(
                torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True
            )

    def forward(self, x, padding_mask=None, alibi_bias=None):
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)  # qkv x B x H x L x D
        )
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        dtype = q.dtype

        if self.cosine_attention:
            # cosine attention
            attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
            logit_scale = torch.clamp(
                self.logit_scale, max=torch.log(torch.tensor(1.0 / 0.01))
            ).exp()
            attn = attn * logit_scale
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)

        if alibi_bias is not None:
            attn = attn.type_as(alibi_bias)
            attn[:, : alibi_bias.size(1)] += alibi_bias

        if padding_mask is not None and padding_mask.any():
            attn = attn.masked_fill(
                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf"),
            )

        attn = attn.softmax(dim=-1, dtype=torch.float32).to(dtype=dtype)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2)
        x = x.reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
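
Shape note (not part of the upload): the sinusoidal table built here is what images.py slices into its fixed positional encoder, over a (max_length=128) x (max_freq=50) patch grid. A quick check; modules.py has no relative imports, so it can be imported directly when the file is on the path:

from modules import get_2d_sincos_pos_embed_flexible

emb = get_2d_sincos_pos_embed_flexible(192, (128, 50), cls_token=False)
print(emb.shape)  # (6400, 192): one row per grid cell; half the channels encode
                  # the time index, half the frequency index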