Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff.
- .gitattributes +2 -0
- models/MAE_SDT.py +639 -0
- models/__init__.py +0 -0
- models/__pycache__/MAE_SDT.cpython-311.pyc +0 -0
- models/__pycache__/MAE_SDT.cpython-312.pyc +0 -0
- models/__pycache__/__init__.cpython-311.pyc +0 -0
- models/__pycache__/__init__.cpython-312.pyc +0 -0
- models/__pycache__/__init__.cpython-39.pyc +0 -0
- models/__pycache__/encoder.cpython-311.pyc +0 -0
- models/__pycache__/encoder.cpython-312.pyc +0 -0
- models/__pycache__/metaformer.cpython-311.pyc +0 -0
- models/__pycache__/metaformer.cpython-312.pyc +0 -0
- models/__pycache__/neuron.cpython-311.pyc +0 -0
- models/__pycache__/neuron.cpython-312.pyc +0 -0
- models/__pycache__/qk_model_v1_1003.cpython-311.pyc +0 -0
- models/__pycache__/qkformer.cpython-311.pyc +0 -0
- models/__pycache__/qkformer.cpython-312.pyc +0 -0
- models/__pycache__/sd_former_v1.cpython-311.pyc +0 -0
- models/__pycache__/sd_former_v1.cpython-312.pyc +0 -0
- models/__pycache__/sdtv3.cpython-311.pyc +0 -0
- models/__pycache__/sdtv3.cpython-312.pyc +0 -0
- models/__pycache__/sdtv3.cpython-39.pyc +0 -0
- models/__pycache__/sdtv3_large.cpython-311.pyc +0 -0
- models/__pycache__/sdtv3_large.cpython-312.pyc +0 -0
- models/__pycache__/spikformer.cpython-311.pyc +0 -0
- models/__pycache__/spikformer.cpython-312.pyc +0 -0
- models/__pycache__/vit.cpython-311.pyc +3 -0
- models/__pycache__/vit.cpython-312.pyc +3 -0
- models/encoder.py +158 -0
- models/metaformer.py +1538 -0
- models/neuron.py +1587 -0
- models/q_vit/Quant.py +185 -0
- models/q_vit/__init__.py +0 -0
- models/q_vit/__pycache__/Quant.cpython-311.pyc +0 -0
- models/q_vit/__pycache__/Quant.cpython-312.pyc +0 -0
- models/q_vit/__pycache__/__init__.cpython-311.pyc +0 -0
- models/q_vit/__pycache__/__init__.cpython-312.pyc +0 -0
- models/q_vit/__pycache__/_quan_base.cpython-311.pyc +0 -0
- models/q_vit/__pycache__/_quan_base.cpython-312.pyc +0 -0
- models/q_vit/__pycache__/quant_vision_transformer.cpython-311.pyc +0 -0
- models/q_vit/__pycache__/quant_vision_transformer.cpython-312.pyc +0 -0
- models/q_vit/_quan_base.py +208 -0
- models/q_vit/quant_vision_transformer.py +527 -0
- models/qk_model_v1_1003.py +426 -0
- models/qk_model_with_delay/__init__.py +0 -0
- models/qk_model_with_delay/__pycache__/__init__.cpython-311.pyc +0 -0
- models/qk_model_with_delay/__pycache__/delay_synaptic_func_inter.cpython-311.pyc +0 -0
- models/qk_model_with_delay/__pycache__/delay_synaptic_inter_model.cpython-311.pyc +0 -0
- models/qk_model_with_delay/delay_synaptic_func_inter.py +169 -0
- models/qk_model_with_delay/delay_synaptic_inter_model.py +459 -0
.gitattributes
CHANGED
|
@@ -91,3 +91,5 @@ visual-aids/vit-tiny-reluact-16-224/erf_vit_tiny_relu_16_224_w_pretrained_B8_att
|
|
| 91 |
visual-aids/vit-tiny-reluact-16-224/erf_vit_tiny_relu_16_224_w_pretrained_B9_attn_proj.pdf filter=lfs diff=lfs merge=lfs -text
|
| 92 |
visual-aids/vit-tiny-reluact-16-224/erf_vit_tiny_relu_16_224_w_pretrained_all_layers.pdf filter=lfs diff=lfs merge=lfs -text
|
| 93 |
visual-aids/vit-tiny-reluact-16-224/erf_vit_tiny_relu_16_224_w_pretrained_average.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 91 |
visual-aids/vit-tiny-reluact-16-224/erf_vit_tiny_relu_16_224_w_pretrained_B9_attn_proj.pdf filter=lfs diff=lfs merge=lfs -text
|
| 92 |
visual-aids/vit-tiny-reluact-16-224/erf_vit_tiny_relu_16_224_w_pretrained_all_layers.pdf filter=lfs diff=lfs merge=lfs -text
|
| 93 |
visual-aids/vit-tiny-reluact-16-224/erf_vit_tiny_relu_16_224_w_pretrained_average.pdf filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
models/__pycache__/vit.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
models/__pycache__/vit.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
|
models/MAE_SDT.py
ADDED
|
@@ -0,0 +1,639 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import partial
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torchinfo
|
| 5 |
+
from timm.models.layers import to_2tuple, trunc_normal_, DropPath
|
| 6 |
+
from timm.models.registry import register_model
|
| 7 |
+
from timm.models.vision_transformer import _cfg
|
| 8 |
+
from einops.layers.torch import Rearrange
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from timm.models.vision_transformer import PatchEmbed, Block
|
| 11 |
+
|
| 12 |
+
from spikingjelly.clock_driven import layer
|
| 13 |
+
import copy
|
| 14 |
+
from torchvision import transforms
|
| 15 |
+
import matplotlib.pyplot as plt
|
| 16 |
+
|
| 17 |
+
import models.encoder as encoder
|
| 18 |
+
from .util.pos_embed import get_2d_sincos_pos_embed
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
|
| 22 |
+
#timestep
|
| 23 |
+
T=4
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class multispike(torch.autograd.Function):
    """Integer-level spike quantiser with a straight-through surrogate gradient.

    Forward: round-to-nearest of the input after clamping to [0, lens]
    (lens defaults to the module-level timestep count T).
    Backward: gradient passes through unchanged wherever 0 < input < lens,
    and is zeroed outside that window.
    """

    @staticmethod
    def forward(ctx, input, lens=T):
        ctx.save_for_backward(input)
        ctx.lens = lens
        # clamp to [0, lens], then round to the nearest integer level
        clamped = torch.clamp(input, 0, lens)
        return torch.floor(clamped + 0.5)

    @staticmethod
    def backward(ctx, grad_output):
        (input,) = ctx.saved_tensors
        # Straight-through estimator: 1 inside the open interval (0, lens), else 0.
        in_range = (0 < input) & (input < ctx.lens)
        # Second return is None: no gradient w.r.t. the `lens` argument.
        return grad_output * in_range.float(), None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class Multispike(nn.Module):
    """nn.Module wrapper around :class:`multispike`.

    Applies the integer spike quantiser and rescales the result by ``norm``
    so the output lies in [0, 1] when norm equals the quantisation range.
    """

    def __init__(self, spike=multispike, norm=T):
        super().__init__()
        # `lens` mirrors `norm`; both kept for interface compatibility.
        self.lens = norm
        self.spike = spike
        self.norm = norm

    def forward(self, inputs):
        quantised = self.spike.apply(inputs)
        return quantised / self.norm
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def MS_conv_unit(in_channels, out_channels, kernel_size=1, padding=0, groups=1):
    """Build a sparse Conv2d + BatchNorm2d pair wrapped for (T, B, ...) input.

    `layer.SeqToANNContainer` flattens the leading timestep dimension before
    applying the stateless conv/BN and restores it afterwards.
    """
    conv = encoder.SparseConv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        padding=padding,
        groups=groups,
        bias=True,
    )
    bn = encoder.SparseBatchNorm2d(out_channels)
    return nn.Sequential(layer.SeqToANNContainer(conv, bn))
|
| 62 |
+
class MS_ConvBlock(nn.Module):
    """Spiking convolutional residual block.

    Pattern: spike -> 3x3 conv (channel expansion by mlp_ratio) ->
    spike -> 3x3 conv (projection back to dim), with an identity shortcut.
    Input/output layout: (T, B, C, H, W).
    """

    def __init__(self, dim, mlp_ratio=4.0):
        super().__init__()
        # BUGFIX: mlp_ratio defaults to the float 4.0, so `dim * mlp_ratio`
        # would produce a float channel count; cast to int for the conv layers.
        hidden_dim = int(dim * mlp_ratio)

        self.neuron1 = Multispike()
        self.conv1 = MS_conv_unit(dim, hidden_dim, 3, 1)

        self.neuron2 = Multispike()
        self.conv2 = MS_conv_unit(hidden_dim, dim, 3, 1)

    def forward(self, x, mask=None):
        # `mask` is accepted for interface compatibility but unused here.
        short_cut = x
        x = self.neuron1(x)
        x = self.conv1(x)
        x = self.neuron2(x)
        x = self.conv2(x)
        x = x + short_cut
        return x
|
| 82 |
+
|
| 83 |
+
class MS_MLP(nn.Module):
    """Spiking MLP implemented with pointwise (1x1) Conv1d layers.

    Operates on token sequences laid out as (T, B, C, N): timesteps, batch,
    channels, tokens. Each stage is spike -> 1x1 conv -> BatchNorm1d.

    Args:
        in_features: input channel width C.
        hidden_features: expansion width (defaults to in_features).
        out_features: output width (defaults to in_features).
        drop: accepted for interface compatibility; unused.
        layer: accepted for interface compatibility; unused.
    """

    def __init__(
        self, in_features, hidden_features=None, out_features=None, drop=0.0, layer=0
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # fc1: expand channels; a 1x1 conv acts as a per-token linear layer.
        self.fc1_conv = nn.Conv1d(in_features, hidden_features, kernel_size=1, stride=1)
        self.fc1_bn = nn.BatchNorm1d(hidden_features)
        self.fc1_lif = Multispike()

        # fc2: project back down to the output width.
        self.fc2_conv = nn.Conv1d(
            hidden_features, out_features, kernel_size=1, stride=1
        )
        self.fc2_bn = nn.BatchNorm1d(out_features)
        self.fc2_lif = Multispike()

        self.c_hidden = hidden_features
        self.c_output = out_features

    def forward(self, x):
        T, B, C, N = x.shape

        x = self.fc1_lif(x)
        x = self.fc1_conv(x.flatten(0, 1))
        x = self.fc1_bn(x).reshape(T, B, self.c_hidden, N).contiguous()

        x = self.fc2_lif(x)
        x = self.fc2_conv(x.flatten(0, 1))
        # BUGFIX: reshape with the actual output width (was C, the input
        # width, which crashes whenever out_features != in_features).
        x = self.fc2_bn(x).reshape(T, B, self.c_output, N).contiguous()

        return x
|
| 116 |
+
|
| 117 |
+
class RepConv(nn.Module):
    """Two stacked pointwise (1x1) Conv1d + BatchNorm1d stages.

    Expands channels from in_channel to int(in_channel * 1.5), then maps to
    out_channel. The ``bias`` argument is accepted but unused: both convs are
    bias-free (BatchNorm supplies the affine shift).
    """

    def __init__(
        self,
        in_channel,
        out_channel,
        bias=False,
    ):
        super().__init__()
        # TODO in_channel-> 2*in_channel->in_channel
        mid_channel = int(in_channel * 1.5)
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channel, mid_channel, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm1d(mid_channel),
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(mid_channel, out_channel, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm1d(out_channel),
        )

    def forward(self, x):
        hidden = self.conv1(x)
        return self.conv2(hidden)
|
| 130 |
+
class RepConv2(nn.Module):
    """Two pointwise (1x1) Conv1d + BatchNorm1d stages (1.5x channel expansion).

    NOTE(review): this class is a verbatim duplicate of :class:`RepConv`;
    consider consolidating the two in a follow-up. ``bias`` is accepted but
    unused — both convs are bias-free.
    """

    def __init__(
        self,
        in_channel,
        out_channel,
        bias=False,
    ):
        super().__init__()
        # TODO in_channel-> 2*in_channel->in_channel
        expanded = int(in_channel * 1.5)
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channel, expanded, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm1d(expanded),
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(expanded, out_channel, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm1d(out_channel),
        )

    def forward(self, x):
        return self.conv2(self.conv1(x))
|
| 143 |
+
|
| 144 |
+
class MS_Attention_Conv_qkv_id(nn.Module):
    """Spiking self-attention with conv-based q/k/v projections.

    Operates on (T, B, C, N) token sequences. q, k, v are produced by
    RepConv (1x1 conv) stacks followed by Multispike activations; attention
    is computed as q @ (k^T @ v) with a fixed 0.125 scale. ``sr_ratio``
    widens the value/attention channel dimension by that factor before the
    final projection back to ``dim``.

    ``qkv_bias``, ``qk_scale``, ``attn_drop`` and ``proj_drop`` are accepted
    for interface compatibility but unused in this implementation.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
        self.dim = dim
        self.num_heads = num_heads
        # Fixed attention scale (1/8), independent of head dimension.
        self.scale = 0.125
        self.sr_ratio = sr_ratio

        self.head_lif = Multispike()

        # track 1: split convs (one projection stack per q/k/v)
        self.q_conv = nn.Sequential(RepConv(dim, dim), nn.BatchNorm1d(dim))
        self.k_conv = nn.Sequential(RepConv(dim, dim), nn.BatchNorm1d(dim))
        # v is widened by sr_ratio relative to q/k.
        self.v_conv = nn.Sequential(RepConv(dim, dim * sr_ratio), nn.BatchNorm1d(dim * sr_ratio))

        # track 2: merge (prefer) NOTE: need `chunk` in forward
        # self.qkv_conv = nn.Sequential(RepConv(dim,dim * 3), nn.BatchNorm2d(dim * 3))

        self.q_lif = Multispike()

        self.k_lif = Multispike()

        self.v_lif = Multispike()

        self.attn_lif = Multispike()

        # Project the (sr_ratio * dim)-wide attention output back to dim.
        self.proj_conv = nn.Sequential(RepConv(sr_ratio * dim, dim), nn.BatchNorm1d(dim))

    def forward(self, x):
        T, B, C, N = x.shape

        x = self.head_lif(x)

        # Fold timesteps into the batch for the Conv1d projections.
        x_for_qkv = x.flatten(0, 1)
        q_conv_out = self.q_conv(x_for_qkv).reshape(T, B, C, N)

        q_conv_out = self.q_lif(q_conv_out)

        # (T, B, C, N) -> (T, B, heads, N, C/heads)
        q = q_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                       4)

        k_conv_out = self.k_conv(x_for_qkv).reshape(T, B, C, N)

        k_conv_out = self.k_lif(k_conv_out)

        k = k_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                       4)

        v_conv_out = self.v_conv(x_for_qkv).reshape(T, B, self.sr_ratio * C, N)

        v_conv_out = self.v_lif(v_conv_out)

        # (T, B, heads, N, sr_ratio*C/heads)
        v = v_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, self.sr_ratio * C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                                       4)

        # Linear-attention ordering: compute k^T @ v first (D x D'), then q @ (.).
        x = k.transpose(-2, -1) @ v
        x = (q @ x) * self.scale
        # Back to channel-first token layout: (T, B, sr_ratio*C, N).
        x = x.transpose(3, 4).reshape(T, B, self.sr_ratio * C, N)
        x = self.attn_lif(x)

        x = self.proj_conv(x.flatten(0, 1)).reshape(T, B, C, N)
        return x
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
class MS_DownSampling(nn.Module):
    """Strided sparse-conv downsampling stage for (T, B, C, H, W) inputs.

    A Multispike activation precedes the conv on every stage except the
    first, which receives raw (non-spiking) input.
    """

    def __init__(
        self,
        in_channels=2,
        embed_dims=256,
        kernel_size=3,
        stride=2,
        padding=1,
        first_layer=True,
    ):
        super().__init__()

        self.encode_conv = encoder.SparseConv2d(
            in_channels,
            embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        self.encode_bn = encoder.SparseBatchNorm2d(embed_dims)
        self.first_layer = first_layer
        # Only non-first stages quantise their input with a spike activation.
        if not first_layer:
            self.encode_spike = Multispike()

    def forward(self, x):
        timesteps, batch = x.shape[0], x.shape[1]
        if hasattr(self, "encode_spike"):
            x = self.encode_spike(x)
        # Fold timesteps into the batch for the 2-D conv, then restore them.
        x = self.encode_conv(x.flatten(0, 1))
        out_h, out_w = x.shape[-2], x.shape[-1]
        return self.encode_bn(x).reshape(timesteps, batch, -1, out_h, out_w)
|
| 246 |
+
|
| 247 |
+
class MS_Block(nn.Module):
    """Spiking transformer block: optional RepConv branch ("base" variant),
    then attention and MLP residual branches.

    Input/output layout: (T, B, C, N). With ``finetune=True`` the residual
    branches are scaled by learnable per-channel layer-scale parameters and
    wrapped in DropPath; otherwise plain residual sums are used.
    """

    def __init__(
        self,
        dim,
        choice,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        norm_layer=nn.LayerNorm,
        sr_ratio=1, init_values=1e-6, finetune=False,
    ):
        super().__init__()
        self.model = choice
        if self.model == "base":
            # Extra depthwise-style RepConv branch; adds parameters (~83M total).
            self.rep_conv = RepConv2(dim, dim)
            self.lif = Multispike()
        self.attn = MS_Attention_Conv_qkv_id(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            sr_ratio=sr_ratio,
        )
        self.finetune = finetune
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = MS_MLP(in_features=dim, hidden_features=mlp_hidden_dim, drop=drop)

        if self.finetune:
            self.layer_scale1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.layer_scale2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)

    def forward(self, x):
        # BUGFIX: this unpack was commented out, leaving B/C/N undefined (and
        # T silently resolving to the module-level constant) in the "base"
        # branch below, which raised NameError at runtime.
        T, B, C, N = x.shape
        if self.model == "base":
            x = x + self.rep_conv(self.lif(x).flatten(0, 1)).reshape(T, B, C, N)
        # TODO: need channel-wise layer scale, init as 1e-6
        if self.finetune:
            x = x + self.drop_path(self.attn(x) * self.layer_scale1.unsqueeze(0).unsqueeze(0).unsqueeze(-1))
            x = x + self.drop_path(self.mlp(x) * self.layer_scale2.unsqueeze(0).unsqueeze(0).unsqueeze(-1))
        else:
            x = x + self.attn(x)
            x = x + self.mlp(x)
        return x
|
| 297 |
+
|
| 298 |
+
class Spikmae(nn.Module):
    """Masked autoencoder with a spiking (SDT-style) encoder and a plain ViT decoder.

    The encoder downsamples 224x224 images through sparse-conv stages
    (overall stride 16 -> 14x14 = 196 patch tokens) and a stack of MS_Block
    transformer blocks; masking is applied by zeroing masked pixels and
    informing the sparse-conv layers via ``encoder._cur_active``. The decoder
    is a standard timm ViT Block stack reconstructing pixel patches.

    Tensor layout in the encoder is (T, B, C, H, W) / (T, B, C, N) with T the
    number of timesteps (hard-coded to 1 here).
    """

    def __init__(self, T=1, choice=None,
                 img_size_h=224,
                 img_size_w=224,
                 patch_size=16,
                 # NOTE(review): mutable default list — shared across instances;
                 # safe only because it is never mutated.
                 embed_dim=[128, 256, 512],
                 num_heads=8,
                 mlp_ratios=4,
                 in_channels=3,
                 qk_scale=None,
                 drop_rate=0.0,
                 attn_drop_rate=0.0,
                 drop_path_rate=0.0,
                 num_classes=1000,
                 qkv_bias=False,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6),  # norm_layer=nn.LayerNorm shaokun
                 depths=8,
                 sr_ratios=1,
                 decoder_embed_dim=768,
                 decoder_depth=4,
                 decoder_num_heads=16,
                 mlp_ratio=4.,
                 norm_pix_loss=False, nb_classes=1000):
        super().__init__()

        self.num_classes = num_classes
        self.depths = depths
        # NOTE(review): hard-coded to 1 — the `T` constructor argument is ignored.
        self.T = 1

        # Stochastic depth decay rule: linearly increasing drop-path per block.
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, depths)
        ]

        # Stage 1a: stride-2 stem on raw pixels (no spike on the input).
        self.downsample1_1 = MS_DownSampling(
            in_channels=in_channels,
            embed_dims=embed_dim[0] // 2,
            kernel_size=7,
            stride=2,
            padding=3,
            first_layer=True,
        )

        self.ConvBlock1_1 = nn.ModuleList(
            [MS_ConvBlock(dim=embed_dim[0] // 2, mlp_ratio=mlp_ratios)]
        )

        # Stage 1b: stride-2 downsample to embed_dim[0].
        self.downsample1_2 = MS_DownSampling(
            in_channels=embed_dim[0] // 2,
            embed_dims=embed_dim[0],
            kernel_size=3,
            stride=2,
            padding=1,
            first_layer=False,
        )

        self.ConvBlock1_2 = nn.ModuleList(
            [MS_ConvBlock(dim=embed_dim[0], mlp_ratio=mlp_ratios)]
        )

        # Stage 2: stride-2 downsample to embed_dim[1], two conv blocks.
        self.downsample2 = MS_DownSampling(
            in_channels=embed_dim[0],
            embed_dims=embed_dim[1],
            kernel_size=3,
            stride=2,
            padding=1,
            first_layer=False,
        )

        self.ConvBlock2_1 = nn.ModuleList(
            [MS_ConvBlock(dim=embed_dim[1], mlp_ratio=mlp_ratios)]
        )

        self.ConvBlock2_2 = nn.ModuleList(
            [MS_ConvBlock(dim=embed_dim[1], mlp_ratio=mlp_ratios)]
        )

        # Stage 3: final stride-2 downsample (overall stride 16 -> 14x14 tokens).
        self.downsample3 = MS_DownSampling(
            in_channels=embed_dim[1],
            embed_dims=embed_dim[2],
            kernel_size=3,
            stride=2,
            padding=1,
            first_layer=False,
        )

        # Spiking transformer blocks operating on the flattened token sequence.
        self.block3 = nn.ModuleList(
            [
                MS_Block(
                    dim=embed_dim[2],
                    choice=choice,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratios,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[j],
                    norm_layer=norm_layer,
                    sr_ratio=sr_ratios,
                    finetune=False,
                )
                for j in range(depths)
            ]
        )

        self.norm = nn.BatchNorm1d(embed_dim[-1])
        # NOTE(review): attribute name is a typo for "downsample_ratio"; kept
        # as-is because it is referenced below.
        self.downsample_raito = 16

        num_patches = 196  # 14 * 14 patch grid for 224/16

        # Fixed (non-learnable) sin-cos positional embedding, channel-first.
        self.pos_embed = nn.Parameter(torch.zeros(1, embed_dim[-1], num_patches), requires_grad=False)

        ## MAE decoder vit
        self.decoder_embed = nn.Linear(embed_dim[-1], decoder_embed_dim, bias=True)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
        # Try learned decoder
        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches, decoder_embed_dim), requires_grad=False)
        self.decoder_blocks = nn.ModuleList([
            Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=False, norm_layer=norm_layer)
            for i in range(decoder_depth)])
        self.decoder_norm = norm_layer(decoder_embed_dim)
        self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size ** 2 * in_channels, bias=True)  # decoder to patch
        self.initialize_weights()

    def initialize_weights(self):
        """Fill the fixed sin-cos positional embeddings and init all weights."""
        num_patches = 196
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[1], int(num_patches ** .5),
                                            cls_token=False)

        # Encoder pos-embed is stored channel-first, hence the transpose.
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed.transpose(1, 0)).float().unsqueeze(0))

        decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1],
                                                    int(num_patches ** .5), cls_token=False)
        self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))

        torch.nn.init.normal_(self.mask_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Standard ViT-style init: trunc-normal linear weights, unit LayerNorm.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def random_masking(self, x, mask_ratio):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort random noise.
        x: [N, L, D], sequence

        Returns (ids_keep, active, ids_restore) where `active` is a (B, L)
        binary mask with 1 = kept patch, 0 = masked patch.
        """
        num_patches = 196
        T, N, _, _, _ = x.shape  # batch, length, dim
        L = num_patches
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

        # sort noise for each sample
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]

        # generate the binary mask: 0 is keep, 1 is remove
        # (computed for reference; only `active` below is returned)
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        # active is inverse mask
        active = torch.ones([N, L], device=x.device)
        active[:, len_keep:] = 0
        active = torch.gather(active, dim=1, index=ids_restore)

        return ids_keep, active, ids_restore

    def forward_encoder(self, x, mask_ratio=1.0):
        """Mask the image, run the spiking encoder, return token features.

        Returns (tokens (B, N, C), active mask (B, L), ids_restore, active_hw).
        """
        # Replicate the image across T timesteps: (B,C,H,W) -> (T,B,C,H,W).
        x = (x.unsqueeze(0)).repeat(self.T, 1, 1, 1, 1)
        # step1. Mask
        ids_keep, active, ids_restore = self.random_masking(x, mask_ratio)
        B, N = active.shape
        active_b1ff = active.reshape(B, 1, 14, 14)

        # Inform the sparse-conv layers which patches are active.
        encoder._cur_active = active_b1ff
        # Upsample the 14x14 patch mask to full pixel resolution (stride 16).
        active_hw = active_b1ff.repeat_interleave(self.downsample_raito, 2).repeat_interleave(self.downsample_raito, 3)
        active_hw = active_hw.unsqueeze(0)
        # Zero out masked pixels instead of dropping tokens.
        masked_bchw = x * active_hw
        x = masked_bchw
        x = self.downsample1_1(x)
        for blk in self.ConvBlock1_1:
            x = blk(x)
        x = self.downsample1_2(x)
        for blk in self.ConvBlock1_2:
            x = blk(x)

        x = self.downsample2(x)
        for blk in self.ConvBlock2_1:
            x = blk(x)
        for blk in self.ConvBlock2_2:
            x = blk(x)

        x = self.downsample3(x)
        x = x.flatten(3)  # (T, B, C, H, W) -> (T, B, C, N)
        for blk in self.block3:
            x = blk(x)

        # Average over timesteps, normalise, and go token-major: (B, N, C).
        x = x.mean(0)
        x = self.norm(x).transpose(-1, -2).contiguous()
        return x, active, ids_restore, active_hw

    def forward_decoder(self, x, ids_restore):
        """Project encoder tokens, add pos-embed, run ViT decoder blocks."""
        # embed tokens
        B, N, C = x.shape
        x = self.decoder_embed(x)  # B, N, C
        # append mask tokens to sequence
        # ids_restore#1,196
        # NOTE(review): the encoder keeps all 196 tokens (masking is done by
        # zeroing pixels), so this repeat count is presumably 0 here — confirm.
        mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] - x.shape[1], 1)
        x_ = torch.cat([x[:, :, :], mask_tokens], dim=1)  # no cls token
        x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))  # unshuffle
        x = x_
        #
        # add pos embed
        x = x + self.decoder_pos_embed
        # apply Transformer blocks
        for blk in self.decoder_blocks:
            x = blk(x)
        x = self.decoder_norm(x)
        x = self.decoder_pred(x)

        return x

    def patchify(self, imgs):
        """
        imgs: (N, 3, H, W)
        x: (N, L, patch_size**2 *3)
        """
        p = 16
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0

        h = w = imgs.shape[2] // p
        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
        x = torch.einsum('nchpwq->nhwpqc', x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p ** 2 * 3))
        return x

    def unpatchify(self, x):
        """
        x: (N, L, patch_size**2 *3)
        imgs: (N, 3, H, W)
        """
        p = 16
        h = w = int(x.shape[1] ** .5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, 3))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p))
        return imgs

    def forward_loss(self, imgs, pred, mask):
        """
        imgs: [N, 3, H, W]
        pred: [N, L, p*p*3]
        mask: [N, L] binary "active" mask (1 = kept, 0 = masked); the loss is
        computed only on the masked (non-active) patches.
        """

        inp, rec = self.patchify(imgs), pred  # inp and rec: (B, L = f*f, N = C*downsample_raito**2)
        # Per-patch normalisation of the target (mean/std over patch pixels).
        mean = inp.mean(dim=-1, keepdim=True)
        var = (inp.var(dim=-1, keepdim=True) + 1e-6) ** .5
        inp = (inp - mean) / var
        l2_loss = ((rec - inp) ** 2).mean(dim=2, keepdim=False)  # (B, L, C) ==mean==> (B, L)
        non_active = mask.logical_not().int().view(mask.shape[0], -1)  # (B, 1, f, f) => (B, L)
        recon_loss = l2_loss.mul_(non_active).sum() / (non_active.sum() + 1e-8)  # loss only on masked (non-active) patches
        return recon_loss, mean, var

    def forward(self, imgs, mask_ratio=0.5, vis=False):
        """Full MAE pass; returns the reconstruction loss, or (with vis=True)
        the original, masked, and reconstructed images for visualisation."""

        latent, active, ids_restore, active_hw = self.forward_encoder(imgs, mask_ratio)
        rec = self.forward_decoder(latent, ids_restore)  # [N, L, p*p*3]
        recon_loss, mean, var = self.forward_loss(imgs, rec, active)
        if vis:
            masked_bchw = imgs * active_hw.flatten(0, 1)
            # Undo the per-patch normalisation before unpatchifying.
            rec_bchw = self.unpatchify(rec * var + mean)
            # Show real pixels where kept, reconstructions where masked.
            rec_or_inp = torch.where(active_hw.flatten(0, 1).bool(), imgs, rec_bchw)
            return imgs, masked_bchw, rec_or_inp
        else:
            return recon_loss
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
def spikmae_12_512(**kwargs):
    """Spiking MAE: 12 transformer blocks, 512-dim final stage, "base" variant."""
    defaults = dict(
        T=1,
        choice="base",
        img_size_h=224,
        img_size_w=224,
        patch_size=16,
        embed_dim=[128, 256, 512],
        num_heads=8,
        mlp_ratios=4,
        in_channels=3,
        num_classes=1000,
        qkv_bias=False,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        depths=12,
        sr_ratios=1,
        decoder_embed_dim=256,
        decoder_depth=4,
        decoder_num_heads=4,
    )
    return Spikmae(**defaults, **kwargs)
|
| 611 |
+
def spikmae_12_768(**kwargs):
|
| 612 |
+
model = Spikmae(
|
| 613 |
+
T=1,
|
| 614 |
+
choice="large",
|
| 615 |
+
img_size_h=224,
|
| 616 |
+
img_size_w=224,
|
| 617 |
+
patch_size=16,
|
| 618 |
+
embed_dim=[192,384,768],
|
| 619 |
+
num_heads=8,
|
| 620 |
+
mlp_ratios=4,
|
| 621 |
+
in_channels=3,
|
| 622 |
+
num_classes=1000,
|
| 623 |
+
qkv_bias=False,
|
| 624 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
| 625 |
+
depths=12,
|
| 626 |
+
sr_ratios=1, decoder_embed_dim=256, decoder_depth=4, decoder_num_heads=4,
|
| 627 |
+
**kwargs)
|
| 628 |
+
return model
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
|
| 633 |
+
if __name__ == "__main__":
|
| 634 |
+
model = spikmae_12_768()
|
| 635 |
+
x=torch.randn(1,3,224,224)
|
| 636 |
+
loss = model(x,mask_ratio=0.50)
|
| 637 |
+
print('loss',loss)
|
| 638 |
+
torchinfo.summary(model, (1, 3, 224, 224))
|
| 639 |
+
print(f"number of params: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
|
models/__init__.py
ADDED
|
File without changes
|
models/__pycache__/MAE_SDT.cpython-311.pyc
ADDED
|
Binary file (36.2 kB). View file
|
|
|
models/__pycache__/MAE_SDT.cpython-312.pyc
ADDED
|
Binary file (32.1 kB). View file
|
|
|
models/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (201 Bytes). View file
|
|
|
models/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (212 Bytes). View file
|
|
|
models/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (153 Bytes). View file
|
|
|
models/__pycache__/encoder.cpython-311.pyc
ADDED
|
Binary file (10.6 kB). View file
|
|
|
models/__pycache__/encoder.cpython-312.pyc
ADDED
|
Binary file (9.31 kB). View file
|
|
|
models/__pycache__/metaformer.cpython-311.pyc
ADDED
|
Binary file (72.1 kB). View file
|
|
|
models/__pycache__/metaformer.cpython-312.pyc
ADDED
|
Binary file (63.8 kB). View file
|
|
|
models/__pycache__/neuron.cpython-311.pyc
ADDED
|
Binary file (78.9 kB). View file
|
|
|
models/__pycache__/neuron.cpython-312.pyc
ADDED
|
Binary file (75.7 kB). View file
|
|
|
models/__pycache__/qk_model_v1_1003.cpython-311.pyc
ADDED
|
Binary file (30 kB). View file
|
|
|
models/__pycache__/qkformer.cpython-311.pyc
ADDED
|
Binary file (31.5 kB). View file
|
|
|
models/__pycache__/qkformer.cpython-312.pyc
ADDED
|
Binary file (27.1 kB). View file
|
|
|
models/__pycache__/sd_former_v1.cpython-311.pyc
ADDED
|
Binary file (29.7 kB). View file
|
|
|
models/__pycache__/sd_former_v1.cpython-312.pyc
ADDED
|
Binary file (25.6 kB). View file
|
|
|
models/__pycache__/sdtv3.cpython-311.pyc
ADDED
|
Binary file (64.6 kB). View file
|
|
|
models/__pycache__/sdtv3.cpython-312.pyc
ADDED
|
Binary file (55.6 kB). View file
|
|
|
models/__pycache__/sdtv3.cpython-39.pyc
ADDED
|
Binary file (21 kB). View file
|
|
|
models/__pycache__/sdtv3_large.cpython-311.pyc
ADDED
|
Binary file (27.3 kB). View file
|
|
|
models/__pycache__/sdtv3_large.cpython-312.pyc
ADDED
|
Binary file (23.7 kB). View file
|
|
|
models/__pycache__/spikformer.cpython-311.pyc
ADDED
|
Binary file (28 kB). View file
|
|
|
models/__pycache__/spikformer.cpython-312.pyc
ADDED
|
Binary file (24.6 kB). View file
|
|
|
models/__pycache__/vit.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c206e15daa2f79c2abc87acf17b7e6263bb292fe86a0581f10e58b94da50c3d5
|
| 3 |
+
size 204918
|
models/__pycache__/vit.cpython-312.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:541616a16f1f3839624aff7ca6c0d0f168227ee19a642340a85e71e77d6ea63d
|
| 3 |
+
size 183274
|
models/encoder.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) ByteDance, Inc. and its affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
from timm.models.layers import DropPath
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
_cur_active: torch.Tensor = None # B1ff
|
| 13 |
+
# todo: try to use `gather` for speed?
|
| 14 |
+
def _get_active_ex_or_ii(H, W, returning_active_ex=True):
|
| 15 |
+
h_repeat, w_repeat = H // _cur_active.shape[-2], W // _cur_active.shape[-1]
|
| 16 |
+
active_ex = _cur_active.repeat_interleave(h_repeat, dim=2).repeat_interleave(w_repeat, dim=3)
|
| 17 |
+
return active_ex if returning_active_ex else active_ex.squeeze(1).nonzero(as_tuple=True) # ii: bi, hi, wi
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def sp_conv_forward(self, x: torch.Tensor):
|
| 21 |
+
x = super(type(self), self).forward(x)
|
| 22 |
+
x *= _get_active_ex_or_ii(H=x.shape[2], W=x.shape[3], returning_active_ex=True) # (BCHW) *= (B1HW), mask the output of conv
|
| 23 |
+
return x
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def sp_bn_forward(self, x: torch.Tensor):
|
| 27 |
+
ii = _get_active_ex_or_ii(H=x.shape[2], W=x.shape[3], returning_active_ex=False)
|
| 28 |
+
|
| 29 |
+
bhwc = x.permute(0, 2, 3, 1)
|
| 30 |
+
nc = bhwc[ii] # select the features on non-masked positions to form a flatten feature `nc`
|
| 31 |
+
nc = super(type(self), self).forward(nc) # use BN1d to normalize this flatten feature `nc`
|
| 32 |
+
|
| 33 |
+
bchw = torch.zeros_like(bhwc)
|
| 34 |
+
bchw[ii] = nc
|
| 35 |
+
bchw = bchw.permute(0, 3, 1, 2)
|
| 36 |
+
return bchw
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class SparseConv2d(nn.Conv2d):
|
| 40 |
+
forward = sp_conv_forward # hack: override the forward function; see `sp_conv_forward` above for more details
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class SparseMaxPooling(nn.MaxPool2d):
|
| 44 |
+
forward = sp_conv_forward # hack: override the forward function; see `sp_conv_forward` above for more details
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class SparseAvgPooling(nn.AvgPool2d):
|
| 48 |
+
forward = sp_conv_forward # hack: override the forward function; see `sp_conv_forward` above for more details
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class SparseBatchNorm2d(nn.BatchNorm1d):
|
| 52 |
+
forward = sp_bn_forward # hack: override the forward function; see `sp_bn_forward` above for more details
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class SparseSyncBatchNorm2d(nn.SyncBatchNorm):
|
| 56 |
+
forward = sp_bn_forward # hack: override the forward function; see `sp_bn_forward` above for more details
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class SparseConvNeXtLayerNorm(nn.LayerNorm):
|
| 60 |
+
r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
|
| 61 |
+
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
|
| 62 |
+
shape (batch_size, height, width, channels) while channels_first corresponds to inputs
|
| 63 |
+
with shape (batch_size, channels, height, width).
|
| 64 |
+
"""
|
| 65 |
+
|
| 66 |
+
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last", sparse=True):
|
| 67 |
+
if data_format not in ["channels_last", "channels_first"]:
|
| 68 |
+
raise NotImplementedError
|
| 69 |
+
super().__init__(normalized_shape, eps, elementwise_affine=True)
|
| 70 |
+
self.data_format = data_format
|
| 71 |
+
self.sparse = sparse
|
| 72 |
+
|
| 73 |
+
def forward(self, x):
|
| 74 |
+
if x.ndim == 4: # BHWC or BCHW
|
| 75 |
+
if self.data_format == "channels_last": # BHWC
|
| 76 |
+
if self.sparse:
|
| 77 |
+
ii = _get_active_ex_or_ii(H=x.shape[1], W=x.shape[2], returning_active_ex=False)
|
| 78 |
+
nc = x[ii]
|
| 79 |
+
nc = super(SparseConvNeXtLayerNorm, self).forward(nc)
|
| 80 |
+
|
| 81 |
+
x = torch.zeros_like(x)
|
| 82 |
+
x[ii] = nc
|
| 83 |
+
return x
|
| 84 |
+
else:
|
| 85 |
+
return super(SparseConvNeXtLayerNorm, self).forward(x)
|
| 86 |
+
else: # channels_first, BCHW
|
| 87 |
+
if self.sparse:
|
| 88 |
+
ii = _get_active_ex_or_ii(H=x.shape[2], W=x.shape[3], returning_active_ex=False)
|
| 89 |
+
bhwc = x.permute(0, 2, 3, 1)
|
| 90 |
+
nc = bhwc[ii]
|
| 91 |
+
nc = super(SparseConvNeXtLayerNorm, self).forward(nc)
|
| 92 |
+
|
| 93 |
+
x = torch.zeros_like(bhwc)
|
| 94 |
+
x[ii] = nc
|
| 95 |
+
return x.permute(0, 3, 1, 2)
|
| 96 |
+
else:
|
| 97 |
+
u = x.mean(1, keepdim=True)
|
| 98 |
+
s = (x - u).pow(2).mean(1, keepdim=True)
|
| 99 |
+
x = (x - u) / torch.sqrt(s + self.eps)
|
| 100 |
+
x = self.weight[:, None, None] * x + self.bias[:, None, None]
|
| 101 |
+
return x
|
| 102 |
+
else: # BLC or BC
|
| 103 |
+
if self.sparse:
|
| 104 |
+
raise NotImplementedError
|
| 105 |
+
else:
|
| 106 |
+
return super(SparseConvNeXtLayerNorm, self).forward(x)
|
| 107 |
+
|
| 108 |
+
def __repr__(self):
|
| 109 |
+
return super(SparseConvNeXtLayerNorm, self).__repr__()[:-1] + f', ch={self.data_format.split("_")[-1]}, sp={self.sparse})'
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class SparseConvNeXtBlock(nn.Module):
|
| 113 |
+
r""" ConvNeXt Block. There are two equivalent implementations:
|
| 114 |
+
(1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
|
| 115 |
+
(2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
|
| 116 |
+
We use (2) as we find it slightly faster in PyTorch
|
| 117 |
+
|
| 118 |
+
Args:
|
| 119 |
+
dim (int): Number of input channels.
|
| 120 |
+
drop_path (float): Stochastic depth rate. Default: 0.0
|
| 121 |
+
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6, sparse=True, ks=7):
|
| 125 |
+
super().__init__()
|
| 126 |
+
self.dwconv = nn.Conv2d(dim, dim, kernel_size=ks, padding=ks//2, groups=dim) # depthwise conv
|
| 127 |
+
self.norm = SparseConvNeXtLayerNorm(dim, eps=1e-6, sparse=sparse)
|
| 128 |
+
self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
|
| 129 |
+
self.act = nn.GELU()
|
| 130 |
+
self.pwconv2 = nn.Linear(4 * dim, dim)
|
| 131 |
+
self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
|
| 132 |
+
requires_grad=True) if layer_scale_init_value > 0 else None
|
| 133 |
+
self.drop_path: nn.Module = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
| 134 |
+
self.sparse = sparse
|
| 135 |
+
|
| 136 |
+
def forward(self, x):
|
| 137 |
+
input = x
|
| 138 |
+
x = self.dwconv(x)
|
| 139 |
+
x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
|
| 140 |
+
x = self.norm(x)
|
| 141 |
+
x = self.pwconv1(x)
|
| 142 |
+
x = self.act(x) # GELU(0) == (0), so there is no need to mask x (no need to `x *= _get_active_ex_or_ii`)
|
| 143 |
+
x = self.pwconv2(x)
|
| 144 |
+
if self.gamma is not None:
|
| 145 |
+
x = self.gamma * x
|
| 146 |
+
x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
|
| 147 |
+
|
| 148 |
+
if self.sparse:
|
| 149 |
+
x *= _get_active_ex_or_ii(H=x.shape[2], W=x.shape[3], returning_active_ex=True)
|
| 150 |
+
|
| 151 |
+
x = input + self.drop_path(x)
|
| 152 |
+
return x
|
| 153 |
+
|
| 154 |
+
def __repr__(self):
|
| 155 |
+
return super(SparseConvNeXtBlock, self).__repr__()[:-1] + f', sp={self.sparse})'
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
|
models/metaformer.py
ADDED
|
@@ -0,0 +1,1538 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2022 Garena Online Private Limited
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
MetaFormer baselines including IdentityFormer, RandFormer, PoolFormerV2,
|
| 17 |
+
ConvFormer and CAFormer.
|
| 18 |
+
Some implementations are modified from timm (https://github.com/rwightman/pytorch-image-models).
|
| 19 |
+
"""
|
| 20 |
+
from functools import partial
|
| 21 |
+
import torch
|
| 22 |
+
import torch.nn as nn
|
| 23 |
+
import torch.nn.functional as F
|
| 24 |
+
|
| 25 |
+
from timm.layers import trunc_normal_, DropPath
|
| 26 |
+
from timm.models.registry import register_model
|
| 27 |
+
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
|
| 28 |
+
from timm.layers.helpers import to_2tuple
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _cfg(url='', **kwargs):
|
| 32 |
+
return {
|
| 33 |
+
'url': url,
|
| 34 |
+
'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
|
| 35 |
+
'crop_pct': 1.0, 'interpolation': 'bicubic',
|
| 36 |
+
'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'classifier': 'head',
|
| 37 |
+
**kwargs
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
default_cfgs = {
|
| 42 |
+
'identityformer_s12': _cfg(
|
| 43 |
+
url='https://huggingface.co/sail/dl/resolve/main/identityformer/identityformer_s12.pth'),
|
| 44 |
+
'identityformer_s24': _cfg(
|
| 45 |
+
url='https://huggingface.co/sail/dl/resolve/main/identityformer/identityformer_s24.pth'),
|
| 46 |
+
'identityformer_s36': _cfg(
|
| 47 |
+
url='https://huggingface.co/sail/dl/resolve/main/identityformer/identityformer_s36.pth'),
|
| 48 |
+
'identityformer_m36': _cfg(
|
| 49 |
+
url='https://huggingface.co/sail/dl/resolve/main/identityformer/identityformer_m36.pth'),
|
| 50 |
+
'identityformer_m48': _cfg(
|
| 51 |
+
url='https://huggingface.co/sail/dl/resolve/main/identityformer/identityformer_m48.pth'),
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
'randformer_s12': _cfg(
|
| 55 |
+
url='https://huggingface.co/sail/dl/resolve/main/randformer/randformer_s12.pth'),
|
| 56 |
+
'randformer_s24': _cfg(
|
| 57 |
+
url='https://huggingface.co/sail/dl/resolve/main/randformer/randformer_s24.pth'),
|
| 58 |
+
'randformer_s36': _cfg(
|
| 59 |
+
url='https://huggingface.co/sail/dl/resolve/main/randformer/randformer_s36.pth'),
|
| 60 |
+
'randformer_m36': _cfg(
|
| 61 |
+
url='https://huggingface.co/sail/dl/resolve/main/randformer/randformer_m36.pth'),
|
| 62 |
+
'randformer_m48': _cfg(
|
| 63 |
+
url='https://huggingface.co/sail/dl/resolve/main/randformer/randformer_m48.pth'),
|
| 64 |
+
|
| 65 |
+
'poolformerv2_s12': _cfg(
|
| 66 |
+
url='https://huggingface.co/sail/dl/resolve/main/poolformerv2/poolformerv2_s12.pth'),
|
| 67 |
+
'poolformerv2_s24': _cfg(
|
| 68 |
+
url='https://huggingface.co/sail/dl/resolve/main/poolformerv2/poolformerv2_s24.pth'),
|
| 69 |
+
'poolformerv2_s36': _cfg(
|
| 70 |
+
url='https://huggingface.co/sail/dl/resolve/main/poolformerv2/poolformerv2_s36.pth'),
|
| 71 |
+
'poolformerv2_m36': _cfg(
|
| 72 |
+
url='https://huggingface.co/sail/dl/resolve/main/poolformerv2/poolformerv2_m36.pth'),
|
| 73 |
+
'poolformerv2_m48': _cfg(
|
| 74 |
+
url='https://huggingface.co/sail/dl/resolve/main/poolformerv2/poolformerv2_m48.pth'),
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
'convformer_s18': _cfg(
|
| 79 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18.pth'),
|
| 80 |
+
'convformer_s18_384': _cfg(
|
| 81 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18_384.pth',
|
| 82 |
+
input_size=(3, 384, 384)),
|
| 83 |
+
'convformer_s18_in21ft1k': _cfg(
|
| 84 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18_in21ft1k.pth'),
|
| 85 |
+
'convformer_s18_384_in21ft1k': _cfg(
|
| 86 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18_384_in21ft1k.pth',
|
| 87 |
+
input_size=(3, 384, 384)),
|
| 88 |
+
'convformer_s18_in21k': _cfg(
|
| 89 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18_in21k.pth',
|
| 90 |
+
num_classes=21841),
|
| 91 |
+
|
| 92 |
+
'convformer_s36': _cfg(
|
| 93 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s36.pth'),
|
| 94 |
+
'convformer_s36_384': _cfg(
|
| 95 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s36_384.pth',
|
| 96 |
+
input_size=(3, 384, 384)),
|
| 97 |
+
'convformer_s36_in21ft1k': _cfg(
|
| 98 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s36_in21ft1k.pth'),
|
| 99 |
+
'convformer_s36_384_in21ft1k': _cfg(
|
| 100 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s36_384_in21ft1k.pth',
|
| 101 |
+
input_size=(3, 384, 384)),
|
| 102 |
+
'convformer_s36_in21k': _cfg(
|
| 103 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s36_in21k.pth',
|
| 104 |
+
num_classes=21841),
|
| 105 |
+
|
| 106 |
+
'convformer_m36': _cfg(
|
| 107 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_m36.pth'),
|
| 108 |
+
'convformer_m36_384': _cfg(
|
| 109 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_m36_384.pth',
|
| 110 |
+
input_size=(3, 384, 384)),
|
| 111 |
+
'convformer_m36_in21ft1k': _cfg(
|
| 112 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_m36_in21ft1k.pth'),
|
| 113 |
+
'convformer_m36_384_in21ft1k': _cfg(
|
| 114 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_m36_384_in21ft1k.pth',
|
| 115 |
+
input_size=(3, 384, 384)),
|
| 116 |
+
'convformer_m36_in21k': _cfg(
|
| 117 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_m36_in21k.pth',
|
| 118 |
+
num_classes=21841),
|
| 119 |
+
|
| 120 |
+
'convformer_b36': _cfg(
|
| 121 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_b36.pth'),
|
| 122 |
+
'convformer_b36_384': _cfg(
|
| 123 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_b36_384.pth',
|
| 124 |
+
input_size=(3, 384, 384)),
|
| 125 |
+
'convformer_b36_in21ft1k': _cfg(
|
| 126 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_b36_in21ft1k.pth'),
|
| 127 |
+
'convformer_b36_384_in21ft1k': _cfg(
|
| 128 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_b36_384_in21ft1k.pth',
|
| 129 |
+
input_size=(3, 384, 384)),
|
| 130 |
+
'convformer_b36_in21k': _cfg(
|
| 131 |
+
url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_b36_in21k.pth',
|
| 132 |
+
num_classes=21841),
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
'caformer_s18': _cfg(
|
| 136 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s18.pth'),
|
| 137 |
+
'caformer_s18_384': _cfg(
|
| 138 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s18_384.pth',
|
| 139 |
+
input_size=(3, 384, 384)),
|
| 140 |
+
'caformer_s18_in21ft1k': _cfg(
|
| 141 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s18_in21ft1k.pth'),
|
| 142 |
+
'caformer_s18_384_in21ft1k': _cfg(
|
| 143 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s18_384_in21ft1k.pth',
|
| 144 |
+
input_size=(3, 384, 384)),
|
| 145 |
+
'caformer_s18_in21k': _cfg(
|
| 146 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s18_in21k.pth',
|
| 147 |
+
num_classes=21841),
|
| 148 |
+
|
| 149 |
+
'caformer_s36': _cfg(
|
| 150 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s36.pth'),
|
| 151 |
+
'caformer_s36_384': _cfg(
|
| 152 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s36_384.pth',
|
| 153 |
+
input_size=(3, 384, 384)),
|
| 154 |
+
'caformer_s36_in21ft1k': _cfg(
|
| 155 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s36_in21ft1k.pth'),
|
| 156 |
+
'caformer_s36_384_in21ft1k': _cfg(
|
| 157 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s36_384_in21ft1k.pth',
|
| 158 |
+
input_size=(3, 384, 384)),
|
| 159 |
+
'caformer_s36_in21k': _cfg(
|
| 160 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_s36_in21k.pth',
|
| 161 |
+
num_classes=21841),
|
| 162 |
+
|
| 163 |
+
'caformer_m36': _cfg(
|
| 164 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_m36.pth'),
|
| 165 |
+
'caformer_m36_384': _cfg(
|
| 166 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_m36_384.pth',
|
| 167 |
+
input_size=(3, 384, 384)),
|
| 168 |
+
'caformer_m36_in21ft1k': _cfg(
|
| 169 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_m36_in21ft1k.pth'),
|
| 170 |
+
'caformer_m36_384_in21ft1k': _cfg(
|
| 171 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_m36_384_in21ft1k.pth',
|
| 172 |
+
input_size=(3, 384, 384)),
|
| 173 |
+
'caformer_m36_in21k': _cfg(
|
| 174 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_m36_in21k.pth',
|
| 175 |
+
num_classes=21841),
|
| 176 |
+
|
| 177 |
+
'caformer_b36': _cfg(
|
| 178 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_b36.pth'),
|
| 179 |
+
'caformer_b36_384': _cfg(
|
| 180 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_b36_384.pth',
|
| 181 |
+
input_size=(3, 384, 384)),
|
| 182 |
+
'caformer_b36_in21ft1k': _cfg(
|
| 183 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_b36_in21ft1k.pth'),
|
| 184 |
+
'caformer_b36_384_in21ft1k': _cfg(
|
| 185 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_b36_384_in21ft1k.pth',
|
| 186 |
+
input_size=(3, 384, 384)),
|
| 187 |
+
'caformer_b36_in21k': _cfg(
|
| 188 |
+
url='https://huggingface.co/sail/dl/resolve/main/caformer/caformer_b36_in21k.pth',
|
| 189 |
+
num_classes=21841),
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class Downsampling(nn.Module):
    """
    Downsampling implemented by a single convolution layer, with optional
    normalization before and after the conv. Output is channels-last.
    """
    def __init__(self, in_channels, out_channels,
                 kernel_size, stride=1, padding=0,
                 pre_norm=None, post_norm=None, pre_permute=False):
        super().__init__()
        # Norm factories are optional; fall back to identity when unset.
        self.pre_norm = pre_norm(in_channels) if pre_norm else nn.Identity()
        self.pre_permute = pre_permute
        self.conv = nn.Conv2d(in_channels, out_channels,
                              kernel_size=kernel_size, stride=stride,
                              padding=padding)
        self.post_norm = post_norm(out_channels) if post_norm else nn.Identity()

    def forward(self, x):
        out = self.pre_norm(x)
        if self.pre_permute:
            # Input arrives as [B, H, W, C]; conv wants [B, C, H, W].
            out = out.permute(0, 3, 1, 2)
        out = self.conv(out)
        # Hand the result back in channels-last layout: [B, H, W, C].
        out = out.permute(0, 2, 3, 1)
        return self.post_norm(out)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
class Scale(nn.Module):
    """
    Elementwise scaling of the last dimension by a (possibly trainable) vector.
    """
    def __init__(self, dim, init_value=1.0, trainable=True):
        super().__init__()
        weight = torch.ones(dim) * init_value
        self.scale = nn.Parameter(weight, requires_grad=trainable)

    def forward(self, x):
        return self.scale * x
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
class SquaredReLU(nn.Module):
    """
    Squared ReLU activation: relu(x) ** 2.
    See https://arxiv.org/abs/2109.08668.
    """
    def __init__(self, inplace=False):
        super().__init__()
        self.relu = nn.ReLU(inplace=inplace)

    def forward(self, x):
        rectified = self.relu(x)
        return rectified * rectified
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
class StarReLU(nn.Module):
    """
    StarReLU activation: s * relu(x) ** 2 + b, where the scalar scale s and
    bias b are (optionally) learnable.
    """
    def __init__(self, scale_value=1.0, bias_value=0.0,
                 scale_learnable=True, bias_learnable=True,
                 mode=None, inplace=False):
        super().__init__()
        # `inplace` is stored for API parity; the flag lives on the ReLU.
        self.inplace = inplace
        self.relu = nn.ReLU(inplace=inplace)
        self.scale = nn.Parameter(torch.ones(1) * scale_value,
                                  requires_grad=scale_learnable)
        self.bias = nn.Parameter(torch.ones(1) * bias_value,
                                 requires_grad=bias_learnable)

    def forward(self, x):
        rectified = self.relu(x)
        return self.scale * rectified * rectified + self.bias
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
class Attention(nn.Module):
    """
    Vanilla self-attention from Transformer (https://arxiv.org/abs/1706.03762)
    operating directly on [B, H, W, C] feature maps. Modified from timm.
    """
    def __init__(self, dim, head_dim=32, num_heads=None, qkv_bias=False,
                 attn_drop=0., proj_drop=0., proj_bias=False, **kwargs):
        super().__init__()

        self.head_dim = head_dim
        self.scale = head_dim ** -0.5

        # Derive the head count from `dim` unless given; guarantee >= 1 head.
        self.num_heads = num_heads if num_heads else dim // head_dim
        if self.num_heads == 0:
            self.num_heads = 1

        self.attention_dim = self.num_heads * self.head_dim

        self.qkv = nn.Linear(dim, self.attention_dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(self.attention_dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, H, W, C = x.shape
        N = H * W
        # Project and split into per-head query/key/value tensors.
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)

        scores = q @ k.transpose(-2, -1)
        weights = (scores * self.scale).softmax(dim=-1)
        weights = self.attn_drop(weights)

        out = (weights @ v).transpose(1, 2).reshape(B, H, W, self.attention_dim)
        out = self.proj(out)
        return self.proj_drop(out)
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
class RandomMixing(nn.Module):
    """
    Token mixing with a frozen random matrix; the row-wise softmax makes the
    matrix row-stochastic. Requires H * W == num_tokens at call time.
    """
    def __init__(self, num_tokens=196, **kwargs):
        super().__init__()
        self.random_matrix = nn.parameter.Parameter(
            data=torch.softmax(torch.rand(num_tokens, num_tokens), dim=-1),
            requires_grad=False)

    def forward(self, x):
        B, H, W, C = x.shape
        tokens = x.reshape(B, H * W, C)
        mixed = torch.einsum('mn, bnc -> bmc', self.random_matrix, tokens)
        return mixed.reshape(B, H, W, C)
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
class LayerNormGeneral(nn.Module):
    r""" General LayerNorm for different situations.

    Args:
        affine_shape (int, list or tuple): Shape of the affine weight/bias.
            Unlike torch.nn.LayerNorm (where it equals normalized_dim),
            this can be specified independently to adapt to input layout.
        normalized_dim (tuple or list): Dims over which mean/variance are taken.
        scale (bool): Whether to apply a learnable scale.
        bias (bool): Whether to apply a learnable bias.

    Examples:
        LayerNorm (https://arxiv.org/abs/1607.06450):
            (B, *, C) input: affine_shape=C, normalized_dim=(-1,);
            (B, C, H, W) input: affine_shape=(C, 1, 1), normalized_dim=(1,).
        Modified LayerNorm (https://arxiv.org/abs/2111.11418), identical to
        partial(torch.nn.GroupNorm, num_groups=1):
            (B, N, C): affine_shape=C, normalized_dim=(1, 2);
            (B, H, W, C): affine_shape=C, normalized_dim=(1, 2, 3);
            (B, C, H, W): affine_shape=(C, 1, 1), normalized_dim=(1, 2, 3).

    For the MetaFormer baselines: IdentityFormer, RandFormer and PoolFormerV2
    use Modified LayerNorm without bias; ConvFormer and CAFormer use LayerNorm
    without bias (bias=False).
    """
    def __init__(self, affine_shape=None, normalized_dim=(-1, ), scale=True,
                 bias=True, eps=1e-5):
        super().__init__()
        self.normalized_dim = normalized_dim
        self.use_scale = scale
        self.use_bias = bias
        self.weight = nn.Parameter(torch.ones(affine_shape)) if scale else None
        self.bias = nn.Parameter(torch.zeros(affine_shape)) if bias else None
        self.eps = eps

    def forward(self, x):
        # Center, then divide by the (biased) standard deviation.
        centered = x - x.mean(self.normalized_dim, keepdim=True)
        variance = centered.pow(2).mean(self.normalized_dim, keepdim=True)
        out = centered / torch.sqrt(variance + self.eps)
        if self.use_scale:
            out = out * self.weight
        if self.use_bias:
            out = out + self.bias
        return out
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
class LayerNormWithoutBias(nn.Module):
    """
    Equivalent to partial(LayerNormGeneral, bias=False) but faster, because it
    dispatches to the optimized F.layer_norm kernel.
    """
    def __init__(self, normalized_shape, eps=1e-5, **kwargs):
        super().__init__()
        self.eps = eps
        self.bias = None  # no bias by design
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape,)
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.normalized_shape = normalized_shape

    def forward(self, x):
        return F.layer_norm(x, self.normalized_shape,
                            weight=self.weight, bias=self.bias, eps=self.eps)
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
class SepConv(nn.Module):
    r"""
    Inverted separable convolution from MobileNetV2
    (https://arxiv.org/abs/1801.04381): pointwise expand -> depthwise conv ->
    pointwise project, on channels-last [B, H, W, C] input.
    """
    def __init__(self, dim, expansion_ratio=2,
                 act1_layer=StarReLU, act2_layer=nn.Identity,
                 bias=False, kernel_size=7, padding=3,
                 **kwargs, ):
        super().__init__()
        med_channels = int(expansion_ratio * dim)
        self.pwconv1 = nn.Linear(dim, med_channels, bias=bias)
        self.act1 = act1_layer()
        # Depthwise conv (groups == channels) does the spatial token mixing.
        self.dwconv = nn.Conv2d(
            med_channels, med_channels, kernel_size=kernel_size,
            padding=padding, groups=med_channels, bias=bias)
        self.act2 = act2_layer()
        self.pwconv2 = nn.Linear(med_channels, dim, bias=bias)

    def forward(self, x):
        out = self.act1(self.pwconv1(x))
        out = out.permute(0, 3, 1, 2)   # [B, H, W, C] -> [B, C, H, W]
        out = self.dwconv(out)
        out = out.permute(0, 2, 3, 1)   # back to channels-last
        return self.pwconv2(self.act2(out))
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
class Pooling(nn.Module):
    """
    PoolFormer token mixer (https://arxiv.org/abs/2111.11418): average
    pooling minus the identity, adapted for [B, H, W, C] input.
    """
    def __init__(self, pool_size=3, **kwargs):
        super().__init__()
        self.pool = nn.AvgPool2d(
            pool_size, stride=1, padding=pool_size // 2,
            count_include_pad=False)

    def forward(self, x):
        # Pool in channels-first layout, then subtract the input itself.
        pooled = self.pool(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
        return pooled - x
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
class Mlp(nn.Module):
    """
    MLP as used in MetaFormer models (Transformer, MLP-Mixer, PoolFormer,
    MetaFormer baselines and related networks). Mostly copied from timm.
    """
    def __init__(self, dim, mlp_ratio=4, out_features=None, act_layer=StarReLU, drop=0., bias=False, **kwargs):
        super().__init__()
        in_features = dim
        out_features = out_features or in_features
        hidden_features = int(mlp_ratio * in_features)
        # Separate dropout probabilities for the two linear layers.
        drop_probs = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
class MlpHead(nn.Module):
    """
    MLP classification head: expand -> activation -> norm -> dropout -> logits.
    """
    def __init__(self, dim, num_classes=1000, mlp_ratio=4, act_layer=SquaredReLU,
                 norm_layer=nn.LayerNorm, head_dropout=0., bias=True):
        super().__init__()
        hidden_features = int(mlp_ratio * dim)
        self.fc1 = nn.Linear(dim, hidden_features, bias=bias)
        self.act = act_layer()
        self.norm = norm_layer(hidden_features)
        self.fc2 = nn.Linear(hidden_features, num_classes, bias=bias)
        self.head_dropout = nn.Dropout(head_dropout)

    def forward(self, x):
        hidden = self.norm(self.act(self.fc1(x)))
        return self.fc2(self.head_dropout(hidden))
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
class MetaFormerBlock(nn.Module):
    """
    One MetaFormer block: a token-mixing sub-block followed by a channel-MLP
    sub-block, each wrapped with pre-norm, drop-path and optional layer-scale
    / residual-scale branches.
    """
    def __init__(self, dim,
                 token_mixer=nn.Identity, mlp=Mlp,
                 norm_layer=nn.LayerNorm,
                 drop=0., drop_path=0.,
                 layer_scale_init_value=None, res_scale_init_value=None
                 ):
        super().__init__()

        def make_drop_path():
            # Stochastic depth only when a positive rate is requested.
            return DropPath(drop_path) if drop_path > 0. else nn.Identity()

        def make_scale(init_value):
            # Scale branch only when an init value is provided.
            return Scale(dim=dim, init_value=init_value) if init_value else nn.Identity()

        # Token-mixing sub-block.
        self.norm1 = norm_layer(dim)
        self.token_mixer = token_mixer(dim=dim, drop=drop)
        self.drop_path1 = make_drop_path()
        self.layer_scale1 = make_scale(layer_scale_init_value)
        self.res_scale1 = make_scale(res_scale_init_value)

        # Channel-mixing (MLP) sub-block.
        self.norm2 = norm_layer(dim)
        self.mlp = mlp(dim=dim, drop=drop)
        self.drop_path2 = make_drop_path()
        self.layer_scale2 = make_scale(layer_scale_init_value)
        self.res_scale2 = make_scale(res_scale_init_value)

    def forward(self, x):
        mixed = self.token_mixer(self.norm1(x))
        x = self.res_scale1(x) + self.layer_scale1(self.drop_path1(mixed))
        expanded = self.mlp(self.norm2(x))
        x = self.res_scale2(x) + self.layer_scale2(self.drop_path2(expanded))
        return x
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
r"""
|
| 525 |
+
downsampling (stem) for the first stage is a layer of conv with k7, s4 and p2
|
| 526 |
+
downsamplings for the last 3 stages is a layer of conv with k3, s2 and p1
|
| 527 |
+
DOWNSAMPLE_LAYERS_FOUR_STAGES format: [Downsampling, Downsampling, Downsampling, Downsampling]
|
| 528 |
+
use `partial` to specify some arguments
|
| 529 |
+
"""
|
| 530 |
+
DOWNSAMPLE_LAYERS_FOUR_STAGES = [partial(Downsampling,
|
| 531 |
+
kernel_size=7, stride=4, padding=2,
|
| 532 |
+
post_norm=partial(LayerNormGeneral, bias=False, eps=1e-6)
|
| 533 |
+
)] + \
|
| 534 |
+
[partial(Downsampling,
|
| 535 |
+
kernel_size=3, stride=2, padding=1,
|
| 536 |
+
pre_norm=partial(LayerNormGeneral, bias=False, eps=1e-6), pre_permute=True
|
| 537 |
+
)]*3
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
class MetaFormer(nn.Module):
    r""" MetaFormer
        A PyTorch impl of : `MetaFormer Baselines for Vision` -
          https://arxiv.org/abs/2210.13452

    Args:
        in_chans (int): Number of input image channels. Default: 3.
        num_classes (int): Number of classes for classification head. Default: 1000.
        depths (list or tuple): Number of blocks at each stage. Default: [2, 2, 6, 2].
        dims (int): Feature dimension at each stage. Default: [64, 128, 320, 512].
        downsample_layers: (list or tuple): Downsampling layers before each stage.
        token_mixers (list, tuple or token_fcn): Token mixer for each stage. Default: nn.Identity.
        mlps (list, tuple or mlp_fcn): Mlp for each stage. Default: Mlp.
        norm_layers (list, tuple or norm_fcn): Norm layers for each stage. Default: partial(LayerNormGeneral, eps=1e-6, bias=False).
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        head_dropout (float): dropout for MLP classifier. Default: 0.
        layer_scale_init_values (list, tuple, float or None): Init value for Layer Scale. Default: None.
            None means not use the layer scale. Form: https://arxiv.org/abs/2103.17239.
        res_scale_init_values (list, tuple, float or None): Init value for Layer Scale. Default: [None, None, 1.0, 1.0].
            None means not use the layer scale. From: https://arxiv.org/abs/2110.09456.
        output_norm: norm before classifier head. Default: partial(nn.LayerNorm, eps=1e-6).
        head_fn: classification head. Default: nn.Linear.
    """
    def __init__(self, in_chans=3, num_classes=1000,
                 depths=[2, 2, 6, 2],
                 dims=[64, 128, 320, 512],
                 downsample_layers=DOWNSAMPLE_LAYERS_FOUR_STAGES,
                 token_mixers=nn.Identity,
                 mlps=Mlp,
                 norm_layers=partial(LayerNormWithoutBias, eps=1e-6), # partial(LayerNormGeneral, eps=1e-6, bias=False),
                 drop_path_rate=0.,
                 head_dropout=0.0,
                 layer_scale_init_values=None,
                 res_scale_init_values=[None, None, 1.0, 1.0],
                 output_norm=partial(nn.LayerNorm, eps=1e-6),
                 head_fn=nn.Linear,
                 **kwargs,
                 ):
        super().__init__()
        self.num_classes = num_classes

        # Scalar args are promoted to one-element lists: a single-stage model.
        if not isinstance(depths, (list, tuple)):
            depths = [depths] # it means the model has only one stage
        if not isinstance(dims, (list, tuple)):
            dims = [dims]

        num_stage = len(depths)
        self.num_stage = num_stage

        # A single downsample-layer factory is broadcast to every stage.
        if not isinstance(downsample_layers, (list, tuple)):
            downsample_layers = [downsample_layers] * num_stage
        # Channel progression: in_chans -> dims[0] -> ... -> dims[-1].
        down_dims = [in_chans] + dims
        self.downsample_layers = nn.ModuleList(
            [downsample_layers[i](down_dims[i], down_dims[i+1]) for i in range(num_stage)]
        )

        # Likewise broadcast per-stage factories when a single one is given.
        if not isinstance(token_mixers, (list, tuple)):
            token_mixers = [token_mixers] * num_stage

        if not isinstance(mlps, (list, tuple)):
            mlps = [mlps] * num_stage

        if not isinstance(norm_layers, (list, tuple)):
            norm_layers = [norm_layers] * num_stage

        # Per-block stochastic-depth rates, linearly increasing over all blocks.
        dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        if not isinstance(layer_scale_init_values, (list, tuple)):
            layer_scale_init_values = [layer_scale_init_values] * num_stage
        if not isinstance(res_scale_init_values, (list, tuple)):
            res_scale_init_values = [res_scale_init_values] * num_stage

        self.stages = nn.ModuleList() # each stage consists of multiple metaformer blocks
        cur = 0  # running block index into dp_rates
        for i in range(num_stage):
            stage = nn.Sequential(
                *[MetaFormerBlock(dim=dims[i],
                token_mixer=token_mixers[i],
                mlp=mlps[i],
                norm_layer=norm_layers[i],
                drop_path=dp_rates[cur + j],
                layer_scale_init_value=layer_scale_init_values[i],
                res_scale_init_value=res_scale_init_values[i],
                ) for j in range(depths[i])]
            )
            self.stages.append(stage)
            cur += depths[i]

        self.norm = output_norm(dims[-1])

        # head_fn is assumed to accept a head_dropout kwarg when dropout is
        # requested (MlpHead does; plain nn.Linear does not) — hence the branch.
        if head_dropout > 0.0:
            self.head = head_fn(dims[-1], num_classes, head_dropout=head_dropout)
        else:
            self.head = head_fn(dims[-1], num_classes)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Truncated-normal init for conv/linear weights; zero their biases.
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    @torch.jit.ignore
    def no_weight_decay(self):
        # NOTE(review): returns the attribute name to be excluded from weight
        # decay, presumably consumed by a timm-style optimizer setup — confirm.
        return {'norm'}

    def forward_features(self, x):
        # Each stage: downsample, then its stack of MetaFormer blocks.
        for i in range(self.num_stage):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
        # Global average pool over the spatial dims, then final norm.
        return self.norm(x.mean([1, 2])) # (B, H, W, C) -> (B, C)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
|
| 660 |
+
@register_model
def identityformer_s12(pretrained=False, **kwargs):
    """IdentityFormer-S12: identity token mixer, depths [2, 2, 6, 2]."""
    model = MetaFormer(
        depths=[2, 2, 6, 2],
        dims=[64, 128, 320, 512],
        token_mixers=nn.Identity,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['identityformer_s12']
    model.default_cfg = cfg
    if pretrained:
        # Load the released checkpoint from the hub.
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 674 |
+
|
| 675 |
+
|
| 676 |
+
@register_model
def identityformer_s24(pretrained=False, **kwargs):
    """IdentityFormer-S24: identity token mixer, depths [4, 4, 12, 4]."""
    model = MetaFormer(
        depths=[4, 4, 12, 4],
        dims=[64, 128, 320, 512],
        token_mixers=nn.Identity,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['identityformer_s24']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 690 |
+
|
| 691 |
+
|
| 692 |
+
@register_model
def identityformer_s36(pretrained=False, **kwargs):
    """IdentityFormer-S36: identity token mixer, depths [6, 6, 18, 6]."""
    model = MetaFormer(
        depths=[6, 6, 18, 6],
        dims=[64, 128, 320, 512],
        token_mixers=nn.Identity,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['identityformer_s36']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 706 |
+
|
| 707 |
+
|
| 708 |
+
@register_model
def identityformer_m36(pretrained=False, **kwargs):
    """IdentityFormer-M36: identity token mixer, wider dims [96, 192, 384, 768]."""
    model = MetaFormer(
        depths=[6, 6, 18, 6],
        dims=[96, 192, 384, 768],
        token_mixers=nn.Identity,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['identityformer_m36']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 722 |
+
|
| 723 |
+
|
| 724 |
+
@register_model
def identityformer_m48(pretrained=False, **kwargs):
    """IdentityFormer-M48: identity token mixer, depths [8, 8, 24, 8]."""
    model = MetaFormer(
        depths=[8, 8, 24, 8],
        dims=[96, 192, 384, 768],
        token_mixers=nn.Identity,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['identityformer_m48']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 738 |
+
|
| 739 |
+
|
| 740 |
+
@register_model
def randformer_s12(pretrained=False, **kwargs):
    """RandFormer-S12: random mixing in the last two stages (49 tokens in stage 4)."""
    model = MetaFormer(
        depths=[2, 2, 6, 2],
        dims=[64, 128, 320, 512],
        token_mixers=[nn.Identity, nn.Identity, RandomMixing, partial(RandomMixing, num_tokens=49)],
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['randformer_s12']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
@register_model
def randformer_s24(pretrained=False, **kwargs):
    """RandFormer-S24: random mixing in the last two stages, depths [4, 4, 12, 4]."""
    model = MetaFormer(
        depths=[4, 4, 12, 4],
        dims=[64, 128, 320, 512],
        token_mixers=[nn.Identity, nn.Identity, RandomMixing, partial(RandomMixing, num_tokens=49)],
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['randformer_s24']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 770 |
+
|
| 771 |
+
|
| 772 |
+
@register_model
def randformer_s36(pretrained=False, **kwargs):
    """RandFormer-S36: random mixing in the last two stages, depths [6, 6, 18, 6]."""
    model = MetaFormer(
        depths=[6, 6, 18, 6],
        dims=[64, 128, 320, 512],
        token_mixers=[nn.Identity, nn.Identity, RandomMixing, partial(RandomMixing, num_tokens=49)],
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['randformer_s36']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 786 |
+
|
| 787 |
+
|
| 788 |
+
@register_model
def randformer_m36(pretrained=False, **kwargs):
    """RandFormer-M36: random mixing in the last two stages, dims [96, 192, 384, 768]."""
    model = MetaFormer(
        depths=[6, 6, 18, 6],
        dims=[96, 192, 384, 768],
        token_mixers=[nn.Identity, nn.Identity, RandomMixing, partial(RandomMixing, num_tokens=49)],
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['randformer_m36']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 802 |
+
|
| 803 |
+
|
| 804 |
+
@register_model
def randformer_m48(pretrained=False, **kwargs):
    """RandFormer-M48: random mixing in the last two stages, depths [8, 8, 24, 8]."""
    model = MetaFormer(
        depths=[8, 8, 24, 8],
        dims=[96, 192, 384, 768],
        token_mixers=[nn.Identity, nn.Identity, RandomMixing, partial(RandomMixing, num_tokens=49)],
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['randformer_m48']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 818 |
+
|
| 819 |
+
|
| 820 |
+
|
| 821 |
+
@register_model
def poolformerv2_s12(pretrained=False, **kwargs):
    """PoolFormerV2-S12: average-pooling token mixer, depths [2, 2, 6, 2]."""
    model = MetaFormer(
        depths=[2, 2, 6, 2],
        dims=[64, 128, 320, 512],
        token_mixers=Pooling,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['poolformerv2_s12']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 835 |
+
|
| 836 |
+
|
| 837 |
+
@register_model
def poolformerv2_s24(pretrained=False, **kwargs):
    """PoolFormerV2-S24: average-pooling token mixer, depths [4, 4, 12, 4]."""
    model = MetaFormer(
        depths=[4, 4, 12, 4],
        dims=[64, 128, 320, 512],
        token_mixers=Pooling,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['poolformerv2_s24']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 851 |
+
|
| 852 |
+
|
| 853 |
+
@register_model
def poolformerv2_s36(pretrained=False, **kwargs):
    """PoolFormerV2-S36: average-pooling token mixer, depths [6, 6, 18, 6]."""
    model = MetaFormer(
        depths=[6, 6, 18, 6],
        dims=[64, 128, 320, 512],
        token_mixers=Pooling,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['poolformerv2_s36']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 867 |
+
|
| 868 |
+
|
| 869 |
+
@register_model
def poolformerv2_m36(pretrained=False, **kwargs):
    """PoolFormerV2-M36: average-pooling token mixer, dims [96, 192, 384, 768]."""
    model = MetaFormer(
        depths=[6, 6, 18, 6],
        dims=[96, 192, 384, 768],
        token_mixers=Pooling,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['poolformerv2_m36']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 883 |
+
|
| 884 |
+
|
| 885 |
+
@register_model
def poolformerv2_m48(pretrained=False, **kwargs):
    """PoolFormerV2-M48: average-pooling token mixer, depths [8, 8, 24, 8]."""
    model = MetaFormer(
        depths=[8, 8, 24, 8],
        dims=[96, 192, 384, 768],
        token_mixers=Pooling,
        norm_layers=partial(LayerNormGeneral, normalized_dim=(1, 2, 3), eps=1e-6, bias=False),
        **kwargs)
    cfg = default_cfgs['poolformerv2_m48']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 899 |
+
|
| 900 |
+
|
| 901 |
+
@register_model
def convformer_s18(pretrained=False, **kwargs):
    """ConvFormer-S18: separable-conv token mixer, MLP classification head."""
    model = MetaFormer(
        depths=[3, 3, 9, 3],
        dims=[64, 128, 320, 512],
        token_mixers=SepConv,
        head_fn=MlpHead,
        **kwargs)
    cfg = default_cfgs['convformer_s18']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 915 |
+
|
| 916 |
+
|
| 917 |
+
@register_model
def convformer_s18_384(pretrained=False, **kwargs):
    """ConvFormer-S18 fine-tuned at 384x384 resolution."""
    model = MetaFormer(
        depths=[3, 3, 9, 3],
        dims=[64, 128, 320, 512],
        token_mixers=SepConv,
        head_fn=MlpHead,
        **kwargs)
    cfg = default_cfgs['convformer_s18_384']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 931 |
+
|
| 932 |
+
|
| 933 |
+
@register_model
def convformer_s18_in21ft1k(pretrained=False, **kwargs):
    """ConvFormer-S18 pretrained on ImageNet-21k, fine-tuned on ImageNet-1k."""
    model = MetaFormer(
        depths=[3, 3, 9, 3],
        dims=[64, 128, 320, 512],
        token_mixers=SepConv,
        head_fn=MlpHead,
        **kwargs)
    cfg = default_cfgs['convformer_s18_in21ft1k']
    model.default_cfg = cfg
    if pretrained:
        weights = torch.hub.load_state_dict_from_url(
            url=cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(weights)
    return model
|
| 947 |
+
|
| 948 |
+
|
| 949 |
+
@register_model
|
| 950 |
+
def convformer_s18_384_in21ft1k(pretrained=False, **kwargs):
|
| 951 |
+
model = MetaFormer(
|
| 952 |
+
depths=[3, 3, 9, 3],
|
| 953 |
+
dims=[64, 128, 320, 512],
|
| 954 |
+
token_mixers=SepConv,
|
| 955 |
+
head_fn=MlpHead,
|
| 956 |
+
**kwargs)
|
| 957 |
+
model.default_cfg = default_cfgs['convformer_s18_384_in21ft1k']
|
| 958 |
+
if pretrained:
|
| 959 |
+
state_dict = torch.hub.load_state_dict_from_url(
|
| 960 |
+
url= model.default_cfg['url'], map_location="cpu", check_hash=True)
|
| 961 |
+
model.load_state_dict(state_dict)
|
| 962 |
+
return model
|
| 963 |
+
|
| 964 |
+
|
| 965 |
+
@register_model
|
| 966 |
+
def convformer_s18_in21k(pretrained=False, **kwargs):
|
| 967 |
+
model = MetaFormer(
|
| 968 |
+
depths=[3, 3, 9, 3],
|
| 969 |
+
dims=[64, 128, 320, 512],
|
| 970 |
+
token_mixers=SepConv,
|
| 971 |
+
head_fn=MlpHead,
|
| 972 |
+
**kwargs)
|
| 973 |
+
model.default_cfg = default_cfgs['convformer_s18_in21k']
|
| 974 |
+
if pretrained:
|
| 975 |
+
state_dict = torch.hub.load_state_dict_from_url(
|
| 976 |
+
url= model.default_cfg['url'], map_location="cpu", check_hash=True)
|
| 977 |
+
model.load_state_dict(state_dict)
|
| 978 |
+
return model
|
| 979 |
+
|
| 980 |
+
|
| 981 |
+
def _metaformer_model(cfg_key, depths, dims, token_mixers, pretrained, **kwargs):
    """Build a MetaFormer with an MlpHead classifier, attach its default
    config, and optionally load the pretrained checkpoint it points to.

    :param cfg_key: key into the module-level ``default_cfgs`` dict
    :param depths: number of blocks in each of the four stages
    :param dims: channel width of each stage
    :param token_mixers: token-mixer class, or a per-stage list of classes
    :param pretrained: when True, download ``default_cfgs[cfg_key]['url']``
        via torch.hub (hash-checked, loaded on CPU) into the model
    """
    model = MetaFormer(
        depths=depths,
        dims=dims,
        token_mixers=token_mixers,
        head_fn=MlpHead,
        **kwargs)
    model.default_cfg = default_cfgs[cfg_key]
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(
            url=model.default_cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(state_dict)
    return model


@register_model
def convformer_s36(pretrained=False, **kwargs):
    """ConvFormer-S36: SepConv token mixer, deeper 3/12/18/3 stage layout."""
    return _metaformer_model('convformer_s36', [3, 12, 18, 3],
                             [64, 128, 320, 512], SepConv, pretrained, **kwargs)


@register_model
def convformer_s36_384(pretrained=False, **kwargs):
    """ConvFormer-S36 trained at 384x384 resolution."""
    return _metaformer_model('convformer_s36_384', [3, 12, 18, 3],
                             [64, 128, 320, 512], SepConv, pretrained, **kwargs)


@register_model
def convformer_s36_in21ft1k(pretrained=False, **kwargs):
    """ConvFormer-S36, ImageNet-21k pretrained, finetuned on ImageNet-1k."""
    return _metaformer_model('convformer_s36_in21ft1k', [3, 12, 18, 3],
                             [64, 128, 320, 512], SepConv, pretrained, **kwargs)


@register_model
def convformer_s36_384_in21ft1k(pretrained=False, **kwargs):
    """ConvFormer-S36 at 384x384, ImageNet-21k pretrained, finetuned on 1k."""
    return _metaformer_model('convformer_s36_384_in21ft1k', [3, 12, 18, 3],
                             [64, 128, 320, 512], SepConv, pretrained, **kwargs)


@register_model
def convformer_s36_in21k(pretrained=False, **kwargs):
    """ConvFormer-S36 pretrained on ImageNet-21k."""
    return _metaformer_model('convformer_s36_in21k', [3, 12, 18, 3],
                             [64, 128, 320, 512], SepConv, pretrained, **kwargs)
|
| 1059 |
+
|
| 1060 |
+
|
| 1061 |
+
def _metaformer_model(cfg_key, depths, dims, token_mixers, pretrained, **kwargs):
    """Build a MetaFormer with an MlpHead classifier, attach its default
    config, and optionally load the pretrained checkpoint it points to.

    :param cfg_key: key into the module-level ``default_cfgs`` dict
    :param depths: number of blocks in each of the four stages
    :param dims: channel width of each stage
    :param token_mixers: token-mixer class, or a per-stage list of classes
    :param pretrained: when True, download ``default_cfgs[cfg_key]['url']``
        via torch.hub (hash-checked, loaded on CPU) into the model
    """
    model = MetaFormer(
        depths=depths,
        dims=dims,
        token_mixers=token_mixers,
        head_fn=MlpHead,
        **kwargs)
    model.default_cfg = default_cfgs[cfg_key]
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(
            url=model.default_cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(state_dict)
    return model


@register_model
def convformer_m36(pretrained=False, **kwargs):
    """ConvFormer-M36: SepConv token mixer, wider 96-576 channel layout."""
    return _metaformer_model('convformer_m36', [3, 12, 18, 3],
                             [96, 192, 384, 576], SepConv, pretrained, **kwargs)


@register_model
def convformer_m36_384(pretrained=False, **kwargs):
    """ConvFormer-M36 trained at 384x384 resolution."""
    return _metaformer_model('convformer_m36_384', [3, 12, 18, 3],
                             [96, 192, 384, 576], SepConv, pretrained, **kwargs)


@register_model
def convformer_m36_in21ft1k(pretrained=False, **kwargs):
    """ConvFormer-M36, ImageNet-21k pretrained, finetuned on ImageNet-1k."""
    return _metaformer_model('convformer_m36_in21ft1k', [3, 12, 18, 3],
                             [96, 192, 384, 576], SepConv, pretrained, **kwargs)


@register_model
def convformer_m36_384_in21ft1k(pretrained=False, **kwargs):
    """ConvFormer-M36 at 384x384, ImageNet-21k pretrained, finetuned on 1k."""
    return _metaformer_model('convformer_m36_384_in21ft1k', [3, 12, 18, 3],
                             [96, 192, 384, 576], SepConv, pretrained, **kwargs)


@register_model
def convformer_m36_in21k(pretrained=False, **kwargs):
    """ConvFormer-M36 pretrained on ImageNet-21k."""
    return _metaformer_model('convformer_m36_in21k', [3, 12, 18, 3],
                             [96, 192, 384, 576], SepConv, pretrained, **kwargs)
|
| 1139 |
+
|
| 1140 |
+
|
| 1141 |
+
def _metaformer_model(cfg_key, depths, dims, token_mixers, pretrained, **kwargs):
    """Build a MetaFormer with an MlpHead classifier, attach its default
    config, and optionally load the pretrained checkpoint it points to.

    :param cfg_key: key into the module-level ``default_cfgs`` dict
    :param depths: number of blocks in each of the four stages
    :param dims: channel width of each stage
    :param token_mixers: token-mixer class, or a per-stage list of classes
    :param pretrained: when True, download ``default_cfgs[cfg_key]['url']``
        via torch.hub (hash-checked, loaded on CPU) into the model
    """
    model = MetaFormer(
        depths=depths,
        dims=dims,
        token_mixers=token_mixers,
        head_fn=MlpHead,
        **kwargs)
    model.default_cfg = default_cfgs[cfg_key]
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(
            url=model.default_cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(state_dict)
    return model


@register_model
def convformer_b36(pretrained=False, **kwargs):
    """ConvFormer-B36: SepConv token mixer, widest 128-768 channel layout."""
    return _metaformer_model('convformer_b36', [3, 12, 18, 3],
                             [128, 256, 512, 768], SepConv, pretrained, **kwargs)


@register_model
def convformer_b36_384(pretrained=False, **kwargs):
    """ConvFormer-B36 trained at 384x384 resolution."""
    return _metaformer_model('convformer_b36_384', [3, 12, 18, 3],
                             [128, 256, 512, 768], SepConv, pretrained, **kwargs)


@register_model
def convformer_b36_in21ft1k(pretrained=False, **kwargs):
    """ConvFormer-B36, ImageNet-21k pretrained, finetuned on ImageNet-1k."""
    return _metaformer_model('convformer_b36_in21ft1k', [3, 12, 18, 3],
                             [128, 256, 512, 768], SepConv, pretrained, **kwargs)


@register_model
def convformer_b36_384_in21ft1k(pretrained=False, **kwargs):
    """ConvFormer-B36 at 384x384, ImageNet-21k pretrained, finetuned on 1k."""
    return _metaformer_model('convformer_b36_384_in21ft1k', [3, 12, 18, 3],
                             [128, 256, 512, 768], SepConv, pretrained, **kwargs)


@register_model
def convformer_b36_in21k(pretrained=False, **kwargs):
    """ConvFormer-B36 pretrained on ImageNet-21k."""
    return _metaformer_model('convformer_b36_in21k', [3, 12, 18, 3],
                             [128, 256, 512, 768], SepConv, pretrained, **kwargs)
|
| 1219 |
+
|
| 1220 |
+
|
| 1221 |
+
def _metaformer_model(cfg_key, depths, dims, token_mixers, pretrained, **kwargs):
    """Build a MetaFormer with an MlpHead classifier, attach its default
    config, and optionally load the pretrained checkpoint it points to.

    :param cfg_key: key into the module-level ``default_cfgs`` dict
    :param depths: number of blocks in each of the four stages
    :param dims: channel width of each stage
    :param token_mixers: token-mixer class, or a per-stage list of classes
    :param pretrained: when True, download ``default_cfgs[cfg_key]['url']``
        via torch.hub (hash-checked, loaded on CPU) into the model
    """
    model = MetaFormer(
        depths=depths,
        dims=dims,
        token_mixers=token_mixers,
        head_fn=MlpHead,
        **kwargs)
    model.default_cfg = default_cfgs[cfg_key]
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(
            url=model.default_cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(state_dict)
    return model


@register_model
def caformer_s18(pretrained=False, **kwargs):
    """CAFormer-S18: SepConv in the first two stages, Attention in the last two."""
    return _metaformer_model('caformer_s18', [3, 3, 9, 3], [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_s18_384(pretrained=False, **kwargs):
    """CAFormer-S18 trained at 384x384 resolution."""
    return _metaformer_model('caformer_s18_384', [3, 3, 9, 3], [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_s18_in21ft1k(pretrained=False, **kwargs):
    """CAFormer-S18, ImageNet-21k pretrained, finetuned on ImageNet-1k."""
    return _metaformer_model('caformer_s18_in21ft1k', [3, 3, 9, 3],
                             [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_s18_384_in21ft1k(pretrained=False, **kwargs):
    """CAFormer-S18 at 384x384, ImageNet-21k pretrained, finetuned on 1k."""
    return _metaformer_model('caformer_s18_384_in21ft1k', [3, 3, 9, 3],
                             [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_s18_in21k(pretrained=False, **kwargs):
    """CAFormer-S18 pretrained on ImageNet-21k."""
    return _metaformer_model('caformer_s18_in21k', [3, 3, 9, 3],
                             [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)
|
| 1299 |
+
|
| 1300 |
+
|
| 1301 |
+
def _metaformer_model(cfg_key, depths, dims, token_mixers, pretrained, **kwargs):
    """Build a MetaFormer with an MlpHead classifier, attach its default
    config, and optionally load the pretrained checkpoint it points to.

    :param cfg_key: key into the module-level ``default_cfgs`` dict
    :param depths: number of blocks in each of the four stages
    :param dims: channel width of each stage
    :param token_mixers: token-mixer class, or a per-stage list of classes
    :param pretrained: when True, download ``default_cfgs[cfg_key]['url']``
        via torch.hub (hash-checked, loaded on CPU) into the model
    """
    model = MetaFormer(
        depths=depths,
        dims=dims,
        token_mixers=token_mixers,
        head_fn=MlpHead,
        **kwargs)
    model.default_cfg = default_cfgs[cfg_key]
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(
            url=model.default_cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(state_dict)
    return model


@register_model
def caformer_s36(pretrained=False, **kwargs):
    """CAFormer-S36: conv/conv/attn/attn mixers, deeper 3/12/18/3 layout."""
    return _metaformer_model('caformer_s36', [3, 12, 18, 3], [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_s36_384(pretrained=False, **kwargs):
    """CAFormer-S36 trained at 384x384 resolution."""
    return _metaformer_model('caformer_s36_384', [3, 12, 18, 3],
                             [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_s36_in21ft1k(pretrained=False, **kwargs):
    """CAFormer-S36, ImageNet-21k pretrained, finetuned on ImageNet-1k."""
    return _metaformer_model('caformer_s36_in21ft1k', [3, 12, 18, 3],
                             [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_s36_384_in21ft1k(pretrained=False, **kwargs):
    """CAFormer-S36 at 384x384, ImageNet-21k pretrained, finetuned on 1k."""
    return _metaformer_model('caformer_s36_384_in21ft1k', [3, 12, 18, 3],
                             [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_s36_in21k(pretrained=False, **kwargs):
    """CAFormer-S36 pretrained on ImageNet-21k."""
    return _metaformer_model('caformer_s36_in21k', [3, 12, 18, 3],
                             [64, 128, 320, 512],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)
|
| 1379 |
+
|
| 1380 |
+
|
| 1381 |
+
def _metaformer_model(cfg_key, depths, dims, token_mixers, pretrained, **kwargs):
    """Build a MetaFormer with an MlpHead classifier, attach its default
    config, and optionally load the pretrained checkpoint it points to.

    :param cfg_key: key into the module-level ``default_cfgs`` dict
    :param depths: number of blocks in each of the four stages
    :param dims: channel width of each stage
    :param token_mixers: token-mixer class, or a per-stage list of classes
    :param pretrained: when True, download ``default_cfgs[cfg_key]['url']``
        via torch.hub (hash-checked, loaded on CPU) into the model
    """
    model = MetaFormer(
        depths=depths,
        dims=dims,
        token_mixers=token_mixers,
        head_fn=MlpHead,
        **kwargs)
    model.default_cfg = default_cfgs[cfg_key]
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(
            url=model.default_cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(state_dict)
    return model


@register_model
def caformer_m36(pretrained=False, **kwargs):
    """CAFormer-M36: conv/conv/attn/attn mixers, 96-576 channel layout."""
    return _metaformer_model('caformer_m36', [3, 12, 18, 3], [96, 192, 384, 576],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_m36_384(pretrained=False, **kwargs):
    """CAFormer-M36 trained at 384x384 resolution."""
    return _metaformer_model('caformer_m36_384', [3, 12, 18, 3],
                             [96, 192, 384, 576],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_m36_in21ft1k(pretrained=False, **kwargs):
    """CAFormer-M36, ImageNet-21k pretrained, finetuned on ImageNet-1k."""
    return _metaformer_model('caformer_m36_in21ft1k', [3, 12, 18, 3],
                             [96, 192, 384, 576],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_m36_384_in21ft1k(pretrained=False, **kwargs):
    """CAFormer-M36 at 384x384, ImageNet-21k pretrained, finetuned on 1k."""
    return _metaformer_model('caformer_m36_384_in21ft1k', [3, 12, 18, 3],
                             [96, 192, 384, 576],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_m364_in21k(pretrained=False, **kwargs):
    """CAFormer-M36 pretrained on ImageNet-21k.

    NOTE(review): the name looks like a typo for ``caformer_m36_in21k``, but
    both the function name and the ``default_cfgs`` key use 'm364'; kept
    as-is so the model registry and existing callers keep working.
    """
    return _metaformer_model('caformer_m364_in21k', [3, 12, 18, 3],
                             [96, 192, 384, 576],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)
|
| 1459 |
+
|
| 1460 |
+
|
| 1461 |
+
def _metaformer_model(cfg_key, depths, dims, token_mixers, pretrained, **kwargs):
    """Build a MetaFormer with an MlpHead classifier, attach its default
    config, and optionally load the pretrained checkpoint it points to.

    :param cfg_key: key into the module-level ``default_cfgs`` dict
    :param depths: number of blocks in each of the four stages
    :param dims: channel width of each stage
    :param token_mixers: token-mixer class, or a per-stage list of classes
    :param pretrained: when True, download ``default_cfgs[cfg_key]['url']``
        via torch.hub (hash-checked, loaded on CPU) into the model
    """
    model = MetaFormer(
        depths=depths,
        dims=dims,
        token_mixers=token_mixers,
        head_fn=MlpHead,
        **kwargs)
    model.default_cfg = default_cfgs[cfg_key]
    if pretrained:
        state_dict = torch.hub.load_state_dict_from_url(
            url=model.default_cfg['url'], map_location="cpu", check_hash=True)
        model.load_state_dict(state_dict)
    return model


@register_model
def caformer_b36(pretrained=False, **kwargs):
    """CAFormer-B36: conv/conv/attn/attn mixers, widest 128-768 layout."""
    return _metaformer_model('caformer_b36', [3, 12, 18, 3], [128, 256, 512, 768],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_b36_384(pretrained=False, **kwargs):
    """CAFormer-B36 trained at 384x384 resolution."""
    return _metaformer_model('caformer_b36_384', [3, 12, 18, 3],
                             [128, 256, 512, 768],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_b36_in21ft1k(pretrained=False, **kwargs):
    """CAFormer-B36, ImageNet-21k pretrained, finetuned on ImageNet-1k."""
    return _metaformer_model('caformer_b36_in21ft1k', [3, 12, 18, 3],
                             [128, 256, 512, 768],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_b36_384_in21ft1k(pretrained=False, **kwargs):
    """CAFormer-B36 at 384x384, ImageNet-21k pretrained, finetuned on 1k."""
    return _metaformer_model('caformer_b36_384_in21ft1k', [3, 12, 18, 3],
                             [128, 256, 512, 768],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)


@register_model
def caformer_b36_in21k(pretrained=False, **kwargs):
    """CAFormer-B36 pretrained on ImageNet-21k."""
    return _metaformer_model('caformer_b36_in21k', [3, 12, 18, 3],
                             [128, 256, 512, 768],
                             [SepConv, SepConv, Attention, Attention],
                             pretrained, **kwargs)
|
models/neuron.py
ADDED
|
@@ -0,0 +1,1587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import abstractmethod
|
| 2 |
+
from typing import Callable, overload
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from spikingjelly.clock_driven import surrogate, base, lava_exchange
|
| 6 |
+
from spikingjelly import configure
|
| 7 |
+
import math
|
| 8 |
+
import numpy as np
|
| 9 |
+
import logging
|
| 10 |
+
import cupy
|
| 11 |
+
from spikingjelly.clock_driven import neuron_kernel, cu_kernel_opt
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
import lava.lib.dl.slayer as slayer
|
| 16 |
+
|
| 17 |
+
except BaseException as e:
|
| 18 |
+
logging.info(f'spikingjelly.clock_driven.neuron: {e}')
|
| 19 |
+
slayer = None
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def check_backend(backend: str):
    """
    Validate that *backend* is a supported compute backend and that its
    optional dependency is importable.

    :param backend: one of ``'torch'``, ``'cupy'`` or ``'lava'``
    :raises NotImplementedError: if *backend* is not a known backend
    :raises AssertionError: if the backend's dependency is not installed
    """
    # 'torch' is always available; nothing to verify.
    if backend == 'torch':
        return
    if backend == 'cupy':
        assert cupy is not None, 'CuPy is not installed! You can install it from "https://github.com/cupy/cupy".'
    elif backend == 'lava':
        assert slayer is not None, 'Lava-DL is not installed! You can install it from "https://github.com/lava-nc/lava-dl".'
    else:
        raise NotImplementedError(backend)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class BaseNode(base.MemoryModule):
    def __init__(self, v_threshold: float = 1., v_reset: float = 0.,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False):
        """
        Base class of differentiable spiking neurons.

        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float

        :param v_reset: reset voltage. If not ``None``, the voltage of neurons that
            just fired is set to ``v_reset`` (hard reset); if ``None``,
            ``v_threshold`` is subtracted instead (soft reset)
        :type v_reset: float

        :param surrogate_function: surrogate function replacing the gradient of the
            spiking (Heaviside) function during back-propagation
        :type surrogate_function: Callable

        :param detach_reset: whether to detach the reset computation from the
            autograd graph
        :type detach_reset: bool
        """
        assert isinstance(v_reset, float) or v_reset is None
        assert isinstance(v_threshold, float)
        assert isinstance(detach_reset, bool)
        super().__init__()

        # 'v' is stateful memory; its initial value doubles as the reset target
        # for soft reset (0.) or hard reset (v_reset).
        if v_reset is None:
            self.register_memory('v', 0.)
        else:
            self.register_memory('v', v_reset)

        self.register_memory('v_threshold', v_threshold)
        self.register_memory('v_reset', v_reset)

        self.detach_reset = detach_reset
        self.surrogate_function = surrogate_function

    @abstractmethod
    def neuronal_charge(self, x: torch.Tensor):
        """
        Charge-step difference equation of the neuron. Sub-classes must
        implement this to update ``self.v`` from the input ``x``.
        """
        raise NotImplementedError

    def neuronal_fire(self):
        """
        Compute the output spikes from the current membrane potential and the
        threshold voltage (via the surrogate function, so it is differentiable).
        """

        return self.surrogate_function(self.v - self.v_threshold)

    def neuronal_reset(self, spike):
        """
        Reset the membrane potential according to the neurons' output spikes.
        """
        if self.detach_reset:
            spike_d = spike.detach()
        else:
            spike_d = spike

        if self.v_reset is None:
            # soft reset
            self.v = self.v - spike_d * self.v_threshold

        else:
            # hard reset
            self.v = (1. - spike_d) * self.v + spike_d * self.v_reset

    def extra_repr(self):
        return f'v_threshold={self.v_threshold}, v_reset={self.v_reset}, detach_reset={self.detach_reset}'

    def forward(self, x: torch.Tensor):
        """
        Single-step forward in the order charge -> fire -> reset.

        :param x: increment of voltage injected into the neurons
        :type x: torch.Tensor

        :return: output spikes of the neurons
        :rtype: torch.Tensor
        """
        self.neuronal_charge(x)
        spike = self.neuronal_fire()
        self.neuronal_reset(spike)
        return spike
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
class AdaptiveBaseNode(BaseNode):
    def __init__(self, v_threshold: float = 1., v_reset: float = 0.,
                 v_rest: float = 0., w_rest: float = 0., tau_w: float = 2., a: float = 0., b: float = 0.,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False):
        """
        Base class for spiking neurons with an adaptation current ``w``
        (AdEx-style dynamics).

        :param v_threshold: threshold voltage of the neurons
        :param v_reset: reset voltage; hard reset if not ``None``, soft reset otherwise
        :param v_rest: resting potential used by the subthreshold coupling term
        :param w_rest: initial (resting) value of the adaptation current ``w``
        :param tau_w: time constant of the adaptation current
        :param a: subthreshold coupling between the membrane potential and ``w``
        :param b: spike-triggered jump amplitude of ``w``
        :param surrogate_function: surrogate gradient function for the spike
        :param detach_reset: whether to detach the reset computation from the
            autograd graph
        """
        # NOTE: defaults must be floats — the original shipped ``w_rest=0`` (an
        # int), which tripped its own isinstance assertion on default construction.
        assert isinstance(w_rest, float)
        assert isinstance(v_rest, float)
        assert isinstance(tau_w, float)
        assert isinstance(a, float)
        assert isinstance(b, float)

        # BUG FIX: was ``super.__init__(...)`` (missing call parentheses), which
        # raises at runtime instead of initializing BaseNode.
        super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)

        # adaptation current is stateful memory, reset alongside ``v``
        self.register_memory('w', w_rest)

        self.w_rest = w_rest
        self.v_rest = v_rest
        self.tau_w = tau_w
        self.a = a
        self.b = b

    def neuronal_adaptation(self, spike):
        """Update the adaptation current ``w`` from the membrane potential and spikes."""
        self.w = self.w + 1. / self.tau_w * (self.a * (self.v - self.v_rest) - self.w) + self.b * spike

    def extra_repr(self):
        return super().extra_repr() + f', v_rest={self.v_rest}, w_rest={self.w_rest}, tau_w={self.tau_w}, a={self.a}, b={self.b}'

    # BUG FIX: the original decorated this with ``@typing.overload``, which turns
    # the method into a type-checking stub — calling it at runtime does not run
    # this body. This is the real implementation, so no decorator.
    def forward(self, x: torch.Tensor):
        """
        Forward in the order charge -> fire -> adapt -> reset.

        :param x: increment of voltage injected into the neurons
        :return: output spikes of the neurons
        """
        self.neuronal_charge(x)
        spike = self.neuronal_fire()
        self.neuronal_adaptation(spike)
        self.neuronal_reset(spike)
        return spike
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
class IFNode(BaseNode):
    def __init__(self, v_threshold: float = 1., v_reset: float = 0.,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False,
                 cupy_fp32_inference=False):
        """
        The Integrate-and-Fire neuron, an ideal integrator: unlike the LIF
        neuron its voltage does not decay. Subthreshold dynamics:

        .. math::
            V[t] = V[t-1] + X[t]

        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float

        :param v_reset: reset voltage; hard reset if not ``None``, soft reset otherwise
        :type v_reset: float

        :param surrogate_function: surrogate gradient function for the spike
        :type surrogate_function: Callable

        :param detach_reset: whether to detach the reset computation from the
            autograd graph
        :type detach_reset: bool

        :param cupy_fp32_inference: if ``True``, inference in ``eval`` mode with
            float32 inputs on GPU (and ``cupy`` installed) is accelerated with a
            hand-written CuPy kernel
        :type cupy_fp32_inference: bool
        """
        super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)

        if cupy_fp32_inference:
            check_backend('cupy')
        self.cupy_fp32_inference = cupy_fp32_inference

    def neuronal_charge(self, x: torch.Tensor):
        # ideal integrator: no leak term
        self.v = self.v + x

    def forward(self, x: torch.Tensor):
        if self.cupy_fp32_inference and cupy is not None and not self.training and x.dtype == torch.float32:
            # cupy is installed && eval mode && fp32
            device_id = x.get_device()
            if device_id < 0:
                # CPU tensor: fall back to the plain PyTorch path
                return super().forward(x)

            # use cupy to accelerate
            if isinstance(self.v, float):
                # v is still the scalar initial value: expand it to the input shape
                v = torch.zeros_like(x)
                if self.v != 0.:
                    torch.fill_(v, self.v)
                self.v = v

            if self.v_reset is None:
                hard_reset = False
            else:
                hard_reset = True

            # build the CUDA source; the reset style is baked into the kernel name
            code = rf'''
                extern "C" __global__
                void IFNode_{'hard' if hard_reset else 'soft'}_reset_inference_forward(
                const float * x, const float & v_threshold, {'const float & v_reset,' if hard_reset else ''}
                float * spike, float * v,
                const int & numel)
            '''

            code += r'''
                {
                const int index = blockIdx.x * blockDim.x + threadIdx.x;
                if (index < numel)
                {
                    v[index] += x[index];
                    spike[index] = (float) (v[index] >= v_threshold);
            '''

            code += rf'''
                    {'v[index] = (1.0f - spike[index]) * v[index] + spike[index] * v_reset;' if hard_reset else 'v[index] -= spike[index] * v_threshold;'}
            '''

            code += r'''
                }
                }
            '''
            if hasattr(self, 'cp_kernel'):
                if self.cp_kernel.code != code:
                    # kernel configuration changed (e.g. reset mode): rebuild
                    del self.cp_kernel
                    self.cp_kernel = cupy.RawKernel(code,
                                                    f"IFNode_{'hard' if hard_reset else 'soft'}_reset_inference_forward",
                                                    options=configure.cuda_compiler_options,
                                                    backend=configure.cuda_compiler_backend)
            else:
                self.cp_kernel = cupy.RawKernel(code,
                                                f"IFNode_{'hard' if hard_reset else 'soft'}_reset_inference_forward",
                                                options=configure.cuda_compiler_options,
                                                backend=configure.cuda_compiler_backend)

            with cu_kernel_opt.DeviceEnvironment(device_id):
                numel = x.numel()
                threads = configure.cuda_threads
                blocks = cu_kernel_opt.cal_blocks(numel)
                cp_numel = cupy.asarray(numel)
                cp_v_threshold = cupy.asarray(self.v_threshold, dtype=np.float32)
                if hard_reset:
                    cp_v_reset = cupy.asarray(self.v_reset, dtype=np.float32)

                spike = torch.zeros_like(x)
                if hard_reset:
                    x, cp_v_threshold, cp_v_reset, spike, self.v, cp_numel = cu_kernel_opt.get_contiguous(x,
                                                                                                         cp_v_threshold,
                                                                                                         cp_v_reset,
                                                                                                         spike, self.v,
                                                                                                         cp_numel)
                    kernel_args = [x, cp_v_threshold, cp_v_reset, spike, self.v, cp_numel]
                else:
                    x, cp_v_threshold, spike, self.v, cp_numel = cu_kernel_opt.get_contiguous(x, cp_v_threshold, spike,
                                                                                              self.v, cp_numel)
                    kernel_args = [x, cp_v_threshold, spike, self.v, cp_numel]
                self.cp_kernel(
                    (blocks,), (threads,),
                    cu_kernel_opt.wrap_args_to_raw_kernel(
                        device_id,
                        *kernel_args
                    )
                )
                return spike
        else:
            return super().forward(x)
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
class MultiStepIFNode(IFNode):
    def __init__(self, v_threshold: float = 1., v_reset: float = 0.,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False, backend='torch',
                 lava_s_cale=1 << 6):
        """
        The multi-step version of :class:`IFNode`. The input is
        ``x_seq.shape = [T, *]``; besides ``.v`` and ``.spike`` (state at
        ``t = T - 1``), ``.v_seq`` holds the membrane potential at all ``T``
        time-steps.

        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float

        :param v_reset: reset voltage; hard reset if not ``None``, soft reset otherwise
        :type v_reset: float

        :param surrogate_function: surrogate gradient function for the spike
        :type surrogate_function: Callable

        :param detach_reset: whether to detach the reset computation from the
            autograd graph
        :type detach_reset: bool

        :param backend: which backend to use: ``'torch'``, ``'cupy'`` (faster,
            GPU only) or ``'lava'``
        :type backend: str

        :param lava_s_cale: scale factor used when exporting this neuron to Lava-DL
        """
        super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)

        self.register_memory('v_seq', None)

        check_backend(backend)

        self.backend = backend

        self.lava_s_cale = lava_s_cale

        if backend == 'lava':
            self.lava_neuron = self.to_lava()
        else:
            self.lava_neuron = None

    def forward(self, x_seq: torch.Tensor):
        assert x_seq.dim() > 1
        # x_seq.shape = [T, *]

        if self.backend == 'torch':
            # plain Python loop over the time dimension, one single-step call per t
            spike_seq = []
            self.v_seq = []
            for t in range(x_seq.shape[0]):
                spike_seq.append(super().forward(x_seq[t]).unsqueeze(0))
                self.v_seq.append(self.v.unsqueeze(0))
            spike_seq = torch.cat(spike_seq, 0)
            self.v_seq = torch.cat(self.v_seq, 0)
            return spike_seq

        elif self.backend == 'cupy':
            if isinstance(self.v, float):
                # v is still the scalar initial value: expand it to the input shape
                v_init = self.v
                self.v = torch.zeros_like(x_seq[0].data)
                if v_init != 0.:
                    torch.fill_(self.v, v_init)

            # fused CUDA kernel over all T steps (custom autograd Function)
            spike_seq, self.v_seq = neuron_kernel.MultiStepIFNodePTT.apply(
                x_seq.flatten(1), self.v.flatten(0), self.v_threshold, self.v_reset, self.detach_reset,
                self.surrogate_function.cuda_code)

            spike_seq = spike_seq.reshape(x_seq.shape)
            self.v_seq = self.v_seq.reshape(x_seq.shape)

            self.v = self.v_seq[-1].clone()

            return spike_seq

        elif self.backend == 'lava':
            if self.lava_neuron is None:
                self.lava_neuron = self.to_lava()

            spike, self.v = lava_exchange.lava_neuron_forward(self.lava_neuron, x_seq, self.v)

            return spike

        else:
            raise NotImplementedError(self.backend)

    def extra_repr(self):
        return super().extra_repr() + f', backend={self.backend}'

    def to_lava(self):
        # export this neuron as a Lava-DL (slayer) neuron
        return lava_exchange.to_lava_neuron(self)

    def reset(self):
        super().reset()
        # the lava neuron keeps its own state tensors; clear them too
        if self.lava_neuron is not None:
            self.lava_neuron.current_state.zero_()
            self.lava_neuron.voltage_state.zero_()
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
class LIFNode(BaseNode):
    def __init__(self, tau: float = 2., decay_input: bool = True, v_threshold: float = 1.,
                 v_reset: float = 0., surrogate_function: Callable = surrogate.Sigmoid(),
                 detach_reset: bool = False, cupy_fp32_inference=False):
        """
        The Leaky Integrate-and-Fire neuron, a leaky integrator.
        Subthreshold dynamics:

        If ``decay_input == True``:

        .. math::
            V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset}))

        If ``decay_input == False``:

        .. math::
            V[t] = V[t-1] - \\frac{1}{\\tau}(V[t-1] - V_{reset}) + X[t]

        :param tau: membrane time constant (must be > 1)
        :type tau: float

        :param decay_input: whether the input is also divided by ``tau``
        :type decay_input: bool

        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float

        :param v_reset: reset voltage; hard reset if not ``None``, soft reset otherwise
        :type v_reset: float

        :param surrogate_function: surrogate gradient function for the spike
        :type surrogate_function: Callable

        :param detach_reset: whether to detach the reset computation from the
            autograd graph
        :type detach_reset: bool

        :param cupy_fp32_inference: if ``True``, inference in ``eval`` mode with
            float32 inputs on GPU (and ``cupy`` installed) is accelerated with a
            hand-written CuPy kernel
        :type cupy_fp32_inference: bool
        """
        assert isinstance(tau, float) and tau > 1.

        super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)
        self.tau = tau
        self.decay_input = decay_input

        if cupy_fp32_inference:
            check_backend('cupy')
        self.cupy_fp32_inference = cupy_fp32_inference

    def extra_repr(self):
        return super().extra_repr() + f', tau={self.tau}'

    def neuronal_charge(self, x: torch.Tensor):
        # v_reset == 0. (or soft reset) lets the reset term drop out of the update
        if self.decay_input:
            if self.v_reset is None or self.v_reset == 0.:
                self.v = self.v + (x - self.v) / self.tau
            else:
                self.v = self.v + (x - (self.v - self.v_reset)) / self.tau

        else:
            if self.v_reset is None or self.v_reset == 0.:
                self.v = self.v * (1. - 1. / self.tau) + x
            else:
                self.v = self.v - (self.v - self.v_reset) / self.tau + x

    def forward(self, x: torch.Tensor):
        if self.cupy_fp32_inference and cupy is not None and not self.training and x.dtype == torch.float32:
            # cupy is installed && eval mode && fp32
            device_id = x.get_device()
            if device_id < 0:
                # CPU tensor: fall back to the plain PyTorch path
                return super().forward(x)

            # use cupy to accelerate
            if isinstance(self.v, float):
                # v is still the scalar initial value: expand it to the input shape
                v = torch.zeros_like(x)
                if self.v != 0.:
                    torch.fill_(v, self.v)
                self.v = v

            if self.v_reset is None:
                hard_reset = False
            else:
                hard_reset = True

            # build the CUDA source; reset style and decay mode are baked into the name
            code = rf'''
                extern "C" __global__
                void LIFNode_{'hard' if hard_reset else 'soft'}_reset_decayInput_{self.decay_input}_inference_forward(
                const float * x, const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} const float & tau,
                float * spike, float * v,
                const int & numel)
            '''

            code += r'''
                {
                const int index = blockIdx.x * blockDim.x + threadIdx.x;
                if (index < numel)
                {

            '''

            if self.decay_input:
                if hard_reset:
                    code += r'''
                        v[index] += (x[index] - (v[index] - v_reset)) / tau;
                    '''
                else:
                    code += r'''
                        v[index] += (x[index] - v[index]) / tau;
                    '''
            else:
                if hard_reset:
                    code += r'''
                        v[index] = x[index] + v[index] - (v[index] - v_reset) / tau;
                    '''
                else:
                    code += r'''
                        v[index] = x[index] + v[index] * (1.0f - 1.0f / tau);
                    '''

            code += rf'''
                spike[index] = (float) (v[index] >= v_threshold);
                {'v[index] = (1.0f - spike[index]) * v[index] + spike[index] * v_reset;' if hard_reset else 'v[index] -= spike[index] * v_threshold;'}
            '''

            code += r'''
                }
                }
            '''
            if hasattr(self, 'cp_kernel'):
                if self.cp_kernel.code != code:
                    # kernel configuration changed (reset/decay mode): rebuild
                    del self.cp_kernel
                    self.cp_kernel = cupy.RawKernel(code,
                                                    f"LIFNode_{'hard' if hard_reset else 'soft'}_reset_decayInput_{self.decay_input}_inference_forward",
                                                    options=configure.cuda_compiler_options,
                                                    backend=configure.cuda_compiler_backend)
            else:
                self.cp_kernel = cupy.RawKernel(code,
                                                f"LIFNode_{'hard' if hard_reset else 'soft'}_reset_decayInput_{self.decay_input}_inference_forward",
                                                options=configure.cuda_compiler_options,
                                                backend=configure.cuda_compiler_backend)

            with cu_kernel_opt.DeviceEnvironment(device_id):
                numel = x.numel()
                threads = configure.cuda_threads
                blocks = cu_kernel_opt.cal_blocks(numel)
                cp_numel = cupy.asarray(numel)
                cp_v_threshold = cupy.asarray(self.v_threshold, dtype=np.float32)
                if hard_reset:
                    cp_v_reset = cupy.asarray(self.v_reset, dtype=np.float32)
                cp_tau = cupy.asarray(self.tau, dtype=np.float32)
                spike = torch.zeros_like(x)
                if hard_reset:
                    x, cp_v_threshold, cp_v_reset, cp_tau, spike, self.v, cp_numel = cu_kernel_opt.get_contiguous(x,
                                                                                                                  cp_v_threshold,
                                                                                                                  cp_v_reset,
                                                                                                                  cp_tau,
                                                                                                                  spike,
                                                                                                                  self.v,
                                                                                                                  cp_numel)
                    kernel_args = [x, cp_v_threshold, cp_v_reset, cp_tau, spike, self.v, cp_numel]
                else:
                    x, cp_v_threshold, cp_tau, spike, self.v, cp_numel = cu_kernel_opt.get_contiguous(x, cp_v_threshold,
                                                                                                      cp_tau, spike,
                                                                                                      self.v, cp_numel)
                    kernel_args = [x, cp_v_threshold, cp_tau, spike, self.v, cp_numel]

                self.cp_kernel(
                    (blocks,), (threads,),
                    cu_kernel_opt.wrap_args_to_raw_kernel(
                        device_id,
                        *kernel_args
                    )
                )
            return spike
        else:
            return super().forward(x)
|
| 762 |
+
|
| 763 |
+
|
| 764 |
+
class MultiStepLIFNode(LIFNode):
|
| 765 |
+
def __init__(self, tau: float = 2., decay_input: bool = True, v_threshold: float = 1.,
|
| 766 |
+
v_reset: float = 0., surrogate_function: Callable = surrogate.Sigmoid(),
|
| 767 |
+
detach_reset: bool = False, backend='torch', lava_s_cale=1 << 6):
|
| 768 |
+
"""
|
| 769 |
+
* :ref:`API in English <MultiStepLIFNode.__init__-en>`
|
| 770 |
+
|
| 771 |
+
.. _MultiStepLIFNode.__init__-cn:
|
| 772 |
+
|
| 773 |
+
:param tau: 膜电位时间常数
|
| 774 |
+
:type tau: float
|
| 775 |
+
|
| 776 |
+
:param decay_input: 输入是否会衰减
|
| 777 |
+
:type decay_input: bool
|
| 778 |
+
|
| 779 |
+
:param v_threshold: 神经元的阈值电压
|
| 780 |
+
:type v_threshold: float
|
| 781 |
+
|
| 782 |
+
:param v_reset: 神经元的重置电压。如果不为 ``None``,当神经元释放脉冲后,电压会被重置为 ``v_reset``;
|
| 783 |
+
如果设置为 ``None``,则电压会被减去 ``v_threshold``
|
| 784 |
+
:type v_reset: float
|
| 785 |
+
|
| 786 |
+
:param surrogate_function: 反向传播时用来计算脉冲函数梯度的替代函数
|
| 787 |
+
:type surrogate_function: Callable
|
| 788 |
+
|
| 789 |
+
:param detach_reset: 是否将reset过程的计算图分离
|
| 790 |
+
:type detach_reset: bool
|
| 791 |
+
|
| 792 |
+
:param backend: 使用哪种计算后端,可以为 ``'torch'`` 或 ``'cupy'``。``'cupy'`` 速度更快,但仅支持GPU。
|
| 793 |
+
:type backend: str
|
| 794 |
+
|
| 795 |
+
多步版本的 :class:`spikingjelly.clock_driven.neuron.LIFNode`。
|
| 796 |
+
|
| 797 |
+
.. tip::
|
| 798 |
+
|
| 799 |
+
对于多步神经元,输入 ``x_seq.shape = [T, *]``,不仅可以使用 ``.v`` 和 ``.spike`` 获取 ``t = T - 1`` 时刻的电压和脉冲,还能够
|
| 800 |
+
使用 ``.v_seq`` 和 ``.spike_seq`` 获取完整的 ``T`` 个时刻的电压和脉冲。
|
| 801 |
+
|
| 802 |
+
.. tip::
|
| 803 |
+
|
| 804 |
+
阅读 :doc:`传播模式 <./clock_driven/10_propagation_pattern>` 以获取更多关于单步和多步传播的信息。
|
| 805 |
+
|
| 806 |
+
* :ref:`中文API <MultiStepLIFNode.__init__-cn>`
|
| 807 |
+
|
| 808 |
+
.. _MultiStepLIFNode.__init__-en:
|
| 809 |
+
|
| 810 |
+
:param tau: membrane time constant
|
| 811 |
+
:type tau: float
|
| 812 |
+
|
| 813 |
+
:param decay_input: whether the input will decay
|
| 814 |
+
:type decay_input: bool
|
| 815 |
+
|
| 816 |
+
:param v_threshold: threshold voltage of neurons
|
| 817 |
+
:type v_threshold: float
|
| 818 |
+
|
| 819 |
+
:param v_reset: reset voltage of neurons. If not ``None``, voltage of neurons that just fired spikes will be set to
|
| 820 |
+
``v_reset``. If ``None``, voltage of neurons that just fired spikes will subtract ``v_threshold``
|
| 821 |
+
:type v_reset: float
|
| 822 |
+
|
| 823 |
+
:param surrogate_function: surrogate function for replacing gradient of spiking functions during back-propagation
|
| 824 |
+
:type surrogate_function: Callable
|
| 825 |
+
|
| 826 |
+
:param detach_reset: whether detach the computation graph of reset
|
| 827 |
+
:type detach_reset: bool
|
| 828 |
+
|
| 829 |
+
:param backend: use which backend, ``'torch'`` or ``'cupy'``. ``'cupy'`` is faster but only supports GPU
|
| 830 |
+
:type backend: str
|
| 831 |
+
|
| 832 |
+
The multi-step version of :class:`spikingjelly.clock_driven.neuron.LIFNode`.
|
| 833 |
+
|
| 834 |
+
.. admonition:: Tip
|
| 835 |
+
:class: tip
|
| 836 |
+
|
| 837 |
+
The input for multi-step neurons are ``x_seq.shape = [T, *]``. We can get membrane potential and spike at
|
| 838 |
+
time-step ``t = T - 1`` by ``.v`` and ``.spike``. We can also get membrane potential and spike at all ``T``
|
| 839 |
+
time-steps by ``.v_seq`` and ``.spike_seq``.
|
| 840 |
+
|
| 841 |
+
.. admonition:: Tip
|
| 842 |
+
:class: tip
|
| 843 |
+
|
| 844 |
+
Read :doc:`Propagation Pattern <./clock_driven_en/10_propagation_pattern>` for more details about single-step
|
| 845 |
+
and multi-step propagation.
|
| 846 |
+
|
| 847 |
+
"""
|
| 848 |
+
super().__init__(tau, decay_input, v_threshold, v_reset, surrogate_function, detach_reset)
|
| 849 |
+
self.register_memory('v_seq', None)
|
| 850 |
+
|
| 851 |
+
check_backend(backend)
|
| 852 |
+
|
| 853 |
+
self.backend = backend
|
| 854 |
+
|
| 855 |
+
self.lava_s_cale = lava_s_cale
|
| 856 |
+
|
| 857 |
+
if backend == 'lava':
|
| 858 |
+
self.lava_neuron = self.to_lava()
|
| 859 |
+
else:
|
| 860 |
+
self.lava_neuron = None
|
| 861 |
+
|
| 862 |
+
    def forward(self, x_seq: torch.Tensor):
        """Run the multi-step LIF neuron over a whole input sequence.

        :param x_seq: input sequence of shape ``[T, *]``, where ``T`` is the
            number of time-steps
        :return: the spike sequence, same shape as ``x_seq`` for the
            ``'torch'``/``'cupy'`` backends

        After the call, ``self.v_seq`` holds the membrane potential at every
        time-step (``'torch'``/``'cupy'`` backends) and ``self.v`` the
        potential at the last time-step.
        """
        assert x_seq.dim() > 1
        # x_seq.shape = [T, *]

        if self.backend == 'torch':
            # Pure-Python loop: run the single-step parent forward once per time-step.
            spike_seq = []
            self.v_seq = []
            for t in range(x_seq.shape[0]):
                spike_seq.append(super().forward(x_seq[t]).unsqueeze(0))
                self.v_seq.append(self.v.unsqueeze(0))
            spike_seq = torch.cat(spike_seq, 0)
            self.v_seq = torch.cat(self.v_seq, 0)
            return spike_seq

        elif self.backend == 'cupy':
            # The fused kernel needs a tensor state: promote a scalar initial
            # voltage to a tensor shaped like one time-step of the input.
            if isinstance(self.v, float):
                v_init = self.v
                self.v = torch.zeros_like(x_seq[0].data)
                if v_init != 0.:
                    torch.fill_(self.v, v_init)

            # Fused CUDA kernel over all T steps at once (project-local kernel).
            spike_seq, self.v_seq = neuron_kernel.MultiStepLIFNodePTT.apply(
                x_seq.flatten(1), self.v.flatten(0), self.decay_input, self.tau, self.v_threshold, self.v_reset,
                self.detach_reset, self.surrogate_function.cuda_code)

            spike_seq = spike_seq.reshape(x_seq.shape)
            self.v_seq = self.v_seq.reshape(x_seq.shape)

            # Keep the single-step state consistent: v is the last step's potential.
            self.v = self.v_seq[-1].clone()

            return spike_seq

        elif self.backend == 'lava':
            # Lazily (re)build the lava neuron if it has not been created yet.
            if self.lava_neuron is None:
                self.lava_neuron = self.to_lava()

            spike, self.v = lava_exchange.lava_neuron_forward(self.lava_neuron, x_seq, self.v)

            return spike

        else:
            raise NotImplementedError(self.backend)
|
| 904 |
+
|
| 905 |
+
def extra_repr(self):
|
| 906 |
+
return super().extra_repr() + f', backend={self.backend}'
|
| 907 |
+
|
| 908 |
+
    def to_lava(self):
        # Convert this neuron to its lava-dl equivalent via the project's
        # exchange helper (used by the 'lava' backend).
        return lava_exchange.to_lava_neuron(self)
|
| 910 |
+
|
| 911 |
+
def reset(self):
|
| 912 |
+
super().reset()
|
| 913 |
+
if self.lava_neuron is not None:
|
| 914 |
+
self.lava_neuron.current_state.zero_()
|
| 915 |
+
self.lava_neuron.voltage_state.zero_()
|
| 916 |
+
|
| 917 |
+
|
| 918 |
+
class ParametricLIFNode(BaseNode):
    def __init__(self, init_tau: float = 2.0, decay_input: bool = True, v_threshold: float = 1.,
                 v_reset: float = 0., surrogate_function: Callable = surrogate.Sigmoid(),
                 detach_reset: bool = False):
        """
        The Parametric Leaky Integrate-and-Fire (PLIF) neuron proposed in
        `Incorporating Learnable Membrane Time Constant to Enhance Learning of
        Spiking Neural Networks <https://arxiv.org/abs/2007.05785>`_, a leaky
        integrator whose membrane time constant is learned.

        Subthreshold dynamics, with :math:`\\frac{1}{\\tau} = {\\rm Sigmoid}(w)`
        and :math:`w` a learnable parameter:

        if ``decay_input == True``:

        .. math::
            V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset}))

        if ``decay_input == False``:

        .. math::
            V[t] = V[t-1] - \\frac{1}{\\tau}(V[t-1] - V_{reset}) + X[t]

        :param init_tau: initial value of the membrane time constant
        :type init_tau: float
        :param decay_input: whether the input decays
        :type decay_input: bool
        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float
        :param v_reset: reset voltage. If not ``None``, neurons that fired are
            reset to ``v_reset``; if ``None``, ``v_threshold`` is subtracted
        :type v_reset: float
        :param surrogate_function: surrogate gradient for the spiking function
        :type surrogate_function: Callable
        :param detach_reset: whether to detach the reset from the computation graph
        :type detach_reset: bool
        """
        assert isinstance(init_tau, float) and init_tau > 1.
        super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)
        self.decay_input = decay_input
        # Invert the sigmoid so that sigmoid(w) == 1 / init_tau at initialization.
        initial_w = - math.log(init_tau - 1.)
        self.w = nn.Parameter(torch.as_tensor(initial_w))

    def extra_repr(self):
        with torch.no_grad():
            # Current effective time constant recovered from the learnable weight.
            current_tau = 1. / self.w.sigmoid()
        return super().extra_repr() + f', tau={current_tau}'

    def neuronal_charge(self, x: torch.Tensor):
        decay = self.w.sigmoid()
        rest_is_zero = self.v_reset is None or self.v_reset == 0.
        if self.decay_input:
            if rest_is_zero:
                self.v = self.v + (x - self.v) * decay
            else:
                self.v = self.v + (x - (self.v - self.v_reset)) * decay
        else:
            if rest_is_zero:
                self.v = self.v * (1. - decay) + x
            else:
                self.v = self.v - (self.v - self.v_reset) * decay + x
|
| 1022 |
+
|
| 1023 |
+
|
| 1024 |
+
class MultiStepParametricLIFNode(ParametricLIFNode):
    def __init__(self, init_tau: float = 2., decay_input: bool = True, v_threshold: float = 1.,
                 v_reset: float = 0., surrogate_function: Callable = surrogate.Sigmoid(),
                 detach_reset: bool = False, backend='torch'):
        """
        The multi-step version of :class:`ParametricLIFNode`, the Parametric
        Leaky Integrate-and-Fire (PLIF) neuron proposed in `Incorporating
        Learnable Membrane Time Constant to Enhance Learning of Spiking Neural
        Networks <https://arxiv.org/abs/2007.05785>`_.

        Subthreshold dynamics:

        .. math::
            V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset}))

        where :math:`\\frac{1}{\\tau} = {\\rm Sigmoid}(w)` and :math:`w` is a
        learnable parameter.

        :param init_tau: the initial value of the membrane time constant
        :type init_tau: float
        :param decay_input: whether the input will decay
        :type decay_input: bool
        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float
        :param v_reset: reset voltage. If not ``None``, the voltage of neurons
            that fired is set to ``v_reset``; if ``None``, ``v_threshold`` is
            subtracted instead
        :type v_reset: float
        :param surrogate_function: surrogate function replacing the gradient of
            the spiking function during back-propagation
        :type surrogate_function: Callable
        :param detach_reset: whether to detach the computation graph of the reset
        :type detach_reset: bool
        :param backend: which backend to use, ``'torch'`` or ``'cupy'``
            (``'cupy'`` is faster but only supports GPU)
        :type backend: str

        The input is ``x_seq.shape = [T, *]``. After ``forward``, ``.v`` holds
        the membrane potential at ``t = T - 1`` while ``.v_seq`` holds the
        potential at all ``T`` time-steps.
        """
        super().__init__(init_tau, decay_input, v_threshold, v_reset, surrogate_function, detach_reset)
        self.register_memory('v_seq', None)

        check_backend(backend)

        self.backend = backend

    def forward(self, x_seq: torch.Tensor):
        """Run the neuron over the whole sequence; see the class docstring."""
        assert x_seq.dim() > 1
        # x_seq.shape = [T, *]

        if self.backend == 'torch':
            # Python loop over time-steps using the single-step parent forward.
            spike_seq = []
            self.v_seq = []
            for t in range(x_seq.shape[0]):
                spike_seq.append(super().forward(x_seq[t]).unsqueeze(0))
                self.v_seq.append(self.v.unsqueeze(0))
            spike_seq = torch.cat(spike_seq, 0)
            self.v_seq = torch.cat(self.v_seq, 0)
            return spike_seq

        elif self.backend == 'cupy':
            # Promote a scalar initial voltage to a tensor for the fused kernel.
            if isinstance(self.v, float):
                v_init = self.v
                self.v = torch.zeros_like(x_seq[0].data)
                if v_init != 0.:
                    torch.fill_(self.v, v_init)

            # Fused CUDA kernel: note the learnable decay self.w.sigmoid() is
            # evaluated once here and passed by value to the kernel.
            spike_seq, self.v_seq = neuron_kernel.MultiStepParametricLIFNodePTT.apply(
                x_seq.flatten(1), self.v.flatten(0), self.w.sigmoid(), self.decay_input, self.v_threshold, self.v_reset,
                self.detach_reset, self.surrogate_function.cuda_code)

            spike_seq = spike_seq.reshape(x_seq.shape)
            self.v_seq = self.v_seq.reshape(x_seq.shape)

            # Keep single-step state consistent with the kernel output.
            self.v = self.v_seq[-1].clone()

            return spike_seq
        else:
            raise NotImplementedError

    def extra_repr(self):
        return super().extra_repr() + f', backend={self.backend}'
|
| 1159 |
+
|
| 1160 |
+
|
| 1161 |
+
class QIFNode(BaseNode):
    def __init__(self, tau: float = 2., v_c: float = 0.8, a0: float = 1., v_threshold: float = 1., v_rest: float = 0.,
                 v_reset: float = -0.1,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False):
        """
        The Quadratic Integrate-and-Fire (QIF) neuron, a nonlinear
        integrate-and-fire model that approximates the Exponential
        Integrate-and-Fire model. Subthreshold dynamics:

        .. math::
            V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] + a_0 (V[t-1] - V_{rest})(V[t-1] - V_c))

        :param tau: membrane time constant
        :type tau: float
        :param v_c: critical voltage
        :type v_c: float
        :param a0: strength of the quadratic term (must be positive)
        :type a0: float
        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float
        :param v_rest: resting potential
        :type v_rest: float
        :param v_reset: reset voltage. If not ``None``, neurons that fired are
            reset to ``v_reset``; if ``None``, ``v_threshold`` is subtracted
        :type v_reset: float
        :param surrogate_function: surrogate gradient for the spiking function
        :type surrogate_function: Callable
        :param detach_reset: whether to detach the reset from the computation graph
        :type detach_reset: bool
        """
        assert isinstance(tau, float) and tau > 1.
        if v_reset is not None:
            # The threshold must lie above the reset, and the resting point
            # must not be below the reset voltage.
            assert v_threshold > v_reset
            assert v_rest >= v_reset
        assert a0 > 0

        super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)
        self.tau = tau
        self.v_c = v_c
        self.v_rest = v_rest
        self.a0 = a0

    def extra_repr(self):
        details = f', tau={self.tau}, v_c={self.v_c}, a0={self.a0}, v_rest={self.v_rest}'
        return super().extra_repr() + details

    def neuronal_charge(self, x: torch.Tensor):
        # Quadratic drive term: a0 * (V - V_rest) * (V - V_c).
        quad = self.a0 * (self.v - self.v_rest) * (self.v - self.v_c)
        self.v = self.v + (x + quad) / self.tau
|
| 1254 |
+
|
| 1255 |
+
|
| 1256 |
+
class EIFNode(BaseNode):
    def __init__(self, tau: float = 2., delta_T: float = 1., theta_rh: float = .8, v_threshold: float = 1.,
                 v_rest: float = 0., v_reset: float = -0.1,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False):
        """
        The Exponential Integrate-and-Fire (EIF) neuron, a nonlinear
        integrate-and-fire model derived as a one-dimensional reduction of the
        Hodgkin-Huxley model. It degenerates to the LIF model as
        :math:`\\Delta_T\\to 0`. Subthreshold dynamics:

        .. math::
            V[t] = V[t-1] + \\frac{1}{\\tau}\\left(X[t] - (V[t-1] - V_{rest}) + \\Delta_T\\exp\\left(\\frac{V[t-1] - \\theta_{rh}}{\\Delta_T}\\right)\\right)

        :param tau: membrane time constant
        :type tau: float
        :param delta_T: sharpness parameter (must be positive)
        :type delta_T: float
        :param theta_rh: rheobase threshold
        :type theta_rh: float
        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float
        :param v_rest: resting potential
        :type v_rest: float
        :param v_reset: reset voltage. If not ``None``, neurons that fired are
            reset to ``v_reset``; if ``None``, ``v_threshold`` is subtracted
        :type v_reset: float
        :param surrogate_function: surrogate gradient for the spiking function
        :type surrogate_function: Callable
        :param detach_reset: whether to detach the reset from the computation graph
        :type detach_reset: bool
        """
        assert isinstance(tau, float) and tau > 1.
        if v_reset is not None:
            assert v_threshold > v_reset
            assert v_rest >= v_reset
        assert delta_T > 0

        super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)
        self.tau = tau
        self.delta_T = delta_T
        self.v_rest = v_rest
        self.theta_rh = theta_rh

    def extra_repr(self):
        details = f', tau={self.tau}, delta_T={self.delta_T}, theta_rh={self.theta_rh}'
        return super().extra_repr() + details

    def neuronal_charge(self, x: torch.Tensor):
        with torch.no_grad():
            # Promote a scalar initial voltage to a tensor on the input's device
            # so the exponential below can operate on it.
            if not isinstance(self.v, torch.Tensor):
                self.v = torch.as_tensor(self.v, device=x.device)

        exp_term = self.delta_T * torch.exp((self.v - self.theta_rh) / self.delta_T)
        self.v = self.v + (x + self.v_rest - self.v + exp_term) / self.tau
|
| 1355 |
+
|
| 1356 |
+
|
| 1357 |
+
class MultiStepEIFNode(EIFNode):
    def __init__(self, tau: float = 2., delta_T: float = 1., theta_rh: float = .8, v_threshold: float = 1.,
                 v_rest: float = 0., v_reset: float = -0.1,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False, backend='torch'):
        """
        The multi-step version of :class:`EIFNode` (Exponential
        Integrate-and-Fire neuron).

        :param tau: membrane time constant
        :type tau: float
        :param delta_T: sharpness parameter
        :type delta_T: float
        :param theta_rh: rheobase threshold
        :type theta_rh: float
        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float
        :param v_rest: resting potential
        :type v_rest: float
        :param v_reset: reset voltage. If not ``None``, the voltage of neurons
            that fired is set to ``v_reset``; if ``None``, ``v_threshold`` is
            subtracted instead
        :type v_reset: float
        :param surrogate_function: surrogate function replacing the gradient of
            the spiking function during back-propagation
        :type surrogate_function: Callable
        :param detach_reset: whether to detach the computation graph of the reset
        :type detach_reset: bool
        :param backend: which backend to use, ``'torch'`` or ``'cupy'``
            (``'cupy'`` is faster but only supports GPU)
        :type backend: str

        The input is ``x_seq.shape = [T, *]``. After ``forward``, ``.v`` holds
        the membrane potential at ``t = T - 1`` while ``.v_seq`` holds the
        potential at all ``T`` time-steps.
        """
        super().__init__(tau, delta_T, theta_rh, v_threshold, v_rest, v_reset,
                         surrogate_function, detach_reset)
        self.register_memory('v_seq', None)

        check_backend(backend)

        self.backend = backend

    def forward(self, x_seq: torch.Tensor):
        """Run the neuron over the whole sequence; see the class docstring."""
        assert x_seq.dim() > 1
        # x_seq.shape = [T, *]

        if self.backend == 'torch':
            # Python loop over time-steps using the single-step parent forward.
            spike_seq = []
            self.v_seq = []
            for t in range(x_seq.shape[0]):
                spike_seq.append(super().forward(x_seq[t]).unsqueeze(0))
                self.v_seq.append(self.v.unsqueeze(0))
            spike_seq = torch.cat(spike_seq, 0)
            self.v_seq = torch.cat(self.v_seq, 0)
            return spike_seq

        elif self.backend == 'cupy':
            # Promote a scalar initial voltage to a tensor for the fused kernel.
            if isinstance(self.v, float):
                v_init = self.v
                self.v = torch.zeros_like(x_seq[0].data)
                if v_init != 0.:
                    torch.fill_(self.v, v_init)

            # Fused CUDA kernel over all T steps at once (project-local kernel).
            spike_seq, self.v_seq = neuron_kernel.MultiStepEIFNodePTT.apply(
                x_seq.flatten(1), self.v.flatten(0), self.tau, self.v_threshold, self.v_reset, self.v_rest,
                self.theta_rh, self.delta_T, self.detach_reset, self.surrogate_function.cuda_code)

            spike_seq = spike_seq.reshape(x_seq.shape)
            self.v_seq = self.v_seq.reshape(x_seq.shape)

            # Keep single-step state consistent with the kernel output.
            self.v = self.v_seq[-1].clone()

            return spike_seq
        else:
            raise NotImplementedError

    def extra_repr(self):
        return super().extra_repr() + f', backend={self.backend}'
|
| 1491 |
+
|
| 1492 |
+
|
| 1493 |
+
class GeneralNode(BaseNode):
    def __init__(self, a: float or torch.Tensor, b: float or torch.Tensor, c: float or torch.Tensor = 0.,
                 v_threshold: float = 1., v_reset: float = 0.,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False):
        """
        A generic linear neuron whose subthreshold dynamics are

        .. math::
            V[t] = a \\cdot V[t-1] + b \\cdot X[t] + c

        :param a: decay coefficient applied to the previous membrane potential
        :param b: gain applied to the input
        :param c: constant bias added at every step
        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float
        :param v_reset: reset voltage. If not ``None``, neurons that fired are
            reset to ``v_reset``; if ``None``, ``v_threshold`` is subtracted
        :type v_reset: float
        :param surrogate_function: surrogate gradient for the spiking function
        :type surrogate_function: Callable
        :param detach_reset: whether to detach the reset from the computation graph
        :type detach_reset: bool
        """
        super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)
        # Bug fix: ``register_buffer`` returns None, so the previous
        # ``self.a = self.register_buffer('a', ...)`` immediately clobbered the
        # freshly registered buffer with None (and neuronal_charge would then
        # operate on None). Registering alone is sufficient: the buffers become
        # accessible as self.a / self.b / self.c.
        self.register_buffer('a', torch.as_tensor(a))
        self.register_buffer('b', torch.as_tensor(b))
        self.register_buffer('c', torch.as_tensor(c))

    def neuronal_charge(self, x: torch.Tensor):
        # Linear state update: V <- a*V + b*x + c.
        self.v = self.a * self.v + self.b * x + self.c
|
| 1504 |
+
|
| 1505 |
+
|
| 1506 |
+
class MultiStepGeneralNode(GeneralNode):
    def __init__(self, a: float, b: float, c: float, v_threshold: float = 1., v_reset: float = 0.,
                 surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False, backend='torch'):
        """
        The multi-step version of :class:`GeneralNode`, the generic linear
        neuron ``V[t] = a*V[t-1] + b*X[t] + c``.

        :param a: decay coefficient applied to the previous membrane potential
        :param b: gain applied to the input
        :param c: constant bias added at every step
        :param v_threshold: threshold voltage of the neurons
        :type v_threshold: float
        :param v_reset: reset voltage. If not ``None``, neurons that fired are
            reset to ``v_reset``; if ``None``, ``v_threshold`` is subtracted
        :type v_reset: float
        :param surrogate_function: surrogate gradient for the spiking function
        :type surrogate_function: Callable
        :param detach_reset: whether to detach the reset from the computation graph
        :type detach_reset: bool
        :param backend: which backend to use; only ``'torch'`` is implemented
        :type backend: str
        """
        # Bug fix: the previous code called
        # ``super().__init__(v_threshold, v_reset, surrogate_function, detach_reset)``
        # against GeneralNode's ``(a, b, c, v_threshold, v_reset, ...)``
        # signature, silently dropping a/b/c and shifting v_threshold into
        # ``a``, v_reset into ``b``, etc. Forward the arguments correctly.
        super().__init__(a, b, c, v_threshold, v_reset, surrogate_function, detach_reset)

        self.register_memory('v_seq', None)

        check_backend(backend)

        self.backend = backend

    def forward(self, x_seq: torch.Tensor):
        """Run the neuron over ``x_seq`` of shape ``[T, *]`` and return the
        spike sequence; ``self.v_seq`` keeps the per-step membrane potential."""
        assert x_seq.dim() > 1
        # x_seq.shape = [T, *]

        if self.backend == 'torch':
            spike_seq = []
            self.v_seq = []
            for t in range(x_seq.shape[0]):
                spike_seq.append(super().forward(x_seq[t]).unsqueeze(0))
                self.v_seq.append(self.v.unsqueeze(0))
            spike_seq = torch.cat(spike_seq, 0)
            self.v_seq = torch.cat(self.v_seq, 0)
            return spike_seq
        else:
            # No fused kernel exists for the general linear neuron; the old
            # 'cupy' branch raised NotImplementedError after some dead setup
            # code, so both non-torch paths collapse to one explicit error.
            raise NotImplementedError(self.backend)

    def extra_repr(self):
        return super().extra_repr() + f', backend={self.backend}'
|
| 1552 |
+
|
| 1553 |
+
|
| 1554 |
+
class LIAFNode(LIFNode):
    def __init__(self, act: Callable, threshold_related: bool, *args, **kwargs):
        """
        The LIAF neuron proposed in `LIAF-Net: Leaky Integrate and Analog Fire
        Network for Lightweight and Efficient Spatiotemporal Information
        Processing <https://arxiv.org/abs/2011.06176>`_.

        :param act: the activation function
        :type act: Callable
        :param threshold_related: whether the neuron uses threshold related
            (TR mode). If true, ``y = act(h - v_th)``, otherwise ``y = act(h)``
        :type threshold_related: bool

        Other parameters in ``*args, **kwargs`` are the same as :class:`LIFNode`.

        .. admonition:: Warning
            :class: warning

            The outputs of this neuron are not binary spikes.
        """
        super().__init__(*args, **kwargs)
        self.act = act
        self.threshold_related = threshold_related

    def forward(self, x: torch.Tensor):
        self.neuronal_charge(x)
        # TR mode shifts the membrane potential by the threshold before the
        # analog activation; either way the analog value is the output.
        pre_activation = self.v - self.v_threshold if self.threshold_related else self.v
        y = self.act(pre_activation)
        # Spiking and reset still run so the internal state evolves exactly
        # like a LIF neuron, but the spike itself is discarded.
        spike = self.neuronal_fire()
        self.neuronal_reset(spike)
        return y
|
| 1586 |
+
|
| 1587 |
+
|
models/q_vit/Quant.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
from torch.nn.modules.linear import Linear
|
| 4 |
+
import math
|
| 5 |
+
from torch.nn.parameter import Parameter
|
| 6 |
+
from ._quan_base import _Conv2dQ, Qmodes, _LinearQ, _ActQ
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
__all__ = ['Conv2dQ', 'LinearQ', 'ActQ']
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class FunQ(torch.autograd.Function):
    """LSQ-style fake quantization with a custom straight-through backward.

    forward:  w_q = clamp(round(w / alpha), Qn, Qp) * alpha
    backward: the gradient w.r.t. ``alpha`` follows the LSQ formulation
    (scaled by ``g``); the gradient w.r.t. ``weight`` passes through only
    where the quantized value was not clipped.
    """

    @staticmethod
    def forward(ctx, weight, alpha, g, Qn, Qp):
        # alpha is the learned step size; it must stay strictly positive.
        assert alpha > 0, 'alpha = {}'.format(alpha)
        ctx.save_for_backward(weight, alpha)
        ctx.other = g, Qn, Qp
        q_w = (weight / alpha).round().clamp(Qn, Qp)
        w_q = q_w * alpha
        return w_q

    @staticmethod
    def backward(ctx, grad_weight):
        weight, alpha = ctx.saved_tensors
        g, Qn, Qp = ctx.other
        q_w = weight / alpha
        indicate_small = (q_w < Qn).float()  # clipped below the range
        indicate_big = (q_w > Qp).float()    # clipped above the range
        # indicate_middle = torch.ones(indicate_small.shape).to(indicate_small.device) - indicate_small - indicate_big
        indicate_middle = 1.0 - indicate_small - indicate_big  # Thanks to @haolibai
        # d(w_q)/d(alpha): Qn/Qp on the clipped regions, (round(q) - q) in between.
        grad_alpha = ((indicate_small * Qn + indicate_big * Qp + indicate_middle * (
                -q_w + q_w.round())) * grad_weight * g).sum().unsqueeze(dim=0)
        grad_weight = indicate_middle * grad_weight
        # The following operation can make sure that alpha is always greater than zero in any case and can also
        # suppress the update speed of alpha. (Personal understanding)
        # grad_alpha.clamp_(-alpha.item(), alpha.item())  # FYI
        return grad_weight, grad_alpha, None, None, None
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def grad_scale(x, scale):
    """Identity in the forward pass; multiplies the gradient by ``scale``."""
    scaled = x * scale
    # Forward value equals x; only `scaled` stays on the autograd tape.
    return (x - scaled).detach() + scaled
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def round_pass(x):
    """Round to the nearest integer with a straight-through gradient."""
    # Forward: round(x); backward: identity (the detached delta has no grad).
    return (x.round() - x).detach() + x
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class Conv2dQ(_Conv2dQ):
    """2-D convolution with LSQ-quantized weights and activations.

    Weights are fake-quantized per output channel (kernel-wise) to
    ``nbits_w`` bits; the input is quantized by an internal ``ActQ``.
    When ``nbits_w < 0`` the base class registers ``alpha = None`` and the
    layer falls back to an ordinary float convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, nbits_w=8, mode=Qmodes.kernel_wise, **kwargs):
        super(Conv2dQ, self).__init__(
            in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
            stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias,
            nbits=nbits_w, mode=mode)
        # Quantize the incoming activation with the same bit width as the weights.
        self.act = ActQ(in_features=in_channels, nbits_a=nbits_w)

    def forward(self, x):
        if self.alpha is None:
            # Quantization disabled (nbits < 0): plain float convolution.
            return F.conv2d(x, self.weight, self.bias, self.stride,
                            self.padding, self.dilation, self.groups)
        # w_reshape = self.weight.reshape([self.weight.shape[0], -1]).transpose(0, 1)
        # Signed integer range for the chosen bit width.
        Qn = -2 ** (self.nbits - 1)
        Qp = 2 ** (self.nbits - 1) - 1
        if self.training and self.init_state == 0:
            # One-time LSQ initialization of the step size from weight statistics.
            # self.alpha.data.copy_(self.weight.abs().max() / 2 ** (self.nbits - 1))
            self.alpha.data.copy_(2 * self.weight.abs().mean() / math.sqrt(Qp))
            # self.alpha.data.copy_(self.weight.abs().max() * 2)
            self.init_state.fill_(1)
        """
        Implementation according to paper.
        Feels wrong ...
        When we initialize the alpha as a big number (e.g., self.weight.abs().max() * 2),
        the clamp function can be skipped.
        Then we get w_q = w / alpha * alpha = w, and $\frac{\partial w_q}{\partial \alpha} = 0$
        As a result, I don't think the pseudo-code in the paper echoes the formula.

        Please see jupyter/STE_LSQ.ipynb fo detailed comparison.
        """
        g = 1.0 / math.sqrt(self.weight.numel() * Qp)  # LSQ gradient scale for alpha

        # Method1: 31GB GPU memory (AlexNet w4a4 bs 2048) 17min/epoch
        alpha = grad_scale(self.alpha, g)
        # print(alpha.shape)
        # print(self.weight.shape)
        # Broadcast the per-channel step over (out_ch, in_ch, kh, kw).
        alpha = alpha.unsqueeze(1).unsqueeze(2).unsqueeze(3)
        w_q = round_pass((self.weight / alpha).clamp(Qn, Qp)) * alpha

        x = self.act(x)  # quantize the input activation
        # w = w.clamp(Qn, Qp)
        # q_w = round_pass(w)
        # w_q = q_w * alpha

        # Method2: 25GB GPU memory (AlexNet w4a4 bs 2048) 32min/epoch
        # w_q = FunLSQ.apply(self.weight, self.alpha, g, Qn, Qp)
        # wq = y.transpose(0, 1).reshape(self.weight.shape).detach() + self.weight - self.weight.detach()
        return F.conv2d(x, w_q, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class LinearQ(_LinearQ):
    """Fully connected layer with LSQ-quantized weights and activations.

    Weights get a per-output-feature (kernel-wise) learned step size of
    ``nbits_w`` bits; the input passes through an internal ``ActQ``. With
    ``nbits_w < 0`` the base class disables quantization (``alpha is None``).
    """

    def __init__(self, in_features, out_features, bias=True, nbits_w=4, **kwargs):
        super(LinearQ, self).__init__(in_features=in_features,
                                      out_features=out_features, bias=bias, nbits=nbits_w, mode=Qmodes.kernel_wise)
        # Quantize the incoming activation with the same bit width as the weights.
        self.act = ActQ(in_features=in_features, nbits_a=nbits_w)

    def forward(self, x):
        if self.alpha is None:
            # Quantization disabled: plain float linear layer.
            return F.linear(x, self.weight, self.bias)
        # Signed integer range for the chosen bit width.
        Qn = -2 ** (self.nbits - 1)
        Qp = 2 ** (self.nbits - 1) - 1
        if self.training and self.init_state == 0:
            # One-time LSQ initialization of the step size from weight statistics.
            self.alpha.data.copy_(2 * self.weight.abs().mean() / math.sqrt(Qp))
            # self.alpha.data.copy_(self.weight.abs().max() / 2 ** (self.nbits - 1))
            self.init_state.fill_(1)
        g = 1.0 / math.sqrt(self.weight.numel() * Qp)  # LSQ gradient scale for alpha

        # Method1:
        alpha = grad_scale(self.alpha, g)
        alpha = alpha.unsqueeze(1)  # broadcast the per-output-feature step over weight columns
        w_q = round_pass((self.weight / alpha).clamp(Qn, Qp)) * alpha

        x = self.act(x)  # quantize the input activation
        # w = self.weight / alpha
        # w = w.clamp(Qn, Qp)
        # q_w = round_pass(w)
        # w_q = q_w * alpha

        # Method2:
        # w_q = FunLSQ.apply(self.weight, self.alpha, g, Qn, Qp)
        return F.linear(x, w_q, self.bias)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class ActQ(_ActQ):
    """LSQ+ activation fake-quantizer with a learned step size and zero point.

    On the first training batch it detects whether the input is signed
    (contains negative values), picks the matching integer range, and
    initializes ``alpha``/``zero_point`` from the input statistics.
    """

    def __init__(self, in_features, nbits_a=4, mode=Qmodes.kernel_wise, **kwargs):
        super(ActQ, self).__init__(in_features=in_features, nbits=nbits_a, mode=mode)
        # print(self.alpha.shape, self.zero_point.shape)

    def forward(self, x):
        if self.alpha is None:
            # Quantization disabled (nbits < 0): pass through unchanged.
            return x

        if self.training and self.init_state == 0:
            # The init alpha for activation is very very important as the experimental results shows.
            # Please select a init_rate for activation.
            # self.alpha.data.copy_(x.max() / 2 ** (self.nbits - 1) * self.init_rate)
            if x.min() < -1e-5:
                self.signed.data.fill_(1)  # negatives observed -> use a signed range
            if self.signed == 1:
                Qn = -2 ** (self.nbits - 1)
                Qp = 2 ** (self.nbits - 1) - 1
            else:
                Qn = 0
                Qp = 2 ** self.nbits - 1
            self.alpha.data.copy_(2 * x.abs().mean() / math.sqrt(Qp))
            # EMA-style blend of the zero point toward the observed minimum.
            self.zero_point.data.copy_(self.zero_point.data * 0.9 + 0.1 * (torch.min(x.detach()) - self.alpha.data * Qn))
            self.init_state.fill_(1)

        # Recompute the integer range (signed may have been set during init).
        if self.signed == 1:
            Qn = -2 ** (self.nbits - 1)
            Qp = 2 ** (self.nbits - 1) - 1
        else:
            Qn = 0
            Qp = 2 ** self.nbits - 1

        g = 1.0 / math.sqrt(x.numel() * Qp)  # LSQ gradient scale

        # Method1:
        # Straight-through rounding of the zero point, then gradient scaling.
        zero_point = (self.zero_point.round() - self.zero_point).detach() + self.zero_point
        alpha = grad_scale(self.alpha, g)
        zero_point = grad_scale(zero_point, g)
        # x = round_pass((x / alpha).clamp(Qn, Qp)) * alpha
        # Reshape the per-feature parameters so they broadcast over the input.
        # NOTE(review): only 2-D (N, C) and 4-D (N, C, H, W) inputs are reshaped
        # explicitly; other ranks rely on trailing-dim broadcasting — confirm
        # this is intended for 3-D transformer inputs.
        if len(x.shape) == 2:
            alpha = alpha.unsqueeze(0)
            zero_point = zero_point.unsqueeze(0)
        elif len(x.shape) == 4:
            alpha = alpha.unsqueeze(0).unsqueeze(2).unsqueeze(3)
            zero_point = zero_point.unsqueeze(0).unsqueeze(2).unsqueeze(3)

        # Quantize to integer codes, then dequantize back to float.
        x = round_pass((x / alpha + zero_point).clamp(Qn, Qp))
        x = (x - zero_point) * alpha

        return x
|
models/q_vit/__init__.py
ADDED
|
File without changes
|
models/q_vit/__pycache__/Quant.cpython-311.pyc
ADDED
|
Binary file (11.1 kB). View file
|
|
|
models/q_vit/__pycache__/Quant.cpython-312.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
models/q_vit/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (207 Bytes). View file
|
|
|
models/q_vit/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (218 Bytes). View file
|
|
|
models/q_vit/__pycache__/_quan_base.cpython-311.pyc
ADDED
|
Binary file (12.8 kB). View file
|
|
|
models/q_vit/__pycache__/_quan_base.cpython-312.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
models/q_vit/__pycache__/quant_vision_transformer.cpython-311.pyc
ADDED
|
Binary file (33.1 kB). View file
|
|
|
models/q_vit/__pycache__/quant_vision_transformer.cpython-312.pyc
ADDED
|
Binary file (30.1 kB). View file
|
|
|
models/q_vit/_quan_base.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Quantized modules: the base class
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
from torch.nn.parameter import Parameter
|
| 7 |
+
|
| 8 |
+
import math
|
| 9 |
+
from enum import Enum
|
| 10 |
+
|
| 11 |
+
__all__ = ['Qmodes', '_Conv2dQ', '_LinearQ', '_ActQ',
|
| 12 |
+
'truncation', 'get_sparsity_mask', 'FunStopGradient', 'round_pass', 'grad_scale']
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Qmodes(Enum):
    """Quantization granularity: one step size per layer, or per kernel/channel."""
    layer_wise = 1
    kernel_wise = 2
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def grad_scale(x, scale):
    """Forward identity whose backward gradient is scaled by ``scale``."""
    grad_path = x * scale
    # The detached difference carries the forward value; gradients flow
    # only through grad_path, i.e. they are multiplied by `scale`.
    return (x - grad_path).detach() + grad_path
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def get_sparsity_mask(param, sparsity):
    """Return a {0, 1} mask keeping the largest-magnitude entries of ``param``.

    Args:
        param: tensor to prune.
        sparsity: fraction of elements to prune; the smallest
            ``int(sparsity * param.numel())`` magnitudes are dropped.
            Elements tied with the pruning threshold are also dropped
            (strict ``>`` comparison, as in the original).

    Returns:
        A tensor of the same dtype as ``param`` with 1.0 for kept entries.

    Bug fix: when ``int(sparsity * param.numel()) == 0`` the original code
    called ``topk`` with k=0 and indexed ``bottomk.data[-1]`` on an empty
    tensor, which raises; now nothing is pruned and an all-ones mask is
    returned.
    """
    num_pruned = int(sparsity * param.numel())
    if num_pruned == 0:
        return torch.ones_like(param).type(param.type())
    bottomk, _ = torch.topk(param.abs().view(-1), num_pruned, largest=False, sorted=True)
    threshold = bottomk.data[-1]  # largest magnitude among the pruned group
    return torch.gt(torch.abs(param), threshold).type(param.type())
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def round_pass(x):
    """Nearest-integer rounding with an identity (straight-through) gradient."""
    rounded = x.round()
    return (rounded - x).detach() + x
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class FunStopGradient(torch.autograd.Function):
    """Identity in the forward pass; masks the gradient elementwise on backward.

    ``stopGradientMask`` should be 0 where gradients must be blocked and
    1 where they may flow.
    """

    @staticmethod
    def forward(ctx, weight, stopGradientMask):
        ctx.save_for_backward(stopGradientMask)
        return weight

    @staticmethod
    def backward(ctx, grad_outputs):
        stopGradientMask, = ctx.saved_tensors
        grad_inputs = grad_outputs * stopGradientMask
        return grad_inputs, None  # no gradient for the mask itself
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def log_shift(value_fp):
    """Round ``value_fp`` up to the nearest power of two (elementwise)."""
    exponent = torch.log2(value_fp).ceil()
    return 2 ** exponent
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def clamp(input, min, max, inplace=False):
    """Clamp ``input`` to [min, max]; mutates the tensor when ``inplace``."""
    if not inplace:
        return torch.clamp(input, min, max)
    input.clamp_(min, max)
    return input
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def get_quantized_range(num_bits, signed=True):
    """Return the (min, max) integer codes representable with ``num_bits``."""
    if not signed:
        return 0, 2 ** num_bits - 1
    half = 2 ** (num_bits - 1)
    return -half, half - 1
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def linear_quantize(input, scale_factor, inplace=False):
    """Scale by ``scale_factor`` and round to the nearest integer code."""
    if not inplace:
        return torch.round(scale_factor * input)
    input.mul_(scale_factor).round_()
    return input
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def linear_quantize_clamp(input, scale_factor, clamp_min, clamp_max, inplace=False):
    """Quantize to integer codes, then clamp them to [clamp_min, clamp_max]."""
    codes = linear_quantize(input, scale_factor, inplace)
    return clamp(codes, clamp_min, clamp_max, inplace)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def linear_dequantize(input, scale_factor, inplace=False):
    """Invert ``linear_quantize`` by dividing out ``scale_factor``."""
    if not inplace:
        return input / scale_factor
    input.div_(scale_factor)
    return input
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def truncation(fp_data, nbits=8):
    """Fake-quantize ``fp_data`` to an ``nbits``-bit fixed-point format.

    The integer length ``il`` is chosen so the largest magnitude fits; the
    remaining ``qcode = nbits - il`` bits become fractional. Returns the
    dequantized tensor together with ``qcode``.
    """
    # Bits needed for the integer part (the +1 accounts for the sign).
    il = torch.log2(torch.max(fp_data.max(), fp_data.min().abs())) + 1
    il = math.ceil(il - 1e-5)  # epsilon guards exact powers of two against over-rounding
    qcode = nbits - il
    scale_factor = 2 ** qcode
    clamp_min, clamp_max = get_quantized_range(nbits, signed=True)
    q_data = linear_quantize_clamp(fp_data, scale_factor, clamp_min, clamp_max)
    q_data = linear_dequantize(q_data, scale_factor)
    return q_data, qcode
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def get_default_kwargs_q(kwargs_q, layer_type):
    """Fill ``kwargs_q`` with per-layer-type quantization defaults.

    Args:
        kwargs_q: keyword arguments passed to the quantized layer; missing
            keys are filled in place with defaults (``nbits``, and ``mode``
            for conv layers).
        layer_type: the layer instance being configured; must be a
            ``_Conv2dQ``, ``_LinearQ`` or ``_ActQ``.

    Returns:
        The same ``kwargs_q`` dict with defaults applied.

    Raises:
        NotImplementedError: if ``layer_type`` is not a known quantized layer.
    """
    default = {
        'nbits': 4
    }
    if isinstance(layer_type, _Conv2dQ):
        default.update({
            'mode': Qmodes.layer_wise})
    elif isinstance(layer_type, _LinearQ):
        pass
    elif isinstance(layer_type, _ActQ):
        pass
        # default.update({
        #     'signed': 'Auto'})
    else:
        # Bug fix: the original `assert NotImplementedError` always passed
        # (it asserts the truthy exception class) and then returned None,
        # which later crashed the caller with a confusing TypeError when it
        # subscripted the result. Raise explicitly instead.
        raise NotImplementedError('unsupported layer type: {}'.format(type(layer_type)))
    for k, v in default.items():
        if k not in kwargs_q:
            kwargs_q[k] = v
    return kwargs_q
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class _Conv2dQ(nn.Conv2d):
    """Base class for quantized Conv2d layers: holds the learned step size.

    Registers ``alpha`` (one scalar in layer-wise mode, one value per
    output channel in kernel-wise mode) plus an ``init_state`` buffer used
    by subclasses for one-time LSQ initialization. ``nbits < 0`` disables
    quantization by registering ``alpha = None``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, **kwargs_q):
        super(_Conv2dQ, self).__init__(in_channels, out_channels, kernel_size, stride=stride,
                                       padding=padding, dilation=dilation, groups=groups, bias=bias)
        self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self)
        self.nbits = kwargs_q['nbits']
        if self.nbits < 0:
            # Quantization disabled; subclasses check `self.alpha is None`.
            self.register_parameter('alpha', None)
            return
        self.q_mode = kwargs_q['mode']
        if self.q_mode == Qmodes.kernel_wise:
            self.alpha = Parameter(torch.Tensor(out_channels))
        else:  # layer-wise quantization
            self.alpha = Parameter(torch.Tensor(1))
        # Stays 0 until the first training forward initializes alpha.
        self.register_buffer('init_state', torch.zeros(1))

    def add_param(self, param_k, param_v):
        self.kwargs_q[param_k] = param_v

    def set_bit(self, nbits):
        # NOTE(review): only updates kwargs_q; self.nbits is left unchanged — confirm intent.
        self.kwargs_q['nbits'] = nbits

    def extra_repr(self):
        s_prefix = super(_Conv2dQ, self).extra_repr()
        if self.alpha is None:
            return '{}, fake'.format(s_prefix)
        return '{}, {}'.format(s_prefix, self.kwargs_q)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class _LinearQ(nn.Linear):
    """Base class for quantized Linear layers: holds the learned step size.

    ``alpha`` is one scalar in layer-wise mode or one value per output
    feature in kernel-wise mode; ``init_state`` flags whether LSQ
    initialization has run. ``nbits < 0`` disables quantization.
    """

    def __init__(self, in_features, out_features, bias=True, **kwargs_q):
        super(_LinearQ, self).__init__(in_features=in_features, out_features=out_features, bias=bias)
        self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self)
        self.nbits = kwargs_q['nbits']
        if self.nbits < 0:
            # Quantization disabled; subclasses check `self.alpha is None`.
            self.register_parameter('alpha', None)
            return
        self.q_mode = kwargs_q['mode']
        self.alpha = Parameter(torch.Tensor(1))
        if self.q_mode == Qmodes.kernel_wise:
            # Kernel-wise mode replaces the scalar with a per-output-feature step.
            self.alpha = Parameter(torch.Tensor(out_features))
        self.register_buffer('init_state', torch.zeros(1))

    def add_param(self, param_k, param_v):
        self.kwargs_q[param_k] = param_v

    def extra_repr(self):
        s_prefix = super(_LinearQ, self).extra_repr()
        if self.alpha is None:
            return '{}, fake'.format(s_prefix)
        return '{}, {}'.format(s_prefix, self.kwargs_q)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
class _ActQ(nn.Module):
    """Base class for activation quantizers: step size, zero point, sign flag.

    ``alpha``/``zero_point`` are scalars in layer-wise mode or per-feature
    vectors in kernel-wise mode. ``signed`` is a buffer the subclass flips
    to 1 when negative inputs are observed; ``init_state`` flags the
    one-time initialization. ``nbits < 0`` disables quantization.
    """

    def __init__(self, in_features, **kwargs_q):
        super(_ActQ, self).__init__()
        self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self)
        self.nbits = kwargs_q['nbits']
        if self.nbits < 0:
            # Quantization disabled; subclasses check `self.alpha is None`.
            self.register_parameter('alpha', None)
            self.register_parameter('zero_point', None)
            return
        # self.signed = kwargs_q['signed']
        self.q_mode = kwargs_q['mode']
        self.alpha = Parameter(torch.Tensor(1))
        self.zero_point = Parameter(torch.Tensor([0]))
        if self.q_mode == Qmodes.kernel_wise:
            # Kernel-wise mode: one step/zero-point per input feature.
            self.alpha = Parameter(torch.Tensor(in_features))
            self.zero_point = Parameter(torch.Tensor(in_features))
            torch.nn.init.zeros_(self.zero_point)
        # self.zero_point = Parameter(torch.Tensor([0]))
        self.register_buffer('init_state', torch.zeros(1))
        self.register_buffer('signed', torch.zeros(1))

    def add_param(self, param_k, param_v):
        self.kwargs_q[param_k] = param_v

    def set_bit(self, nbits):
        # NOTE(review): only updates kwargs_q; self.nbits is left unchanged — confirm intent.
        self.kwargs_q['nbits'] = nbits

    def extra_repr(self):
        # s_prefix = super(_ActQ, self).extra_repr()
        if self.alpha is None:
            return 'fake'
        return '{}'.format(self.kwargs_q)
|
models/q_vit/quant_vision_transformer.py
ADDED
|
@@ -0,0 +1,527 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import logging
|
| 3 |
+
from functools import partial
|
| 4 |
+
from collections import OrderedDict
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
|
| 10 |
+
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
|
| 11 |
+
from timm.models.helpers import load_pretrained
|
| 12 |
+
from timm.models.layers import Mlp
|
| 13 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
| 14 |
+
from timm.models.resnet import resnet26d, resnet50d
|
| 15 |
+
from timm.models.registry import register_model
|
| 16 |
+
|
| 17 |
+
import numpy as np
|
| 18 |
+
from .Quant import *
|
| 19 |
+
from ._quan_base import *
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
_logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _cfg(url='', **kwargs):
    """Build a default timm-style pretrained-config dict; kwargs override defaults."""
    cfg = {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
        'first_conv': 'patch_embed.proj', 'classifier': 'head',
    }
    cfg.update(kwargs)
    return cfg
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Pretrained-checkpoint configurations keyed by model name; each entry is a
# timm-style config dict produced by _cfg (URL, input size, normalization, ...).
default_cfgs = {
    # patch models (my experiments)
    'vit_small_patch16_224': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/vit_small_p16_224-15ec54c9.pth',
    ),

    # patch models (weights ported from official Google JAX impl)
    'vit_base_patch16_224': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
    ),
    'vit_base_patch32_224': _cfg(
        url='',  # no official model weights for this combo, only for in21k
        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    'vit_base_patch16_384': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_384-83fb41ba.pth',
        input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0),
    'vit_base_patch32_384': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p32_384-830016f5.pth',
        input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0),
    'vit_large_patch16_224': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_224-4ee7a4dc.pth',
        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    'vit_large_patch32_224': _cfg(
        url='',  # no official model weights for this combo, only for in21k
        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    'vit_large_patch16_384': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth',
        input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0),
    'vit_large_patch32_384': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth',
        input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0),

    # patch models, imagenet21k (weights ported from official Google JAX impl)
    'vit_base_patch16_224_in21k': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth',
        num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    'vit_base_patch32_224_in21k': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch32_224_in21k-8db57226.pth',
        num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    'vit_large_patch16_224_in21k': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch16_224_in21k-606da67d.pth',
        num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    'vit_large_patch32_224_in21k': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
        num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    'vit_huge_patch14_224_in21k': _cfg(
        url='',  # FIXME I have weights for this but > 2GB limit for github release binaries
        num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),

    # hybrid models (weights ported from official Google JAX impl)
    'vit_base_resnet50_224_in21k': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_224_in21k-6f7c7740.pth',
        num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=0.9, first_conv='patch_embed.backbone.stem.conv'),
    'vit_base_resnet50_384': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_384-9fd3c705.pth',
        input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0, first_conv='patch_embed.backbone.stem.conv'),

    # hybrid models (my experiments)
    'vit_small_resnet26d_224': _cfg(),
    'vit_small_resnet50d_s3_224': _cfg(),
    'vit_base_resnet26d_224': _cfg(),
    'vit_base_resnet50d_224': _cfg(),

    # deit models (FB weights)
    'vit_deit_tiny_patch16_224': _cfg(
        url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth'),
    'vit_deit_small_patch16_224': _cfg(
        url='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth'),
    'vit_deit_base_patch16_224': _cfg(
        url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',),
    'vit_deit_base_patch16_384': _cfg(
        url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth',
        input_size=(3, 384, 384), crop_pct=1.0),
    'vit_deit_tiny_distilled_patch16_224': _cfg(
        url='https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth'),
    'vit_deit_small_distilled_patch16_224': _cfg(
        url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth'),
    'vit_deit_base_distilled_patch16_224': _cfg(
        url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth', ),
    'vit_deit_base_distilled_patch16_384': _cfg(
        url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth',
        input_size=(3, 384, 384), crop_pct=1.0),
}
|
| 120 |
+
|
| 121 |
+
class Q_Mlp(nn.Module):
    """ MLP as used in Vision Transformer, MLP-Mixer and related networks
    (quantized variant: both linear layers are kernel-wise LSQ-quantized).
    """
    def __init__(self, nbits, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        drop_probs = to_2tuple(drop)

        self.fc1 = LinearQ(in_features, hidden_features, nbits_w=nbits, mode=Qmodes.kernel_wise)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.fc2 = LinearQ(hidden_features, out_features, nbits_w=nbits, mode=Qmodes.kernel_wise)
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        x = self.fc1(x)
        # print(torch.max(x), torch.min(x))
        x = self.act(x)

        # Clip activations to a fixed range before fc2 quantizes them.
        # NOTE(review): the [-10, 10] bound is hard-coded — confirm it suits
        # every activation function passed as act_layer.
        x = torch.clip(x, -10., 10.)
        # print(torch.clip(x, -10., 10.))
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class Q_Attention(nn.Module):
    """Multi-head self-attention with optional LSQ quantization.

    q/k are LayerNorm-ed per head (QK-norm), then q/k/v and the attention
    matrix are fake-quantized with per-head ``ActQ`` modules; the qkv and
    output projections use ``LinearQ`` when ``quantize_attn`` is set.
    """

    def __init__(self, nbits, dim, num_heads=8, quantize_attn=True, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5  # 1/sqrt(d_k) attention scaling
        self.quantize_attn = quantize_attn

        # Per-head-dim normalization of queries and keys.
        self.norm_q = nn.LayerNorm(head_dim)
        self.norm_k = nn.LayerNorm(head_dim)

        if self.quantize_attn:
            self.qkv = LinearQ(dim, dim * 3, bias=qkv_bias, nbits_w=nbits, mode=Qmodes.kernel_wise)
            self.attn_drop = nn.Dropout(attn_drop)
            self.proj = LinearQ(dim, dim, nbits_w=nbits, mode=Qmodes.kernel_wise)
            self.q_act = ActQ(nbits_a=nbits, in_features=self.num_heads)
            self.k_act = ActQ(nbits_a=nbits, in_features=self.num_heads)
            self.v_act = ActQ(nbits_a=nbits, in_features=self.num_heads)
            self.attn_act = ActQ(nbits_a=nbits, in_features=self.num_heads)
        else:
            # Float projections; the activation quantizers are still created.
            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
            self.attn_drop = nn.Dropout(attn_drop)
            self.proj = nn.Linear(dim, dim)
            self.q_act = ActQ(nbits_a=nbits, in_features=self.num_heads)
            self.k_act = ActQ(nbits_a=nbits, in_features=self.num_heads)
            self.v_act = ActQ(nbits_a=nbits, in_features=self.num_heads)
            self.attn_act = ActQ(nbits_a=nbits, in_features=self.num_heads)

        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        # (B, N, 3C) -> (3, B, num_heads, N, head_dim)
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)

        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
        q = self.norm_q(q)
        k = self.norm_k(k)

        # Quantize the attention inputs.
        q = self.q_act(q)
        k = self.k_act(k)
        v = self.v_act(v)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        attn = self.attn_act(attn)  # quantize the attention weights as well

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
class Q_Block(nn.Module):
    """Pre-norm transformer block: quantized attention followed by a quantized MLP.

    Both sub-modules are wrapped with residual connections and (optional)
    stochastic-depth drop-path.
    """

    def __init__(self, nbits, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Q_Attention(nbits, dim, num_heads=num_heads, qkv_bias=qkv_bias,
                                attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        hidden = int(dim * mlp_ratio)
        self.mlp = Q_Mlp(nbits=nbits, in_features=dim, hidden_features=hidden,
                         act_layer=act_layer, drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        return x + self.drop_path(self.mlp(self.norm2(x)))
|
| 226 |
+
|
| 227 |
+
class Q_PatchEmbed(nn.Module):
    """ Image to Patch Embedding

    Splits a (B, C, H, W) image into non-overlapping patches via a strided
    quantized conv and flattens them into a (B, num_patches, embed_dim)
    token sequence.

    NOTE(review): the ``nbits`` argument is accepted but never forwarded to
    Conv2dQ below, so the patch projection presumably runs at Conv2dQ's
    default bit-width — confirm whether ``nbits_w=nbits`` should be passed.
    """
    def __init__(self, nbits=4, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        # Number of patches on each axis multiplied together.
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        # Quantized drop-in replacement for the usual nn.Conv2d patch projection.
        self.proj = Conv2dQ(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        # nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        # (B, embed_dim, H/ps, W/ps) -> (B, num_patches, embed_dim)
        x = self.proj(x).flatten(2).transpose(1, 2)
        return x
|
| 249 |
+
|
| 250 |
+
class lowbit_VisionTransformer(nn.Module):
    """ Vision Transformer
    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
        - https://arxiv.org/abs/2010.11929
    Includes distillation token & head support for `DeiT: Data-efficient Image Transformers`
        - https://arxiv.org/abs/2012.12877

    Low-bit variant: patch embedding uses Q_PatchEmbed, blocks use Q_Block,
    and the classifier head(s) use 8-bit LinearQ.
    """

    def __init__(self, nbits, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=True, representation_size=None, distilled=True,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0., embed_layer=Q_PatchEmbed, norm_layer=None,
                 act_layer=None, weight_init=''):
        """
        Args:
            nbits: bit-width for quantized weights/activations in the blocks
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): number of classes for classification head
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
            distilled (bool): model includes a distillation token and head as in DeiT models
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            embed_layer (nn.Module): patch embedding layer
            norm_layer: (nn.Module): normalization layer
            weight_init: (str): weight init scheme
        """
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        # NOTE(review): distilled defaults to True here (stock ViT defaults to False).
        self.num_tokens = 2 if distilled else 1
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.patch_embed = embed_layer(
            nbits=nbits, img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # Learnable class / distillation tokens and position embedding.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.Sequential(*[
            Q_Block(
                nbits=nbits, dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate,
                attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer)
            for i in range(depth)])
        self.norm = norm_layer(embed_dim)

        # Representation layer (only used for the non-distilled configuration).
        if representation_size and not distilled:
            self.num_features = representation_size
            self.pre_logits = nn.Sequential(OrderedDict([
                ('fc', nn.Linear(embed_dim, representation_size)),
                ('act', nn.Tanh())
            ]))
        else:
            self.pre_logits = nn.Identity()

        # Classifier head(s): 8-bit quantized linear layers.
        self.head = LinearQ(self.num_features, num_classes, nbits_w=8) if num_classes > 0 else nn.Identity()
        # nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
        self.head_dist = None
        if distilled:
            self.head_dist = LinearQ(self.embed_dim, self.num_classes, nbits_w=8) if num_classes > 0 else nn.Identity()
            # self.head = LinearQ(self.embed_dim, self.num_classes, nbits_w=8) if num_classes > 0 else nn.Identity()
            # nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()

        self.init_weights(weight_init)

    def init_weights(self, mode=''):
        # Initialize tokens / pos embedding, then per-module weights.
        assert mode in ('jax', 'jax_nlhb', 'nlhb', '')
        # 'nlhb' = negative log head bias (log of class prior).
        head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0.
        trunc_normal_(self.pos_embed, std=.02)
        if self.dist_token is not None:
            trunc_normal_(self.dist_token, std=.02)
        if mode.startswith('jax'):
            # leave cls token as zeros to match jax impl
            named_apply(partial(_init_vit_weights, head_bias=head_bias, jax_impl=True), self)
        else:
            trunc_normal_(self.cls_token, std=.02)
            self.apply(_init_vit_weights)

    def _init_weights(self, m):
        # this fn left here for compat with downstream users
        _init_vit_weights(m)

    @torch.jit.ignore()
    def load_pretrained(self, checkpoint_path, prefix=''):
        _load_weights(self, checkpoint_path, prefix)

    @torch.jit.ignore
    def no_weight_decay(self):
        # Parameters excluded from weight decay by the optimizer factory.
        return {'pos_embed', 'cls_token', 'dist_token'}

    def get_classifier(self):
        if self.dist_token is None:
            return self.head
        else:
            return self.head, self.head_dist

    def reset_classifier(self, num_classes, global_pool=''):
        # NOTE(review): resets to plain nn.Linear, not LinearQ as in __init__ —
        # confirm whether the new head should stay quantized.
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
        if self.num_tokens == 2:
            self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        # Patchify, prepend class (and optional distillation) token, add pos embed.
        x = self.patch_embed(x)
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        if self.dist_token is None:
            x = torch.cat((cls_token, x), dim=1)
        else:
            x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1)
        x = self.pos_drop(x + self.pos_embed)
        x = self.blocks(x)
        x = self.norm(x)
        if self.dist_token is None:
            return self.pre_logits(x[:, 0])
        else:
            # Return both the class-token and distillation-token features.
            return x[:, 0], x[:, 1]

    def forward(self, x):
        x = self.forward_features(x)
        if self.head_dist is not None:
            x, x_dist = self.head(x[0]), self.head_dist(x[1])  # x must be a tuple
            if self.training and not torch.jit.is_scripting():
                # Training: return both heads separately (for distillation loss);
                # during inference, return the average of both classifier predictions.
                return x, x_dist
            else:
                return (x + x_dist) / 2
        else:
            x = self.head(x)
            return x
|
| 392 |
+
|
| 393 |
+
def _init_vit_weights(module: nn.Module, name: str = '', head_bias: float = 0., jax_impl: bool = False):
|
| 394 |
+
""" ViT weight initialization
|
| 395 |
+
* When called without n, head_bias, jax_impl args it will behave exactly the same
|
| 396 |
+
as my original init for compatibility with prev hparam / downstream use cases (ie DeiT).
|
| 397 |
+
* When called w/ valid n (module name) and jax_impl=True, will (hopefully) match JAX impl
|
| 398 |
+
"""
|
| 399 |
+
if isinstance(module, nn.Linear):
|
| 400 |
+
if name.startswith('head'):
|
| 401 |
+
nn.init.zeros_(module.weight)
|
| 402 |
+
nn.init.constant_(module.bias, head_bias)
|
| 403 |
+
elif name.startswith('pre_logits'):
|
| 404 |
+
lecun_normal_(module.weight)
|
| 405 |
+
nn.init.zeros_(module.bias)
|
| 406 |
+
else:
|
| 407 |
+
if jax_impl:
|
| 408 |
+
nn.init.xavier_uniform_(module.weight)
|
| 409 |
+
if module.bias is not None:
|
| 410 |
+
if 'mlp' in name:
|
| 411 |
+
nn.init.normal_(module.bias, std=1e-6)
|
| 412 |
+
else:
|
| 413 |
+
nn.init.zeros_(module.bias)
|
| 414 |
+
else:
|
| 415 |
+
trunc_normal_(module.weight, std=.02)
|
| 416 |
+
if module.bias is not None:
|
| 417 |
+
nn.init.zeros_(module.bias)
|
| 418 |
+
elif jax_impl and isinstance(module, nn.Conv2d):
|
| 419 |
+
# NOTE conv was left to pytorch default in my original init
|
| 420 |
+
lecun_normal_(module.weight)
|
| 421 |
+
if module.bias is not None:
|
| 422 |
+
nn.init.zeros_(module.bias)
|
| 423 |
+
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm2d)):
|
| 424 |
+
nn.init.zeros_(module.bias)
|
| 425 |
+
nn.init.ones_(module.weight)
|
| 426 |
+
|
| 427 |
+
def resize_pos_embed(posemb, posemb_new):
    """Rescale the grid of position embeddings when loading from a state_dict.

    Adapted from
    https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224

    Args:
        posemb: pretrained pos_embed of shape (1, 1 + gs_old**2, C), class token first.
        posemb_new: target pos_embed; its token count fixes the new grid size.

    Returns:
        pos_embed tensor resized to posemb_new's token count.
    """
    _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape)
    ntok_new = posemb_new.shape[1]
    # BUG FIX: the original guarded this split with a constant `if True:`
    # whose else-branch was unreachable dead code; it has been removed.
    # Behavior is unchanged: the first token is always the class token.
    posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:]
    ntok_new -= 1
    gs_old = int(math.sqrt(len(posemb_grid)))
    gs_new = int(math.sqrt(ntok_new))
    _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new)
    # Reshape to (1, C, gs, gs), bilinearly interpolate, then flatten back.
    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
    posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bilinear')
    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1)
    return torch.cat([posemb_tok, posemb_grid], dim=1)
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
def checkpoint_filter_fn(state_dict, model):
    """ convert patch embedding weight from manual patchify + linear proj to conv"""
    # DeiT checkpoints nest everything under a 'model' key; unwrap it first.
    if 'model' in state_dict:
        state_dict = state_dict['model']
    out_dict = {}
    for key, weight in state_dict.items():
        if 'patch_embed.proj.weight' in key and len(weight.shape) < 4:
            # Old models trained prior to conv-based patchification stored a
            # 2-D linear projection; fold it back into the conv kernel shape.
            O, I, H, W = model.patch_embed.proj.weight.shape
            weight = weight.reshape(O, -1, H, W)
        elif key == 'pos_embed' and weight.shape != model.pos_embed.shape:
            # Resize pos embedding when using the model at a different input size.
            weight = resize_pos_embed(weight, model.pos_embed)
        out_dict[key] = weight
    return out_dict
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
def _create_vision_transformer(variant, pretrained=False, distilled=False, **kwargs):
    """Build a (optionally distilled) VisionTransformer from a named default config.

    NOTE(review): this helper instantiates the stock timm
    VisionTransformer / DistilledVisionTransformer classes, not the
    lowbit_VisionTransformer defined in this file — confirm that is intended.
    """
    default_cfg = default_cfgs[variant]
    default_num_classes = default_cfg['num_classes']
    default_img_size = default_cfg['input_size'][-1]

    # Callers may override classes / image size / representation size via kwargs.
    num_classes = kwargs.pop('num_classes', default_num_classes)
    img_size = kwargs.pop('img_size', default_img_size)
    repr_size = kwargs.pop('representation_size', None)
    if repr_size is not None and num_classes != default_num_classes:
        # Remove representation layer if fine-tuning. This may not always be the desired action,
        # but I feel better than doing nothing by default for fine-tuning. Perhaps a better interface?
        _logger.warning("Removing representation layer for fine-tuning.")
        repr_size = None

    model_cls = DistilledVisionTransformer if distilled else VisionTransformer
    model = model_cls(img_size=img_size, num_classes=num_classes, representation_size=repr_size, **kwargs)
    model.default_cfg = default_cfg

    if pretrained:
        # checkpoint_filter_fn remaps patchify weights and resizes pos_embed on load.
        load_pretrained(
            model, num_classes=num_classes, in_chans=kwargs.get('in_chans', 3),
            filter_fn=partial(checkpoint_filter_fn, model=model))
    return model
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
@register_model
def fourbits_deit_small_patch16_224(pretrained=False, **kwargs):
    """4-bit quantized DeiT (patch16, 224).

    NOTE(review): despite the "small" name this uses embed_dim=192 /
    num_heads=3, which is the DeiT-*tiny* geometry, while the pretrained
    URL points at the distilled *small* (384-dim) checkpoint — confirm the
    intended configuration before relying on ``pretrained=True``.
    """
    model = lowbit_VisionTransformer(
        nbits=4, patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth',
            map_location="cpu", check_hash=True
        )
        # BUG FIX: the original downloaded the checkpoint but discarded the
        # return value, so pretrained=True silently returned random weights.
        # DeiT checkpoints store weights under the 'model' key; strict=False
        # tolerates the extra quantizer parameters of the low-bit modules.
        model.load_state_dict(checkpoint['model'], strict=False)
    return model
|
| 502 |
+
|
| 503 |
+
@register_model
def threebits_deit_small_patch16_224(pretrained=False, **kwargs):
    """3-bit quantized DeiT-small (patch16, 224), embed_dim=384 / 6 heads."""
    model = lowbit_VisionTransformer(
        nbits=3, patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth',
            map_location="cpu", check_hash=True
        )
        # BUG FIX: the original downloaded the checkpoint but discarded the
        # return value, so pretrained=True silently returned random weights.
        # DeiT checkpoints store weights under the 'model' key; strict=False
        # tolerates the extra quantizer parameters of the low-bit modules.
        model.load_state_dict(checkpoint['model'], strict=False)
    return model
|
| 515 |
+
|
| 516 |
+
@register_model
def twobits_deit_small_patch16_224(pretrained=False, **kwargs):
    """2-bit quantized DeiT-small (patch16, 224), embed_dim=384 / 6 heads."""
    model = lowbit_VisionTransformer(
        nbits=2, patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth',
            map_location="cpu", check_hash=True
        )
        # BUG FIX: the original downloaded the checkpoint but discarded the
        # return value, so pretrained=True silently returned random weights.
        # DeiT checkpoints store weights under the 'model' key; strict=False
        # tolerates the extra quantizer parameters of the low-bit modules.
        model.load_state_dict(checkpoint['model'], strict=False)
    return model
|
models/qk_model_v1_1003.py
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from spikingjelly.clock_driven.neuron import MultiStepParametricLIFNode, MultiStepLIFNode
|
| 4 |
+
from timm.models.layers import to_2tuple, trunc_normal_, DropPath
|
| 5 |
+
from timm.models.registry import register_model
|
| 6 |
+
from timm.models.vision_transformer import _cfg
|
| 7 |
+
from functools import partial
|
| 8 |
+
from timm.models import create_model
|
| 9 |
+
|
| 10 |
+
__all__ = ['QKFormer']
|
| 11 |
+
|
| 12 |
+
class MLP(nn.Module):
    """Spiking two-layer MLP operating on (T, B, C, H, W) tensors.

    Each layer is a 1x1 Conv2d + BatchNorm applied on the flattened (T*B)
    batch, followed by a multi-step LIF spiking neuron.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.mlp1_conv = nn.Conv2d(in_features, hidden_features, kernel_size=1, stride=1)
        self.mlp1_bn = nn.BatchNorm2d(hidden_features)
        self.mlp1_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        self.mlp2_conv = nn.Conv2d(hidden_features, out_features, kernel_size=1, stride=1)
        self.mlp2_bn = nn.BatchNorm2d(out_features)
        self.mlp2_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        self.c_hidden = hidden_features
        self.c_output = out_features

    def forward(self, x):
        T, B, C, H, W = x.shape

        x = self.mlp1_conv(x.flatten(0, 1))
        x = self.mlp1_bn(x).reshape(T, B, self.c_hidden, H, W)
        x = self.mlp1_lif(x)

        x = self.mlp2_conv(x.flatten(0, 1))
        # BUG FIX: the original reshaped with the *input* channel count C,
        # which only works when out_features == in_features; use the actual
        # output channel count so out_features != in_features is supported.
        x = self.mlp2_bn(x).reshape(T, B, self.c_output, H, W)
        x = self.mlp2_lif(x)
        return x
|
| 39 |
+
|
| 40 |
+
class Token_QK_Attention(nn.Module):
    """Spiking token-level Q-K attention (linear-complexity, no V projection).

    q is summed over the channel dimension per head, passed through a LIF
    neuron to produce a binary token mask, which gates k elementwise.
    Extra constructor arguments (qk_scale, drops, sr_ratio) are accepted
    for interface compatibility but unused.
    """
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads

        # q branch: 1x1 conv over channels + BN + LIF spike generation.
        self.q_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.q_bn = nn.BatchNorm1d(dim)
        self.q_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        # k branch, same structure as q.
        self.k_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.k_bn = nn.BatchNorm1d(dim)
        self.k_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        # Lower threshold (0.5) so the summed-q attention signal spikes more easily.
        self.attn_lif = MultiStepLIFNode(tau=2.0, v_threshold=0.5, detach_reset=True, backend='torch')

        # Output projection.
        self.proj_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1)
        self.proj_bn = nn.BatchNorm1d(dim)
        self.proj_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

    def forward(self, x):
        # x: (T, B, C, H, W); spatial dims are flattened into N = H*W tokens.
        T, B, C, H, W = x.shape

        x = x.flatten(3)
        T, B, C, N = x.shape
        # Fold time into the batch for the Conv1d/BN ops.
        x_for_qkv = x.flatten(0, 1)

        q_conv_out = self.q_conv(x_for_qkv)
        q_conv_out = self.q_bn(q_conv_out).reshape(T, B, C, N)
        q_conv_out = self.q_lif(q_conv_out)
        # Split channels into heads: (T, B, heads, head_dim, N).
        q = q_conv_out.unsqueeze(2).reshape(T, B, self.num_heads, C // self.num_heads, N)

        k_conv_out = self.k_conv(x_for_qkv)
        k_conv_out = self.k_bn(k_conv_out).reshape(T, B, C, N)
        k_conv_out = self.k_lif(k_conv_out)
        k = k_conv_out.unsqueeze(2).reshape(T, B, self.num_heads, C // self.num_heads, N)

        # Collapse q over head_dim, spike it, and use the result as a
        # per-token gate broadcast over k's channels.
        q = torch.sum(q, dim=3, keepdim=True)
        attn = self.attn_lif(q)
        x = torch.mul(attn, k)

        # Merge heads back to C channels, then project.  Requires N == H*W
        # (true since x was flattened from (H, W) above).
        x = x.flatten(2, 3)
        x = self.proj_bn(self.proj_conv(x.flatten(0, 1))).reshape(T, B, C, H, W)
        # print(f"proj_conv out shape: {x.shape}")
        x = self.proj_lif(x)
        return x
|
| 88 |
+
|
| 89 |
+
class Spiking_Self_Attention(nn.Module):
    """Spiking self-attention computed as q @ (k^T @ v), scaled by a constant.

    All of q/k/v come from 1x1 Conv1d + BN + LIF branches over the token
    dimension.  The (k^T @ v) ordering gives linear complexity in the
    number of tokens.  Extra constructor args are accepted but unused;
    ``qkv_mp`` is also constructed but never used in forward.
    """
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads  # NOTE(review): computed but unused; scale is fixed below
        self.scale = 0.125
        self.q_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.q_bn = nn.BatchNorm1d(dim)
        self.q_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        self.k_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.k_bn = nn.BatchNorm1d(dim)
        self.k_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        self.v_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.v_bn = nn.BatchNorm1d(dim)
        self.v_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')
        # Lower threshold (0.5) for the post-attention spike generation.
        self.attn_lif = MultiStepLIFNode(tau=2.0, v_threshold=0.5, detach_reset=True, backend='torch')

        self.proj_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1)
        self.proj_bn = nn.BatchNorm1d(dim)
        self.proj_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        # NOTE(review): never used in forward — dead module, kept for
        # state_dict compatibility.
        self.qkv_mp = nn.MaxPool1d(4)

    def forward(self, x):
        # x: (T, B, C, H, W); spatial dims flattened into N = H*W tokens.
        T, B, C, H, W = x.shape

        x = x.flatten(3)
        T, B, C, N = x.shape
        # Fold time into batch for Conv1d/BN.
        x_for_qkv = x.flatten(0, 1)

        q_conv_out = self.q_conv(x_for_qkv)
        q_conv_out = self.q_bn(q_conv_out).reshape(T, B, C, N).contiguous()
        q_conv_out = self.q_lif(q_conv_out)
        # (T, B, C, N) -> (T, B, heads, N, head_dim)
        q = q_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                       4).contiguous()

        k_conv_out = self.k_conv(x_for_qkv)
        k_conv_out = self.k_bn(k_conv_out).reshape(T, B, C, N).contiguous()
        k_conv_out = self.k_lif(k_conv_out)
        k = k_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                       4).contiguous()

        v_conv_out = self.v_conv(x_for_qkv)
        v_conv_out = self.v_bn(v_conv_out).reshape(T, B, C, N).contiguous()
        v_conv_out = self.v_lif(v_conv_out)
        v = v_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                       4).contiguous()

        # Linear attention: compute (k^T @ v) first, then q @ (.) — avoids
        # the N x N attention matrix.
        x = k.transpose(-2, -1) @ v
        x = (q @ x) * self.scale

        # Merge heads back to C channels and spike.
        x = x.transpose(3, 4).reshape(T, B, C, N).contiguous()
        x = self.attn_lif(x)
        x = x.flatten(0, 1)
        x = self.proj_lif(self.proj_bn(self.proj_conv(x))).reshape(T, B, C, H, W)
        return x
|
| 149 |
+
|
| 150 |
+
class TokenSpikingTransformer(nn.Module):
    """Spiking transformer block: Token-QK attention + spiking MLP.

    Both sub-modules are wrapped with residual connections.  The extra
    constructor arguments are accepted for interface compatibility but
    are not used.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.tssa = Token_QK_Attention(dim, num_heads)
        hidden = int(dim * mlp_ratio)
        self.mlp = MLP(in_features=dim, hidden_features=hidden, drop=drop)

    def forward(self, x):
        x = x + self.tssa(x)
        return x + self.mlp(x)
|
| 164 |
+
|
| 165 |
+
class SpikingTransformer(nn.Module):
    """Spiking transformer block: linear spiking self-attention + spiking MLP.

    Both sub-modules are wrapped with residual connections.  The extra
    constructor arguments are accepted for interface compatibility but
    are not used.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.ssa = Spiking_Self_Attention(dim, num_heads)
        hidden = int(dim * mlp_ratio)
        self.mlp = MLP(in_features=dim, hidden_features=hidden, drop=drop)

    def forward(self, x):
        x = x + self.ssa(x)
        return x + self.mlp(x)
|
| 179 |
+
|
| 180 |
+
class PatchEmbedInit(nn.Module):
    """Initial spiking patch-embedding stem with an 8x total downsampling.

    Four conv+BN(+maxpool)+LIF stages progressively raise channels from
    in_channels to embed_dims while halving resolution three times, plus
    a strided 1x1 residual path from the first-stage features.
    """
    def __init__(self, img_size_h=128, img_size_w=128, patch_size=4, in_channels=2, embed_dims=256):
        super().__init__()
        self.image_size = [img_size_h, img_size_w]
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size
        self.C = in_channels
        self.H, self.W = self.image_size[0] // patch_size[0], self.image_size[1] // patch_size[1]
        self.num_patches = self.H * self.W

        # Stage 0: channel lift to embed_dims // 8 at full resolution.
        self.proj_conv = nn.Conv2d(in_channels, embed_dims // 8, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj_bn = nn.BatchNorm2d(embed_dims // 8)
        self.proj_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        # Stage 1: embed_dims // 4, 2x downsample via maxpool.
        self.proj1_conv = nn.Conv2d(embed_dims // 8, embed_dims // 4, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj1_bn = nn.BatchNorm2d(embed_dims // 4)
        self.maxpool1 = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        self.proj1_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        # Stage 2: embed_dims // 2, another 2x downsample.
        self.proj2_conv = nn.Conv2d(embed_dims//4, embed_dims // 2, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj2_bn = nn.BatchNorm2d(embed_dims // 2)
        self.maxpool2 = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        self.proj2_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        # Stage 3: full embed_dims, final 2x downsample.
        self.proj3_conv = nn.Conv2d(embed_dims // 2, embed_dims, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj3_bn = nn.BatchNorm2d(embed_dims)
        self.maxpool3 = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        self.proj3_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        # Residual path: stride-4 1x1 conv takes stage-1 features straight to
        # the stage-3 resolution/channels.
        self.proj_res_conv = nn.Conv2d(embed_dims // 4, embed_dims, kernel_size=1, stride=4, padding=0, bias=False)
        self.proj_res_bn = nn.BatchNorm2d(embed_dims)
        self.proj_res_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')


    def forward(self, x):
        # x: (T, B, C, H, W) spike tensor; time is folded into batch around
        # every conv/BN/pool and restored before each LIF layer.
        T, B, C, H, W = x.shape
        # Downsampling + Res
        # x_feat = x.flatten(0, 1)
        x = self.proj_conv(x.flatten(0, 1))
        x = self.proj_bn(x).reshape(T, B, -1, H, W)
        x = self.proj_lif(x).flatten(0, 1).contiguous()

        x = self.proj1_conv(x)
        x = self.proj1_bn(x)
        x = self.maxpool1(x)
        _, _, H1, W1 = x.shape
        x = x.reshape(T, B, -1, H1, W1).contiguous()
        x = self.proj1_lif(x).flatten(0, 1).contiguous()

        # Save stage-1 output for the residual path.
        x_feat = x
        x = self.proj2_conv(x)
        x = self.proj2_bn(x)
        x = self.maxpool2(x)
        _, _, H2, W2 = x.shape
        x = x.reshape(T, B, -1, H2, W2).contiguous()
        x = self.proj2_lif(x).flatten(0, 1).contiguous()

        x = self.proj3_conv(x)
        x = self.proj3_bn(x)
        x = self.maxpool3(x)
        _, _, H3, W3 = x.shape
        x = x.reshape(T, B, -1, H3, W3).contiguous()
        x = self.proj3_lif(x)

        # Residual branch: stride-4 conv matches stage-3 spatial size.
        x_feat = self.proj_res_conv(x_feat)
        x_feat = self.proj_res_bn(x_feat)
        _, _, Hres, Wres = x_feat.shape
        x_feat = x_feat.reshape(T, B, -1, Hres, Wres).contiguous()
        x_feat = self.proj_res_lif(x_feat)
        x = x + x_feat  # shortcut

        return x
|
| 252 |
+
|
| 253 |
+
class PatchEmbeddingStage(nn.Module):
    """Inter-stage patch embedding: doubles channels, halves H and W.

    Main path: 3x3 conv (embed_dims//2 -> embed_dims) + BN + LIF, then
    3x3 conv + BN + stride-2 max-pool + LIF.  A stride-2 1x1 conv residual
    branch taken from the raw input is added to the result.

    NOTE(review): img_size_h/img_size_w/patch_size/in_channels are stored
    but not used by forward(), which works on whatever size it receives.
    """

    def __init__(self, img_size_h=128, img_size_w=128, patch_size=4, in_channels=2, embed_dims=256):
        super().__init__()
        self.image_size = [img_size_h, img_size_w]
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size
        self.C = in_channels  # bookkeeping only; forward ignores it
        self.H, self.W = self.image_size[0] // patch_size[0], self.image_size[1] // patch_size[1]
        self.num_patches = self.H * self.W

        self.proj_conv = nn.Conv2d(embed_dims//2, embed_dims, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj_bn = nn.BatchNorm2d(embed_dims)
        self.proj_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        self.proj4_conv = nn.Conv2d(embed_dims, embed_dims, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj4_bn = nn.BatchNorm2d(embed_dims)
        self.proj4_maxpool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        self.proj4_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

        # Residual: 1x1 stride-2 conv matches the main path's channel count
        # and halved resolution.
        self.proj_res_conv = nn.Conv2d(embed_dims//2, embed_dims, kernel_size=1, stride=2, padding=0, bias=False)
        self.proj_res_bn = nn.BatchNorm2d(embed_dims)
        self.proj_res_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='torch')

    def forward(self, x):
        """x: (T, B, embed_dims//2, H, W) -> (T, B, embed_dims, H/2, W/2)."""
        T, B, C, H, W = x.shape
        # Downsampling + Res

        x = x.flatten(0, 1).contiguous()  # fold T into batch for the 2D ops
        x_feat = x  # tap for the residual branch

        x = self.proj_conv(x)
        x = self.proj_bn(x).reshape(T, B, -1, H, W).contiguous()
        x = self.proj_lif(x).flatten(0, 1).contiguous()

        x = self.proj4_conv(x)
        x = self.proj4_bn(x)
        x = self.proj4_maxpool(x)  # H, W -> H/2, W/2
        _, _, H4, W4 = x.shape
        x = x.reshape(T, B, -1, H4, W4).contiguous()
        x = self.proj4_lif(x)

        x_feat = self.proj_res_conv(x_feat)
        x_feat = self.proj_res_bn(x_feat)
        _, _, Hres, Wres = x_feat.shape
        x_feat = x_feat.reshape(T, B, -1, Hres, Wres).contiguous()
        x_feat = self.proj_res_lif(x_feat)

        x = x + x_feat  # shortcut

        return x
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
class vit_snn(nn.Module):
    """Two-stage spiking vision transformer (QKFormer-style), timm-compatible.

    Pipeline: PatchEmbedInit stem (8x downsample, embed_dims // 2 channels)
    -> TokenSpikingTransformer x1 -> PatchEmbeddingStage (2x downsample,
    embed_dims channels) -> SpikingTransformer x1 -> spatial mean ->
    temporal mean -> linear classification head.

    NOTE(review): despite the list-valued defaults, embed_dims / depths are
    used as plain ints below (``embed_dims // 2``,
    ``torch.linspace(0, ..., depths)``); factory functions such as
    QKFormer_1003 pass ints.  num_heads is hard-overridden to [16, 16, 16].
    """

    def __init__(self,
                 img_size_h=128, img_size_w=128, patch_size=16, in_channels=2, num_classes=11,
                 embed_dims=[64, 128, 256], num_heads=[1, 2, 4], mlp_ratios=[4, 4, 4], qkv_bias=False, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[6, 8, 6], sr_ratios=[8, 4, 2], T=4, pretrained_cfg=None, in_chans = 3, no_weight_decay = None
                 ):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths
        self.T = T  # nominal time steps; forward() takes T from the input shape
        num_heads = [16, 16, 16]  # hard override of the argument
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depths)]  # stochastic depth decay rule

        #
        patch_embed1 = PatchEmbedInit(img_size_h=img_size_h,
                                      img_size_w=img_size_w,
                                      patch_size=patch_size,
                                      in_channels=in_channels,
                                      embed_dims=embed_dims // 2)

        stage1 = nn.ModuleList([TokenSpikingTransformer(
            dim=embed_dims // 2, num_heads=num_heads[0], mlp_ratio=mlp_ratios, qkv_bias=qkv_bias,
            qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[j],
            norm_layer=norm_layer, sr_ratio=sr_ratios)
            for j in range(1)])

        patch_embed2 = PatchEmbeddingStage(img_size_h=img_size_h,
                                           img_size_w=img_size_w,
                                           patch_size=patch_size,
                                           in_channels=in_channels,
                                           embed_dims=embed_dims)

        stage2 = nn.ModuleList([SpikingTransformer(
            dim=embed_dims, num_heads=num_heads[1], mlp_ratio=mlp_ratios, qkv_bias=qkv_bias,
            qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[j],
            norm_layer=norm_layer, sr_ratio=sr_ratios)
            for j in range(1)])

        # setattr mirrors the per-stage loop style of the original repo.
        setattr(self, f"patch_embed1", patch_embed1)
        setattr(self, f"stage1", stage1)
        setattr(self, f"patch_embed2", patch_embed2)
        setattr(self, f"stage2", stage2)

        # classification head
        self.head = nn.Linear(embed_dims, num_classes) if num_classes > 0 else nn.Identity()
        self.apply(self._init_weights)

    @torch.jit.ignore
    def no_weight_decay(self):
        # NOTE(review): likely a typo for 'pos_embed'; neither name exists as
        # a parameter here, so this exclusion set is effectively empty.
        return {'pose_embed'}

    @torch.jit.ignore
    def _get_pos_embed(self, pos_embed, patch_embed, H, W):
        # Positional embeddings are not used by this model.
        return None

    def _init_weights(self, m):
        # trunc-normal init for linear weights; standard LayerNorm init.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward_features(self, x):
        """x: (T, B, C, H, W) -> (T, B, embed_dims) after spatial pooling."""
        stage1 = getattr(self, f"stage1")
        patch_embed1 = getattr(self, f"patch_embed1")
        stage2 = getattr(self, f"stage2")
        patch_embed2 = getattr(self, f"patch_embed2")

        x = patch_embed1(x)
        for blk in stage1:
            x = blk(x)

        x = patch_embed2(x)
        for blk in stage2:
            x = blk(x)

        return x.flatten(3).mean(3)  # average over the spatial positions

    def forward(self, x):
        x = x.permute(1, 0, 2, 3, 4)  # [T, N, 2, *, *]
        x = self.forward_features(x)
        x = self.head(x.mean(0))  # average over time, then classify
        return x
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
@register_model
def QKFormer_1003(pretrained=False, **kwargs):
    """Build the QKFormer_1003 spiking transformer (timm factory entry).

    ``pretrained`` is accepted for create_model() compatibility but is not
    used — no pretrained weights are loaded.  Extra keyword arguments are
    forwarded to ``vit_snn`` unchanged.
    """
    model = vit_snn(
        patch_size=16,
        embed_dims=256,
        num_heads=16,
        mlp_ratios=1,
        in_channels=2,
        num_classes=101,
        qkv_bias=False,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        depths=4,
        sr_ratios=1,
        **kwargs,
    )
    model.default_cfg = _cfg()
    return model
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
from timm.models import create_model
|
| 410 |
+
|
| 411 |
+
if __name__ == '__main__':
    # Smoke test: build the registered model on GPU and push one dummy
    # (B=1, T=1, 2-channel, 128x128) event tensor through it.
    x = torch.randn(1, 1, 2, 128, 128).cuda()
    model = create_model(
        'QKFormer_1003',
        pretrained=False,
        drop_rate=0,
        drop_path_rate=0.1,
        drop_block_rate=None,
    ).cuda()
    model.eval()

    # Parameter/shape summary via torchinfo (imported lazily so the module
    # does not require it at import time).
    from torchinfo import summary
    summary(model, input_size=(1, 1, 2, 128, 128))
    y = model(x)
    print(y.shape)
    print('Test Good!')
|
models/qk_model_with_delay/__init__.py
ADDED
|
File without changes
|
models/qk_model_with_delay/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (221 Bytes). View file
|
|
|
models/qk_model_with_delay/__pycache__/delay_synaptic_func_inter.cpython-311.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
models/qk_model_with_delay/__pycache__/delay_synaptic_inter_model.cpython-311.pyc
ADDED
|
Binary file (30.3 kB). View file
|
|
|
models/qk_model_with_delay/delay_synaptic_func_inter.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from einops import rearrange
|
| 5 |
+
|
| 6 |
+
def set_sigma_for_DCLS(model, s):
    """Assign *s* to the ``sigma`` attribute of every DelayConv submodule.

    Matching is done by class name so the helper works regardless of which
    module defined DelayConv; submodules without a ``sigma`` attribute are
    left untouched.  Used to anneal the delay-bump width during training.
    """
    for _name, sub in model.named_modules():
        looks_like_delay_conv = type(sub).__name__ == 'DelayConv'
        if looks_like_delay_conv and hasattr(sub, 'sigma'):
            sub.sigma = s
    print('Set sigma to ',s)
|
| 12 |
+
|
| 13 |
+
class DropoutNd(nn.Module):
    """Dropout for (batch, dim, lengths...) tensors.

    With ``tie=True`` one mask is shared across all trailing length
    dimensions (like Dropout1d/2d/3d).  ``transposed=False`` means the
    channel dimension is last and is moved in front of the lengths before
    masking.  The mask is drawn with ``torch.rand`` directly on the input's
    device; a no-op in eval mode.
    """

    def __init__(self, p: float = 0.5, tie=True, transposed=True):
        """
        tie: tie dropout mask across sequence lengths (Dropout1d/2d/3d)
        """
        super().__init__()
        if p < 0 or p >= 1:
            raise ValueError("dropout probability has to be in [0, 1), " "but got {}".format(p))
        self.p = p
        self.tie = tie
        self.transposed = transposed
        # Kept for parity with the sampling-based variant; forward() draws
        # its mask with torch.rand instead (CPU->GPU copies made this slow).
        self.binomial = torch.distributions.binomial.Binomial(probs=1-self.p)

    def forward(self, X):
        """X: (batch, dim, lengths...)."""
        if not self.training:
            return X
        if not self.transposed:
            X = rearrange(X, 'b ... d -> b d ...')
        if self.tie:
            mask_shape = X.shape[:2] + (1,) * (X.ndim - 2)
        else:
            mask_shape = X.shape
        keep = torch.rand(*mask_shape, device=X.device) < 1. - self.p
        X = X * keep * (1.0 / (1 - self.p))
        if not self.transposed:
            X = rearrange(X, 'b d ... -> b ... d')
        return X
|
| 38 |
+
|
| 39 |
+
class DelayConv(nn.Module):
    """Temporal convolution with learnable per-channel-pair delays.

    Each (output-channel, input-channel) pair owns ``n_delay`` learnable
    delay positions ``d``.  ``update_kernel`` turns them into a soft bump
    (gaussian / triangle variants) over a length-``k`` causal window,
    multiplies by a learnable weight, and uses the result as a dense
    (groups=1) 1D conv kernel along the time axis.  ``sigma`` controls the
    bump width and is annealed externally via ``set_sigma_for_DCLS``; for
    'triangle_r_temp' the bump is hardened to its argmax in eval mode.
    """

    def __init__(
        self,
        in_c,
        k,
        dropout=0.0,
        n_delay=1,
        dilation=1,
        kernel_type='triangle_r_temp'
    ):
        super().__init__()
        self.C = in_c  # input and output channel count (square kernel)
        self.win_len = k
        self.dilation = dilation
        self.n_delay = n_delay
        self.kernel_type = kernel_type

        # Time grid for the bump functions.  NOTE(review): plain attribute,
        # not a registered buffer, so it is moved to the device manually in
        # update_kernel.
        self.t = torch.arange(self.win_len).float().unsqueeze(0)  # [1, k]
        self.sigma = self.win_len // 2

        self.delay_kernel = None
        self.bump = None

        # Delay positions, shape [C_out, C_in, n_delay]: distinct random
        # integer delays drawn from [1, win_len - 2].
        d = torch.rand(self.C, self.C, self.n_delay)
        with torch.no_grad():
            for co in range(self.C):
                for ci in range(self.C):
                    d[co, ci, :] = torch.randperm(self.win_len - 2)[:self.n_delay] + 1
        self.register("d", d, lr=1e-2)

        # Weights, shape [C_out, C_in, k]: exponentially decaying toward the
        # past — each earlier tap is half of the next one.
        weight = torch.ones([self.C, self.C, k])
        with torch.no_grad():
            for co in range(self.C):  # output channel
                for ci in range(self.C):  # input channel
                    for i in range(k - 2, -1, -1):
                        weight[co, ci, i] = weight[co, ci, i + 1] / 2

        self.weight = nn.Parameter(weight)

        # NOTE(review): dropout probability is deliberately divided by 5.
        self.dropout = nn.Dropout(dropout / 5) if dropout > 0.0 else nn.Identity()

    def register(self, name, tensor, lr=None):
        """Register a trainable parameter (or a buffer when lr == 0.0),
        attaching a per-parameter optimizer hint via a ``_optim`` attribute
        (no weight decay; custom learning rate when *lr* is given)."""
        if lr == 0.0:
            self.register_buffer(name, tensor)
        else:
            self.register_parameter(name, nn.Parameter(tensor))
            optim = {"weight_decay": 0}
            if lr is not None:
                optim["lr"] = lr
            setattr(getattr(self, name), "_optim", optim)

    def update_kernel(self, device):
        """
        Rebuild ``self.delay_kernel`` (shape [C_out, C_in, k]) from the
        current delays, kernel_type, and sigma.
        """
        t = self.t.to(device).view(1, 1, 1, -1)  # [1,1,1,k]
        d = self.d.to(device)  # [C_out, C_in, n_delay]

        # ---------- bump over the causal window ----------
        if self.kernel_type == 'gauss':
            bump = torch.exp(-0.5 * ((t - self.win_len + d.unsqueeze(-1) + 1) / self.sigma) ** 2)
            bump = (bump - 1e-3).relu() + 1e-3
            bump = bump / (bump.sum(dim=-1, keepdim=True) + 1e-7)

        elif self.kernel_type == 'triangle':
            bump = torch.relu(1 - torch.abs((t - self.win_len + d.unsqueeze(-1) + 1) / self.sigma))
            bump = bump / (bump.sum(dim=-1, keepdim=True).detach() + 1e-7)

        elif self.kernel_type == 'triangle_r':
            # Straight-through rounding: integer delay in the forward pass,
            # smooth gradient w.r.t. d in the backward pass.
            d_int = (d.round() - d).detach() + d
            bump = torch.relu(1 - torch.abs((t - self.win_len + d_int.unsqueeze(-1) + 1) / self.sigma))
            bump = bump / (bump.sum(dim=-1, keepdim=True).detach() + 1e-7)

        elif self.kernel_type == 'triangle_r_temp':
            # Temperature-scaled straight-through rounding: the rounding
            # correction fades in as sigma grows past 1.
            scale = min(1.0, 1.0 / self.sigma)
            d_int = (d.round() - d).detach() * scale + d
            bump = torch.relu(1 - torch.abs((t - self.win_len + d_int.unsqueeze(-1) + 1) / self.sigma))
            bump = bump / (bump.sum(dim=-1, keepdim=True).detach() + 1e-7)  # [C_out, C_in, n_delay, k]
            # ------ harden the bump to a one-hot at eval time ------
            if not self.training:
                max_idx = bump.argmax(dim=-1, keepdim=True)  # peak position
                hard_mask = torch.zeros_like(bump)
                hard_mask.scatter_(-1, max_idx, 1.0)
                bump = bump * hard_mask
            # --------------------------------
        else:
            raise ValueError(f"Unknown kernel_type: {self.kernel_type}")

        # bump: [C_out, C_in, n_delay, k]; kept (detached) for inspection.
        self.bump = bump.detach().clone().to(device)

        # ---------- sum over the n_delay axis: [C_out, C_in, k] ----------
        bump_sum = bump.sum(dim=2)

        # ---------- final conv kernel ----------
        # weight: [C_out, C_in, k]
        self.delay_kernel = (self.weight * bump_sum).to(device)  # [C_out, C_in, k]

    def forward(self, x):
        """
        x: (T, B, C, N) — permuted internally; callers pass reshape(T,B,C,N)
        return: (T*B, C, N)
        """
        # rearrange so the time axis ends up last for the 1D convolution
        x = x.permute(0, 1, 3, 2).contiguous()  # (T, B, N, C)
        T, B, N, C = x.shape
        assert C == self.C, f"Input channel mismatch: {C} vs {self.C}"
        x = x.permute(1, 2, 3, 0).contiguous()  # (B, N, C, T)

        # merge B*N as the conv batch
        x_reshaped = x.view(B * N, C, T)  # (B*N, C, T)
        device = x.device

        # rebuild the kernel from the current delays on every call
        self.update_kernel(device)  # -> [C_out, C_in, k]
        kernel = self.delay_kernel

        # left-pad so the convolution is causal (only past time steps seen)
        pad_left = (self.win_len - 1) * self.dilation
        x_padded = F.pad(x_reshaped, (pad_left, 0))  # (B*N, C, T+pad)

        # dense cross-channel convolution: groups=1
        y = F.conv1d(x_padded, kernel, stride=1, dilation=self.dilation, groups=1)  # (B*N, C, T)

        # back to the caller's (T*B, C, N) layout
        y = y.view(B, N, C, T).permute(3, 0, 2, 1).contiguous().view(-1, C, N)  # (T*B, C, N)

        return self.dropout(y)
|
models/qk_model_with_delay/delay_synaptic_inter_model.py
ADDED
|
@@ -0,0 +1,459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from spikingjelly.clock_driven.neuron import MultiStepParametricLIFNode, MultiStepLIFNode
|
| 4 |
+
from timm.models.layers import to_2tuple, trunc_normal_, DropPath
|
| 5 |
+
from timm.models.registry import register_model
|
| 6 |
+
from timm.models.vision_transformer import _cfg
|
| 7 |
+
from functools import partial
|
| 8 |
+
from timm.models import create_model
|
| 9 |
+
from .delay_synaptic_func_inter import DelayConv
|
| 10 |
+
|
| 11 |
+
__all__ = ['delay_QKFormer']
|
| 12 |
+
|
| 13 |
+
class MLP(nn.Module):
    """Two-layer spiking MLP built from 1x1 convolutions.

    Each layer is conv -> batch-norm -> multi-step LIF.  Time and batch are
    folded together for the 2D ops and unfolded again so the LIF neurons
    can integrate over T.  NOTE: the second reshape uses the input channel
    count C, so out_features is expected to equal in_features here.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.):
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features

        self.mlp1_conv = nn.Conv2d(in_features, hidden_features, kernel_size=1, stride=1)
        self.mlp1_bn = nn.BatchNorm2d(hidden_features)
        self.mlp1_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        self.mlp2_conv = nn.Conv2d(hidden_features, out_features, kernel_size=1, stride=1)
        self.mlp2_bn = nn.BatchNorm2d(out_features)
        self.mlp2_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        self.c_hidden = hidden_features
        self.c_output = out_features

    def forward(self, x):
        T, B, C, H, W = x.shape

        h = self.mlp1_conv(x.flatten(0, 1))
        h = self.mlp1_bn(h).reshape(T, B, self.c_hidden, H, W)
        h = self.mlp1_lif(h)

        h = self.mlp2_conv(h.flatten(0, 1))
        h = self.mlp2_bn(h).reshape(T, B, C, H, W)
        return self.mlp2_lif(h)
|
| 40 |
+
|
| 41 |
+
class Token_QK_Attention(nn.Module):
    """Spiking token-QK attention, linear in the token count N.

    Q comes from a 1x1 conv; K comes from a DelayConv along the time axis
    (the original 1x1 k_conv is kept commented out).  Q spikes are summed
    over each head's channel slice and passed through a v_threshold=0.5 LIF,
    producing a per-token spike gate that is multiplied elementwise into K —
    no N x N attention matrix is ever formed.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 sr_ratio=1,
                 k=16):
        # k is the DelayConv temporal window length.  qkv_bias/qk_scale/
        # attn_drop/proj_drop/sr_ratio are accepted for interface
        # compatibility but unused here.
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads

        self.q_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.q_bn = nn.BatchNorm1d(dim)
        self.q_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        # self.k_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        # K projection replaced by a learnable-delay temporal convolution.
        self.k_proj_delay = DelayConv(in_c=self.dim, k=k)
        self.k_bn = nn.BatchNorm1d(dim)
        self.k_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        # Lower threshold (0.5) so the summed-Q gate fires more readily.
        self.attn_lif = MultiStepLIFNode(tau=2.0, v_threshold=0.5, detach_reset=True, backend='cupy')

        self.proj_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1)
        self.proj_bn = nn.BatchNorm1d(dim)
        self.proj_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

    def forward(self, x):
        """x: (T, B, C, H, W) -> (T, B, C, H, W) spike tensor."""
        T, B, C, H, W = x.shape

        x = x.flatten(3)  # (T, B, C, N) with N = H*W
        T, B, C, N = x.shape
        x_for_qkv = x.flatten(0, 1)  # (T*B, C, N) for the 1D convs

        q_conv_out = self.q_conv(x_for_qkv)
        q_conv_out = self.q_bn(q_conv_out).reshape(T, B, C, N)
        q_conv_out = self.q_lif(q_conv_out)
        # NOTE(review): unsqueeze(2) is immediately absorbed by the reshape.
        q = q_conv_out.unsqueeze(2).reshape(T, B, self.num_heads, C // self.num_heads, N)

        # k_conv_out = self.k_conv(x_for_qkv)
        # DelayConv consumes (T, B, C, N) and returns (T*B, C, N).
        k_conv_out = self.k_proj_delay(x_for_qkv.reshape(T,B,C,N))
        k_conv_out = self.k_bn(k_conv_out).reshape(T, B, C, N)
        k_conv_out = self.k_lif(k_conv_out)
        k = k_conv_out.unsqueeze(2).reshape(T, B, self.num_heads, C // self.num_heads, N)

        q = torch.sum(q, dim=3, keepdim=True)  # per-head spike count per token
        attn = self.attn_lif(q)                # spiking token gate
        x = torch.mul(attn, k)                 # gate broadcast over K channels

        x = x.flatten(2, 3)  # merge heads back into C
        x = self.proj_bn(self.proj_conv(x.flatten(0, 1))).reshape(T, B, C, H, W)
        x = self.proj_lif(x)
        return x
|
| 98 |
+
|
| 99 |
+
class Spiking_Self_Attention(nn.Module):
    """Spiking self-attention computed as Q (K^T V) — linear in tokens N.

    Q uses a 1x1 conv; K and V use DelayConv temporal projections (the 1x1
    conv originals are kept commented out).  All operands are spikes, so no
    softmax is used; a fixed 0.125 scale replaces 1/sqrt(head_dim).
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 sr_ratio=1,
                 k=16):
        # k is the DelayConv temporal window length; the remaining keyword
        # arguments are accepted for interface compatibility but unused.
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads  # NOTE(review): unused — scale is fixed below
        self.scale = 0.125
        self.q_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.q_bn = nn.BatchNorm1d(dim)
        self.q_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        # self.k_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.k_proj_delay = DelayConv(in_c=self.dim, k=k)
        self.k_bn = nn.BatchNorm1d(dim)
        self.k_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        # self.v_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1, bias=False)
        self.v_proj_delay = DelayConv(in_c=self.dim, k=k)
        self.v_bn = nn.BatchNorm1d(dim)
        self.v_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')
        self.attn_lif = MultiStepLIFNode(tau=2.0, v_threshold=0.5, detach_reset=True, backend='cupy')

        self.proj_conv = nn.Conv1d(dim, dim, kernel_size=1, stride=1)
        self.proj_bn = nn.BatchNorm1d(dim)
        self.proj_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        self.qkv_mp = nn.MaxPool1d(4)  # NOTE(review): defined but never used in forward

    def forward(self, x):
        """x: (T, B, C, H, W) -> (T, B, C, H, W) spike tensor."""
        T, B, C, H, W = x.shape

        x = x.flatten(3)  # (T, B, C, N), N = H*W
        T, B, C, N = x.shape
        x_for_qkv = x.flatten(0, 1)  # (T*B, C, N)

        q_conv_out = self.q_conv(x_for_qkv)
        q_conv_out = self.q_bn(q_conv_out).reshape(T, B, C, N).contiguous()
        q_conv_out = self.q_lif(q_conv_out)
        # split heads: (T, B, heads, N, C // heads)
        q = q_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                       4).contiguous()

        # DelayConv consumes (T, B, C, N), returns (T*B, C, N).
        k_conv_out = self.k_proj_delay(x_for_qkv.reshape(T,B,C,N))
        k_conv_out = self.k_bn(k_conv_out).reshape(T, B, C, N).contiguous()
        k_conv_out = self.k_lif(k_conv_out)
        k = k_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                       4).contiguous()

        v_conv_out = self.v_proj_delay(x_for_qkv.reshape(T,B,C,N))
        v_conv_out = self.v_bn(v_conv_out).reshape(T, B, C, N).contiguous()
        v_conv_out = self.v_lif(v_conv_out)
        v = v_conv_out.transpose(-1, -2).reshape(T, B, N, self.num_heads, C // self.num_heads).permute(0, 1, 3, 2,
                                                                                                       4).contiguous()

        # K^T V first (head_dim x head_dim), then Q — avoids the N x N matrix.
        x = k.transpose(-2, -1) @ v
        x = (q @ x) * self.scale

        x = x.transpose(3, 4).reshape(T, B, C, N).contiguous()
        x = self.attn_lif(x)
        x = x.flatten(0, 1)
        # NOTE(review): proj_lif is applied to the folded (T*B, C, N) tensor
        # here, so its first (time) axis is T*B rather than T — confirm this
        # is intended.
        x = self.proj_lif(self.proj_bn(self.proj_conv(x))).reshape(T, B, C, H, W)
        return x
|
| 169 |
+
|
| 170 |
+
class TokenSpikingTransformer(nn.Module):
    """Stage-1 block: token-QK attention followed by a conv MLP.

    Both sub-modules are spiking and are combined with plain additive
    residuals.  The remaining constructor arguments (qkv_bias, qk_scale,
    drop*, norm_layer, sr_ratio) exist only for signature compatibility
    with standard transformer blocks and are not forwarded.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.tssa = Token_QK_Attention(dim, num_heads)
        hidden = int(dim * mlp_ratio)
        self.mlp = MLP(in_features=dim, hidden_features=hidden, drop=drop)

    def forward(self, x):
        out = x + self.tssa(x)
        return out + self.mlp(out)
|
| 184 |
+
|
| 185 |
+
class SpikingTransformer(nn.Module):
    """Stage-2 block: spiking self-attention followed by a conv MLP.

    Attention and MLP outputs are added back through plain residuals.  The
    extra constructor arguments (qkv_bias, qk_scale, drop*, norm_layer,
    sr_ratio) are accepted only for signature compatibility and are not
    forwarded to the sub-modules.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.ssa = Spiking_Self_Attention(dim, num_heads)
        hidden = int(dim * mlp_ratio)
        self.mlp = MLP(in_features=dim, hidden_features=hidden, drop=drop)

    def forward(self, x):
        out = x + self.ssa(x)
        return out + self.mlp(out)
|
| 199 |
+
|
| 200 |
+
class PatchEmbedInit(nn.Module):
    """Spiking stem: 8x spatial downsampling of raw event frames.

    Four conv+BN+LIF stages (the last three followed by stride-2 max-pools)
    take (T, B, in_channels, H, W) to (T, B, embed_dims, H/8, W/8); a
    stride-4 1x1 conv residual branch, tapped after the first pool, is
    added at the output.  The img_size/patch_size attributes are stored
    for bookkeeping only — forward() works on whatever size it receives.
    """

    def __init__(self, img_size_h=128, img_size_w=128, patch_size=4, in_channels=2, embed_dims=256):
        super().__init__()
        self.image_size = [img_size_h, img_size_w]
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size
        self.C = in_channels
        self.H, self.W = self.image_size[0] // patch_size[0], self.image_size[1] // patch_size[1]
        self.num_patches = self.H * self.W

        self.proj_conv = nn.Conv2d(in_channels, embed_dims // 8, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj_bn = nn.BatchNorm2d(embed_dims // 8)
        self.proj_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        self.proj1_conv = nn.Conv2d(embed_dims // 8, embed_dims // 4, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj1_bn = nn.BatchNorm2d(embed_dims // 4)
        self.maxpool1 = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        self.proj1_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        self.proj2_conv = nn.Conv2d(embed_dims//4, embed_dims // 2, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj2_bn = nn.BatchNorm2d(embed_dims // 2)
        self.maxpool2 = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        self.proj2_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        self.proj3_conv = nn.Conv2d(embed_dims // 2, embed_dims, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj3_bn = nn.BatchNorm2d(embed_dims)
        self.maxpool3 = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        self.proj3_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        # Residual: taps the embed_dims // 4 feature map at 1/2 resolution
        # and matches the main path with a stride-4 1x1 conv.
        self.proj_res_conv = nn.Conv2d(embed_dims // 4, embed_dims, kernel_size=1, stride=4, padding=0, bias=False)
        self.proj_res_bn = nn.BatchNorm2d(embed_dims)
        self.proj_res_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

    def forward(self, x):
        T, B, C, H, W = x.shape
        # Downsampling + Res
        # x_feat = x.flatten(0, 1)
        # Fold time into batch for Conv2d, unfold for the multi-step LIF.
        x = self.proj_conv(x.flatten(0, 1))
        x = self.proj_bn(x).reshape(T, B, -1, H, W)
        x = self.proj_lif(x).flatten(0, 1).contiguous()

        x = self.proj1_conv(x)
        x = self.proj1_bn(x)
        x = self.maxpool1(x)  # -> H/2, W/2
        _, _, H1, W1 = x.shape
        x = x.reshape(T, B, -1, H1, W1).contiguous()
        x = self.proj1_lif(x).flatten(0, 1).contiguous()

        x_feat = x  # residual tap at 1/2 resolution
        x = self.proj2_conv(x)
        x = self.proj2_bn(x)
        x = self.maxpool2(x)  # -> H/4, W/4
        _, _, H2, W2 = x.shape
        x = x.reshape(T, B, -1, H2, W2).contiguous()
        x = self.proj2_lif(x).flatten(0, 1).contiguous()

        x = self.proj3_conv(x)
        x = self.proj3_bn(x)
        x = self.maxpool3(x)  # -> H/8, W/8
        _, _, H3, W3 = x.shape
        x = x.reshape(T, B, -1, H3, W3).contiguous()
        x = self.proj3_lif(x)

        x_feat = self.proj_res_conv(x_feat)  # stride 4: 1/2 -> 1/8 resolution
        x_feat = self.proj_res_bn(x_feat)
        _, _, Hres, Wres = x_feat.shape
        x_feat = x_feat.reshape(T, B, -1, Hres, Wres).contiguous()
        x_feat = self.proj_res_lif(x_feat)
        x = x + x_feat  # shortcut

        return x
|
| 272 |
+
|
| 273 |
+
class PatchEmbeddingStage(nn.Module):
    """Inter-stage patch embedding: doubles channels, halves H and W.

    Main path: 3x3 conv (embed_dims//2 -> embed_dims) + BN + LIF, then
    3x3 conv + BN + stride-2 max-pool + LIF.  A stride-2 1x1 conv residual
    branch taken from the raw input is added to the result.

    NOTE(review): img_size_h/img_size_w/patch_size/in_channels are stored
    but not used by forward(), which works on whatever size it receives.
    """

    def __init__(self, img_size_h=128, img_size_w=128, patch_size=4, in_channels=2, embed_dims=256):
        super().__init__()
        self.image_size = [img_size_h, img_size_w]
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size
        self.C = in_channels  # bookkeeping only; forward ignores it
        self.H, self.W = self.image_size[0] // patch_size[0], self.image_size[1] // patch_size[1]
        self.num_patches = self.H * self.W

        self.proj_conv = nn.Conv2d(embed_dims//2, embed_dims, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj_bn = nn.BatchNorm2d(embed_dims)
        self.proj_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        self.proj4_conv = nn.Conv2d(embed_dims, embed_dims, kernel_size=3, stride=1, padding=1, bias=False)
        self.proj4_bn = nn.BatchNorm2d(embed_dims)
        self.proj4_maxpool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        self.proj4_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

        # Residual: 1x1 stride-2 conv matches the main path's channel count
        # and halved resolution.
        self.proj_res_conv = nn.Conv2d(embed_dims//2, embed_dims, kernel_size=1, stride=2, padding=0, bias=False)
        self.proj_res_bn = nn.BatchNorm2d(embed_dims)
        self.proj_res_lif = MultiStepLIFNode(tau=2.0, detach_reset=True, backend='cupy')

    def forward(self, x):
        """x: (T, B, embed_dims//2, H, W) -> (T, B, embed_dims, H/2, W/2)."""
        T, B, C, H, W = x.shape
        # Downsampling + Res

        x = x.flatten(0, 1).contiguous()  # fold T into batch for the 2D ops
        x_feat = x  # tap for the residual branch

        x = self.proj_conv(x)
        x = self.proj_bn(x).reshape(T, B, -1, H, W).contiguous()
        x = self.proj_lif(x).flatten(0, 1).contiguous()

        x = self.proj4_conv(x)
        x = self.proj4_bn(x)
        x = self.proj4_maxpool(x)  # H, W -> H/2, W/2
        _, _, H4, W4 = x.shape
        x = x.reshape(T, B, -1, H4, W4).contiguous()
        x = self.proj4_lif(x)

        x_feat = self.proj_res_conv(x_feat)
        x_feat = self.proj_res_bn(x_feat)
        _, _, Hres, Wres = x_feat.shape
        x_feat = x_feat.reshape(T, B, -1, Hres, Wres).contiguous()
        x_feat = self.proj_res_lif(x_feat)

        x = x + x_feat  # shortcut

        return x
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
class vit_snn(nn.Module):
    """Two-stage spiking vision transformer.

    Stage 1 embeds the input at embed_dims // 2 channels and applies
    TokenSpikingTransformer blocks; stage 2 downsamples to embed_dims channels
    and applies SpikingTransformer blocks. Features are globally average-pooled
    over space and time before the linear classification head.

    Expects input of shape [N, T, C, H, W]; it is permuted to [T, N, C, H, W]
    internally.
    """

    def __init__(self,
                 img_size_h=128, img_size_w=128, patch_size=16, in_channels=2, num_classes=11,
                 embed_dims=[64, 128, 256], num_heads=[1, 2, 4], mlp_ratios=[4, 4, 4], qkv_bias=False, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[6, 8, 6], sr_ratios=[8, 4, 2], T=4, pretrained_cfg=None, in_chans = 3, no_weight_decay = None
                 ):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths
        self.T = T
        # NOTE(review): the num_heads argument is overridden here with a fixed
        # value — confirm this is intentional.
        num_heads = [16, 16, 16]
        # stochastic depth decay rule; depths is expected to be an int here
        # (the registered factory passes depths=4), not the list default.
        dpr = [rate.item() for rate in torch.linspace(0, drop_path_rate, depths)]

        # --- stage 1: initial embedding at half the embedding width ---
        self.patch_embed1 = PatchEmbedInit(img_size_h=img_size_h,
                                           img_size_w=img_size_w,
                                           patch_size=patch_size,
                                           in_channels=in_channels,
                                           embed_dims=embed_dims // 2)

        self.stage1 = nn.ModuleList([TokenSpikingTransformer(
            dim=embed_dims // 2, num_heads=num_heads[0], mlp_ratio=mlp_ratios, qkv_bias=qkv_bias,
            qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[j],
            norm_layer=norm_layer, sr_ratio=sr_ratios)
            for j in range(1)])

        # --- stage 2: downsampling embedding + transformer blocks ---
        self.patch_embed2 = PatchEmbeddingStage(img_size_h=img_size_h,
                                                img_size_w=img_size_w,
                                                patch_size=patch_size,
                                                in_channels=in_channels,
                                                embed_dims=embed_dims)

        self.stage2 = nn.ModuleList([SpikingTransformer(
            dim=embed_dims, num_heads=num_heads[1], mlp_ratio=mlp_ratios, qkv_bias=qkv_bias,
            qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[j],
            norm_layer=norm_layer, sr_ratio=sr_ratios)
            for j in range(1)])

        # classification head
        self.head = nn.Linear(embed_dims, num_classes) if num_classes > 0 else nn.Identity()
        self.apply(self._init_weights)

    @torch.jit.ignore
    def no_weight_decay(self):
        # parameter names excluded from weight decay by the training loop
        return {'pose_embed'}

    @torch.jit.ignore
    def _get_pos_embed(self, pos_embed, patch_embed, H, W):
        # positional embeddings are not used by this model
        return None

    def _init_weights(self, m):
        """Truncated-normal init for Linear layers, standard init for LayerNorm."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward_features(self, x):
        """Run both stages and pool spatially. Returns [T, B, embed_dims]."""
        x = self.patch_embed1(x)
        for block in self.stage1:
            x = block(x)

        x = self.patch_embed2(x)
        for block in self.stage2:
            x = block(x)

        # global average pool over the spatial dimensions
        return x.flatten(3).mean(3)

    def forward(self, x):
        # [N, T, C, H, W] -> [T, N, C, H, W]
        x = x.permute(1, 0, 2, 3, 4)
        x = self.forward_features(x)
        # average over time steps before classification
        return self.head(x.mean(0))
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
@register_model
def delay_QKFormer(pretrained=False, **kwargs):
    """timm factory for the delay QKFormer configuration (101 classes, 2-channel
    event input, 256-dim embedding). `pretrained` is accepted for timm
    compatibility but no pretrained weights are loaded."""
    config = dict(
        patch_size=16, embed_dims=256, num_heads=16, mlp_ratios=4,
        in_channels=2, num_classes=101, qkv_bias=False,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=4, sr_ratios=1,
    )
    model = vit_snn(**config, **kwargs)
    model.default_cfg = _cfg()
    return model
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
from timm.models import create_model
|
| 433 |
+
|
| 434 |
+
if __name__ == '__main__':
    # smoke test: build the registered model on GPU and print a layer summary
    dummy = torch.randn(1, 1, 2, 128, 128).cuda()
    net = create_model(
        'delay_QKFormer',
        pretrained=False,
        drop_rate=0,
        drop_path_rate=0.1,
        drop_block_rate=None,
    ).cuda()
    net.eval()

    from torchinfo import summary
    summary(net, input_size=(1, 1, 2, 128, 128))
    # y = net(dummy)
    # print(y.shape)
    # print('Test Good!')
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
|