Spaces:

CurHarsh
/

COC-VIZ

Sleeping

App Files Files Community

CurHarsh commited on May 24, 2023

Commit

0ff5906

1 Parent(s): 3781a48

Update models/context_cluster.py

Browse files

Files changed (1) hide show

models/context_cluster.py +847 -0

models/context_cluster.py CHANGED Viewed

	@@ -0,0 +1,847 @@

+"""
+ContextCluster implementation
+# --------------------------------------------------------
+# Context Cluster -- Image as Set of Points, ICLR'23 Oral
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xu Ma (ma.xu1@northeastern.com)
+# --------------------------------------------------------
+"""
+import os
+import copy
+import torch
+import torch.nn as nn
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from timm.models.layers import DropPath, trunc_normal_
+from timm.models.registry import register_model
+from timm.models.layers.helpers import to_2tuple
+from einops import rearrange
+import torch.nn.functional as F
+try:
+    from mmseg.models.builder import BACKBONES as seg_BACKBONES
+    from mmseg.utils import get_root_logger
+    from mmcv.runner import _load_checkpoint
+    has_mmseg = True
+except ImportError:
+    print("If for semantic segmentation, please install mmsegmentation first")
+    has_mmseg = False
+try:
+    from mmdet.models.builder import BACKBONES as det_BACKBONES
+    from mmdet.utils import get_root_logger
+    from mmcv.runner import _load_checkpoint
+    has_mmdet = True
+except ImportError:
+    print("If for detection, please install mmdetection first")
+    has_mmdet = False
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224),
+        'crop_pct': .95, 'interpolation': 'bicubic',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'classifier': 'head',
+        **kwargs
+    }
+default_cfgs = {
+    'model_small': _cfg(crop_pct=0.9),
+    'model_medium': _cfg(crop_pct=0.95),
+}
+class PointRecuder(nn.Module):
+    """
+    Point Reducer is implemented by a layer of conv since it is mathmatically equal.
+    Input: tensor in shape [B, in_chans, H, W]
+    Output: tensor in shape [B, embed_dim, H/stride, W/stride]
+    """
+    def __init__(self, patch_size=16, stride=16, padding=0,
+                 in_chans=3, embed_dim=768, norm_layer=None):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        stride = to_2tuple(stride)
+        padding = to_2tuple(padding)
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size,
+                              stride=stride, padding=padding)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x):
+        x = self.proj(x)
+        x = self.norm(x)
+        return x
+class GroupNorm(nn.GroupNorm):
+    """
+    Group Normalization with 1 group.
+    Input: tensor in shape [B, C, H, W]
+    """
+    def __init__(self, num_channels, **kwargs):
+        super().__init__(1, num_channels, **kwargs)
+def pairwise_cos_sim(x1: torch.Tensor, x2: torch.Tensor):
+    """
+    return pair-wise similarity matrix between two tensors
+    :param x1: [B,...,M,D]
+    :param x2: [B,...,N,D]
+    :return: similarity matrix [B,...,M,N]
+    """
+    x1 = F.normalize(x1, dim=-1)
+    x2 = F.normalize(x2, dim=-1)
+    sim = torch.matmul(x1, x2.transpose(-2, -1))
+    return sim
+class Cluster(nn.Module):
+    def __init__(self, dim, out_dim, proposal_w=2, proposal_h=2, fold_w=2, fold_h=2, heads=4, head_dim=24,
+                 return_center=False):
+        """
+        :param dim:  channel nubmer
+        :param out_dim: channel nubmer
+        :param proposal_w: the sqrt(proposals) value, we can also set a different value
+        :param proposal_h: the sqrt(proposals) value, we can also set a different value
+        :param fold_w: the sqrt(number of regions) value, we can also set a different value
+        :param fold_h: the sqrt(number of regions) value, we can also set a different value
+        :param heads:  heads number in context cluster
+        :param head_dim: dimension of each head in context cluster
+        :param return_center: if just return centers instead of dispatching back (deprecated).
+        """
+        super().__init__()
+        self.heads = heads
+        self.head_dim = head_dim
+        self.f = nn.Conv2d(dim, heads * head_dim, kernel_size=1)  # for similarity
+        self.proj = nn.Conv2d(heads * head_dim, out_dim, kernel_size=1)  # for projecting channel number
+        self.v = nn.Conv2d(dim, heads * head_dim, kernel_size=1)  # for value
+        self.sim_alpha = nn.Parameter(torch.ones(1))
+        self.sim_beta = nn.Parameter(torch.zeros(1))
+        self.centers_proposal = nn.AdaptiveAvgPool2d((proposal_w, proposal_h))
+        self.fold_w = fold_w
+        self.fold_h = fold_h
+        self.return_center = return_center
+    def forward(self, x):  # [b,c,w,h]
+        value = self.v(x)
+        x = self.f(x)
+        x = rearrange(x, "b (e c) w h -> (b e) c w h", e=self.heads)
+        value = rearrange(value, "b (e c) w h -> (b e) c w h", e=self.heads)
+        if self.fold_w > 1 and self.fold_h > 1:
+            # split the big feature maps to small local regions to reduce computations.
+            b0, c0, w0, h0 = x.shape
+            assert w0 % self.fold_w == 0 and h0 % self.fold_h == 0, \
+                f"Ensure the feature map size ({w0}*{h0}) can be divided by fold {self.fold_w}*{self.fold_h}"
+            x = rearrange(x, "b c (f1 w) (f2 h) -> (b f1 f2) c w h", f1=self.fold_w,
+                          f2=self.fold_h)  # [bs*blocks,c,ks[0],ks[1]]
+            value = rearrange(value, "b c (f1 w) (f2 h) -> (b f1 f2) c w h", f1=self.fold_w, f2=self.fold_h)
+        b, c, w, h = x.shape
+        centers = self.centers_proposal(x)  # [b,c,C_W,C_H], we set M = C_W*C_H and N = w*h
+        value_centers = rearrange(self.centers_proposal(value), 'b c w h -> b (w h) c')  # [b,C_W,C_H,c]
+        b, c, ww, hh = centers.shape
+        sim = torch.sigmoid(
+            self.sim_beta +
+            self.sim_alpha * pairwise_cos_sim(
+                centers.reshape(b, c, -1).permute(0, 2, 1),
+                x.reshape(b, c, -1).permute(0, 2, 1)
+            )
+        )  # [B,M,N]
+        # we use mask to sololy assign each point to one center
+        sim_max, sim_max_idx = sim.max(dim=1, keepdim=True)
+        mask = torch.zeros_like(sim)  # binary #[B,M,N]
+        mask.scatter_(1, sim_max_idx, 1.)
+        sim = sim * mask
+        value2 = rearrange(value, 'b c w h -> b (w h) c')  # [B,N,D]
+        # aggregate step, out shape [B,M,D]
+        ###
+        # Update Comment: Mar/26/2022
+        # a small bug: mask.sum should be sim.sum according to Eq. (1), mask can be considered as a hard version of sim in out implementation.
+        # We will update all checkpoints and the bug once all models are re-trained.
+        ###
+        out = ((value2.unsqueeze(dim=1) * sim.unsqueeze(dim=-1)).sum(dim=2) + value_centers) / (
+                    mask.sum(dim=-1, keepdim=True) + 1.0)  # [B,M,D]
+        if self.return_center:
+            out = rearrange(out, "b (w h) c -> b c w h", w=ww)
+        else:
+            # dispatch step, return to each point in a cluster
+            out = (out.unsqueeze(dim=2) * sim.unsqueeze(dim=-1)).sum(dim=1)  # [B,N,D]
+            out = rearrange(out, "b (w h) c -> b c w h", w=w)
+        if self.fold_w > 1 and self.fold_h > 1:
+            # recover the splited regions back to big feature maps if use the region partition.
+            out = rearrange(out, "(b f1 f2) c w h -> b c (f1 w) (f2 h)", f1=self.fold_w, f2=self.fold_h)
+        out = rearrange(out, "(b e) c w h -> b (e c) w h", e=self.heads)
+        out = self.proj(out)
+        return out
+class Mlp(nn.Module):
+    """
+    Implementation of MLP with nn.Linear (would be slightly faster in both training and inference).
+    Input: tensor with shape [B, C, H, W]
+    """
+    def __init__(self, in_features, hidden_features=None,
+                 out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+    def forward(self, x):
+        x = self.fc1(x.permute(0, 2, 3, 1))
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x).permute(0, 3, 1, 2)
+        x = self.drop(x)
+        return x
+class ClusterBlock(nn.Module):
+    """
+    Implementation of one block.
+    --dim: embedding dim
+    --mlp_ratio: mlp expansion ratio
+    --act_layer: activation
+    --norm_layer: normalization
+    --drop: dropout rate
+    --drop path: Stochastic Depth,
+        refer to https://arxiv.org/abs/1603.09382
+    --use_layer_scale, --layer_scale_init_value: LayerScale,
+        refer to https://arxiv.org/abs/2103.17239
+    """
+    def __init__(self, dim, mlp_ratio=4.,
+                 act_layer=nn.GELU, norm_layer=GroupNorm,
+                 drop=0., drop_path=0.,
+                 use_layer_scale=True, layer_scale_init_value=1e-5,
+                 # for context-cluster
+                 proposal_w=2, proposal_h=2, fold_w=2, fold_h=2, heads=4, head_dim=24, return_center=False):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        # dim, out_dim, proposal_w=2,proposal_h=2, fold_w=2, fold_h=2, heads=4, head_dim=24, return_center=False
+        self.token_mixer = Cluster(dim=dim, out_dim=dim, proposal_w=proposal_w, proposal_h=proposal_h,
+                                   fold_w=fold_w, fold_h=fold_h, heads=heads, head_dim=head_dim, return_center=False)
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer, drop=drop)
+        # The following two techniques are useful to train deep ContextClusters.
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.use_layer_scale = use_layer_scale
+        if use_layer_scale:
+            self.layer_scale_1 = nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
+            self.layer_scale_2 = nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
+    def forward(self, x):
+        if self.use_layer_scale:
+            x = x + self.drop_path(
+                self.layer_scale_1.unsqueeze(-1).unsqueeze(-1)
+                * self.token_mixer(self.norm1(x)))
+            x = x + self.drop_path(
+                self.layer_scale_2.unsqueeze(-1).unsqueeze(-1)
+                * self.mlp(self.norm2(x)))
+        else:
+            x = x + self.drop_path(self.token_mixer(self.norm1(x)))
+            x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+def basic_blocks(dim, index, layers,
+                 mlp_ratio=4.,
+                 act_layer=nn.GELU, norm_layer=GroupNorm,
+                 drop_rate=.0, drop_path_rate=0.,
+                 use_layer_scale=True, layer_scale_init_value=1e-5,
+                 # for context-cluster
+                 proposal_w=2, proposal_h=2, fold_w=2, fold_h=2, heads=4, head_dim=24, return_center=False):
+    blocks = []
+    for block_idx in range(layers[index]):
+        block_dpr = drop_path_rate * ( block_idx + sum(layers[:index])) / (sum(layers) - 1)
+        blocks.append(ClusterBlock(
+            dim, mlp_ratio=mlp_ratio,
+            act_layer=act_layer, norm_layer=norm_layer,
+            drop=drop_rate, drop_path=block_dpr,
+            use_layer_scale=use_layer_scale,
+            layer_scale_init_value=layer_scale_init_value,
+            proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+            heads=heads, head_dim=head_dim, return_center=False
+        ))
+    blocks = nn.Sequential(*blocks)
+    return blocks
+class ContextCluster(nn.Module):
+    """
+    ContextCluster, the main class of our model
+    --layers: [x,x,x,x], number of blocks for the 4 stages
+    --embed_dims, --mlp_ratios, the embedding dims, mlp ratios
+    --downsamples: flags to apply downsampling or not
+    --norm_layer, --act_layer: define the types of normalization and activation
+    --num_classes: number of classes for the image classification
+    --in_patch_size, --in_stride, --in_pad: specify the patch embedding
+        for the input image
+    --down_patch_size --down_stride --down_pad:
+        specify the downsample (patch embed.)
+    --fork_feat: whether output features of the 4 stages, for dense prediction
+    --init_cfg, --pretrained:
+        for mmdetection and mmsegmentation to load pretrained weights
+    """
+    def __init__(self, layers, embed_dims=None,
+                 mlp_ratios=None, downsamples=None,
+                 norm_layer=nn.BatchNorm2d, act_layer=nn.GELU,
+                 num_classes=1000,
+                 in_patch_size=4, in_stride=4, in_pad=0,
+                 down_patch_size=2, down_stride=2, down_pad=0,
+                 drop_rate=0., drop_path_rate=0.,
+                 use_layer_scale=True, layer_scale_init_value=1e-5,
+                 fork_feat=False,
+                 init_cfg=None,
+                 pretrained=None,
+                 # the parameters for context-cluster
+                 proposal_w=[2, 2, 2, 2], proposal_h=[2, 2, 2, 2], fold_w=[8, 4, 2, 1], fold_h=[8, 4, 2, 1],
+                 heads=[2, 4, 6, 8], head_dim=[16, 16, 32, 32],
+                 **kwargs):
+        super().__init__()
+        if not fork_feat:
+            self.num_classes = num_classes
+        self.fork_feat = fork_feat
+        self.patch_embed = PointRecuder(
+            patch_size=in_patch_size, stride=in_stride, padding=in_pad,
+            in_chans=5, embed_dim=embed_dims[0])
+        # set the main block in network
+        network = []
+        for i in range(len(layers)):
+            stage = basic_blocks(embed_dims[i], i, layers,
+                                 mlp_ratio=mlp_ratios[i],
+                                 act_layer=act_layer, norm_layer=norm_layer,
+                                 drop_rate=drop_rate,
+                                 drop_path_rate=drop_path_rate,
+                                 use_layer_scale=use_layer_scale,
+                                 layer_scale_init_value=layer_scale_init_value,
+                                 proposal_w=proposal_w[i], proposal_h=proposal_h[i],
+                                 fold_w=fold_w[i], fold_h=fold_h[i], heads=heads[i], head_dim=head_dim[i],
+                                 return_center=False
+                                 )
+            network.append(stage)
+            if i >= len(layers) - 1:
+                break
+            if downsamples[i] or embed_dims[i] != embed_dims[i + 1]:
+                # downsampling between two stages
+                network.append(
+                    PointRecuder(
+                        patch_size=down_patch_size, stride=down_stride,
+                        padding=down_pad,
+                        in_chans=embed_dims[i], embed_dim=embed_dims[i + 1]
+                    )
+                )
+        self.network = nn.ModuleList(network)
+        if self.fork_feat:
+            # add a norm layer for each output
+            self.out_indices = [0, 2, 4, 6]
+            for i_emb, i_layer in enumerate(self.out_indices):
+                if i_emb == 0 and os.environ.get('FORK_LAST3', None):
+                    # TODO: more elegant way
+                    """For RetinaNet, `start_level=1`. The first norm layer will not used.
+                    cmd: `FORK_LAST3=1 python -m torch.distributed.launch ...`
+                    """
+                    layer = nn.Identity()
+                else:
+                    layer = norm_layer(embed_dims[i_emb])
+                layer_name = f'norm{i_layer}'
+                self.add_module(layer_name, layer)
+        else:
+            # Classifier head
+            self.norm = norm_layer(embed_dims[-1])
+            self.head = nn.Linear(
+                embed_dims[-1], num_classes) if num_classes > 0 \
+                else nn.Identity()
+        self.apply(self.cls_init_weights)
+        self.init_cfg = copy.deepcopy(init_cfg)
+        # load pre-trained model
+        if self.fork_feat and (
+                self.init_cfg is not None or pretrained is not None):
+            self.init_weights()
+    # init for classification
+    def cls_init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+    # init for mmdetection or mmsegmentation by loading
+    # imagenet pre-trained weights
+    def init_weights(self, pretrained=None):
+        logger = get_root_logger()
+        if self.init_cfg is None and pretrained is None:
+            logger.warn(f'No pre-trained weights for '
+                        f'{self.__class__.__name__}, '
+                        f'training start from scratch')
+            pass
+        else:
+            assert 'checkpoint' in self.init_cfg, f'Only support ' \
+                                                  f'specify `Pretrained` in ' \
+                                                  f'`init_cfg` in ' \
+                                                  f'{self.__class__.__name__} '
+            if self.init_cfg is not None:
+                ckpt_path = self.init_cfg['checkpoint']
+            elif pretrained is not None:
+                ckpt_path = pretrained
+            ckpt = _load_checkpoint(
+                ckpt_path, logger=logger, map_location='cpu')
+            if 'state_dict' in ckpt:
+                _state_dict = ckpt['state_dict']
+            elif 'model' in ckpt:
+                _state_dict = ckpt['model']
+            else:
+                _state_dict = ckpt
+            state_dict = _state_dict
+            missing_keys, unexpected_keys = \
+                self.load_state_dict(state_dict, False)
+            # show for debug
+            # print('missing_keys: ', missing_keys)
+            # print('unexpected_keys: ', unexpected_keys)
+    def get_classifier(self):
+        return self.head
+    def reset_classifier(self, num_classes):
+        self.num_classes = num_classes
+        self.head = nn.Linear(
+            self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+    def forward_embeddings(self, x):
+        _, c, img_w, img_h = x.shape
+        # print(f"det img size is {img_w} * {img_h}")
+        # register positional information buffer.
+        range_w = torch.arange(0, img_w, step=1) / (img_w - 1.0)
+        range_h = torch.arange(0, img_h, step=1) / (img_h - 1.0)
+        fea_pos = torch.stack(torch.meshgrid(range_w, range_h, indexing='ij'), dim=-1).float()
+        fea_pos = fea_pos.to(x.device)
+        fea_pos = fea_pos - 0.5
+        pos = fea_pos.permute(2, 0, 1).unsqueeze(dim=0).expand(x.shape[0], -1, -1, -1)
+        x = self.patch_embed(torch.cat([x, pos], dim=1))
+        return x
+    def forward_tokens(self, x):
+        outs = []
+        for idx, block in enumerate(self.network):
+            x = block(x)
+            if self.fork_feat and idx in self.out_indices:
+                norm_layer = getattr(self, f'norm{idx}')
+                x_out = norm_layer(x)
+                outs.append(x_out)
+        if self.fork_feat:
+            # output the features of four stages for dense prediction
+            return outs
+        # output only the features of last layer for image classification
+        return x
+    def forward(self, x):
+        # input embedding
+        x = self.forward_embeddings(x)
+        # through backbone
+        x = self.forward_tokens(x)
+        if self.fork_feat:
+            # otuput features of four stages for dense prediction
+            return x
+        x = self.norm(x)
+        cls_out = self.head(x.mean([-2, -1]))
+        # for image classification
+        return cls_out
+@register_model
+def coc_tiny(pretrained=False, **kwargs):
+    layers = [3, 4, 5, 2]
+    norm_layer = GroupNorm
+    embed_dims = [32, 64, 196, 320]
+    mlp_ratios = [8, 8, 4, 4]
+    downsamples = [True, True, True, True]
+    proposal_w = [2, 2, 2, 2]
+    proposal_h = [2, 2, 2, 2]
+    fold_w = [8, 4, 2, 1]
+    fold_h = [8, 4, 2, 1]
+    heads = [4, 4, 8, 8]
+    head_dim = [24, 24, 24, 24]
+    down_patch_size = 3
+    down_pad = 1
+    model = ContextCluster(
+        layers, embed_dims=embed_dims, norm_layer=norm_layer,
+        mlp_ratios=mlp_ratios, downsamples=downsamples,
+        down_patch_size=down_patch_size, down_pad=down_pad,
+        proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+        heads=heads, head_dim=head_dim,
+        **kwargs)
+    model.default_cfg = default_cfgs['model_small']
+    return model
+@register_model
+def coc_tiny2(pretrained=False, **kwargs):
+    layers = [3, 4, 5, 2]
+    norm_layer = GroupNorm
+    embed_dims = [32, 64, 196, 320]
+    mlp_ratios = [8, 8, 4, 4]
+    downsamples = [True, True, True, True]
+    proposal_w = [4, 2, 7, 4]
+    proposal_h = [4, 2, 7, 4]
+    fold_w = [7, 7, 1, 1]
+    fold_h = [7, 7, 1, 1]
+    heads = [4, 4, 8, 8]
+    head_dim = [24, 24, 24, 24]
+    down_patch_size = 3
+    down_pad = 1
+    model = ContextCluster(
+        layers, embed_dims=embed_dims, norm_layer=norm_layer,
+        mlp_ratios=mlp_ratios, downsamples=downsamples,
+        down_patch_size=down_patch_size, down_pad=down_pad,
+        proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+        heads=heads, head_dim=head_dim,
+        **kwargs)
+    model.default_cfg = default_cfgs['model_small']
+    return model
+@register_model
+def coc_small(pretrained=False, **kwargs):
+    layers = [2, 2, 6, 2]
+    norm_layer = GroupNorm
+    embed_dims = [64, 128, 320, 512]
+    mlp_ratios = [8, 8, 4, 4]
+    downsamples = [True, True, True, True]
+    proposal_w = [2, 2, 2, 2]
+    proposal_h = [2, 2, 2, 2]
+    fold_w = [8, 4, 2, 1]
+    fold_h = [8, 4, 2, 1]
+    heads = [4, 4, 8, 8]
+    head_dim = [32, 32, 32, 32]
+    down_patch_size = 3
+    down_pad = 1
+    model = ContextCluster(
+        layers, embed_dims=embed_dims, norm_layer=norm_layer,
+        mlp_ratios=mlp_ratios, downsamples=downsamples,
+        down_patch_size=down_patch_size, down_pad=down_pad,
+        proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+        heads=heads, head_dim=head_dim,
+        **kwargs)
+    model.default_cfg = default_cfgs['model_small']
+    return model
+@register_model
+def coc_medium(pretrained=False, **kwargs):
+    layers = [4, 4, 12, 4]
+    norm_layer = GroupNorm
+    embed_dims = [64, 128, 320, 512]
+    mlp_ratios = [8, 8, 4, 4]
+    downsamples = [True, True, True, True]
+    proposal_w = [2, 2, 2, 2]
+    proposal_h = [2, 2, 2, 2]
+    fold_w = [8, 4, 2, 1]
+    fold_h = [8, 4, 2, 1]
+    heads = [6, 6, 12, 12]
+    head_dim = [32, 32, 32, 32]
+    down_patch_size = 3
+    down_pad = 1
+    model = ContextCluster(
+        layers, embed_dims=embed_dims, norm_layer=norm_layer,
+        mlp_ratios=mlp_ratios, downsamples=downsamples,
+        down_patch_size=down_patch_size, down_pad=down_pad,
+        proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+        heads=heads, head_dim=head_dim,
+        **kwargs)
+    model.default_cfg = default_cfgs['model_small']
+    return model
+@register_model
+def coc_base_dim64(pretrained=False, **kwargs):
+    layers = [6, 6, 24, 6]
+    norm_layer = GroupNorm
+    embed_dims = [64, 128, 320, 512]
+    mlp_ratios = [8, 8, 4, 4]
+    downsamples = [True, True, True, True]
+    proposal_w = [2, 2, 2, 2]
+    proposal_h = [2, 2, 2, 2]
+    fold_w = [8, 4, 2, 1]
+    fold_h = [8, 4, 2, 1]
+    heads = [8, 8, 16, 16]
+    head_dim = [32, 32, 32, 32]
+    down_patch_size = 3
+    down_pad = 1
+    model = ContextCluster(
+        layers, embed_dims=embed_dims, norm_layer=norm_layer,
+        mlp_ratios=mlp_ratios, downsamples=downsamples,
+        down_patch_size=down_patch_size, down_pad=down_pad,
+        proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+        heads=heads, head_dim=head_dim,
+        **kwargs)
+    model.default_cfg = default_cfgs['model_small']
+    return model
+@register_model
+def coc_base_dim96(pretrained=False, **kwargs):
+    layers = [4, 4, 12, 4]
+    norm_layer = GroupNorm
+    embed_dims = [96, 192, 384, 768]
+    mlp_ratios = [8, 8, 4, 4]
+    downsamples = [True, True, True, True]
+    proposal_w = [2, 2, 2, 2]
+    proposal_h = [2, 2, 2, 2]
+    fold_w = [8, 4, 2, 1]
+    fold_h = [8, 4, 2, 1]
+    heads = [8, 8, 16, 16]
+    head_dim = [32, 32, 32, 32]
+    down_patch_size = 3
+    down_pad = 1
+    model = ContextCluster(
+        layers, embed_dims=embed_dims, norm_layer=norm_layer,
+        mlp_ratios=mlp_ratios, downsamples=downsamples,
+        down_patch_size=down_patch_size, down_pad=down_pad,
+        proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+        heads=heads, head_dim=head_dim,
+        **kwargs)
+    model.default_cfg = default_cfgs['model_small']
+    return model
+"""
+Updated: add plain models (without region partition) for tiny, small, and base , etc.
+Re-trained with new implementation (PWconv->MLP for faster training and inference), achieve slightly better performance.
+"""
+@register_model
+def coc_tiny_plain(pretrained=False, **kwargs):
+    # sharing same parameters as coc_tiny, without region partition.
+    layers = [3, 4, 5, 2]
+    norm_layer = GroupNorm
+    embed_dims = [32, 64, 196, 320]
+    mlp_ratios = [8, 8, 4, 4]
+    downsamples = [True, True, True, True]
+    proposal_w = [4, 4, 2, 2]
+    proposal_h = [4, 4, 2, 2]
+    fold_w = [1, 1, 1, 1]
+    fold_h = [1, 1, 1, 1]
+    heads = [4, 4, 8, 8]
+    head_dim = [24, 24, 24, 24]
+    down_patch_size = 3
+    down_pad = 1
+    model = ContextCluster(
+        layers, embed_dims=embed_dims, norm_layer=norm_layer,
+        mlp_ratios=mlp_ratios, downsamples=downsamples,
+        down_patch_size=down_patch_size, down_pad=down_pad,
+        proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+        heads=heads, head_dim=head_dim,
+        **kwargs)
+    model.default_cfg = default_cfgs['model_small']
+    return model
+if has_mmdet:
+    @seg_BACKBONES.register_module()
+    @det_BACKBONES.register_module()
+    class context_cluster_small_feat2(ContextCluster):
+        def __init__(self, **kwargs):
+                layers = [2, 2, 6, 2]
+                norm_layer=GroupNorm
+                embed_dims = [64, 128, 320, 512]
+                mlp_ratios = [8, 8, 4, 4]
+                downsamples = [True, True, True, True]
+                proposal_w=[2,2,2,2]
+                proposal_h=[2,2,2,2]
+                fold_w=[8,4,2,1]
+                fold_h=[8,4,2,1]
+                heads=[4,4,8,8]
+                head_dim=[32,32,32,32]
+                down_patch_size=3
+                down_pad = 1
+                super().__init__(
+                    layers, embed_dims=embed_dims, norm_layer=norm_layer,
+                    mlp_ratios=mlp_ratios, downsamples=downsamples,
+                    down_patch_size = down_patch_size, down_pad=down_pad,
+                    proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+                    heads=heads, head_dim=head_dim,
+                    fork_feat=True,
+                    **kwargs)
+    @seg_BACKBONES.register_module()
+    @det_BACKBONES.register_module()
+    class context_cluster_small_feat5(ContextCluster):
+        def __init__(self, **kwargs):
+                layers = [2, 2, 6, 2]
+                norm_layer=GroupNorm
+                embed_dims = [64, 128, 320, 512]
+                mlp_ratios = [8, 8, 4, 4]
+                downsamples = [True, True, True, True]
+                proposal_w=[5,5,5,5]
+                proposal_h=[5,5,5,5]
+                fold_w=[8,4,2,1]
+                fold_h=[8,4,2,1]
+                heads=[4,4,8,8]
+                head_dim=[32,32,32,32]
+                down_patch_size=3
+                down_pad = 1
+                super().__init__(
+                    layers, embed_dims=embed_dims, norm_layer=norm_layer,
+                    mlp_ratios=mlp_ratios, downsamples=downsamples,
+                    down_patch_size = down_patch_size, down_pad=down_pad,
+                    proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+                    heads=heads, head_dim=head_dim,
+                    fork_feat=True,
+                    **kwargs)
+    @seg_BACKBONES.register_module()
+    @det_BACKBONES.register_module()
+    class context_cluster_small_feat7(ContextCluster):
+        def __init__(self, **kwargs):
+                layers = [2, 2, 6, 2]
+                norm_layer=GroupNorm
+                embed_dims = [64, 128, 320, 512]
+                mlp_ratios = [8, 8, 4, 4]
+                downsamples = [True, True, True, True]
+                proposal_w=[7,7,7,7]
+                proposal_h=[7,7,7,7]
+                fold_w=[8,4,2,1]
+                fold_h=[8,4,2,1]
+                heads=[4,4,8,8]
+                head_dim=[32,32,32,32]
+                down_patch_size=3
+                down_pad = 1
+                super().__init__(
+                    layers, embed_dims=embed_dims, norm_layer=norm_layer,
+                    mlp_ratios=mlp_ratios, downsamples=downsamples,
+                    down_patch_size = down_patch_size, down_pad=down_pad,
+                    proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+                    heads=heads, head_dim=head_dim,
+                    fork_feat=True,
+                    **kwargs)
+    @seg_BACKBONES.register_module()
+    @det_BACKBONES.register_module()
+    class context_cluster_medium_feat2(ContextCluster):
+        def __init__(self, **kwargs):
+                layers = [4, 4, 12, 4]
+                norm_layer=GroupNorm
+                embed_dims = [64, 128, 320, 512]
+                mlp_ratios = [8, 8, 4, 4]
+                downsamples = [True, True, True, True]
+                proposal_w=[2,2,2,2]
+                proposal_h=[2,2,2,2]
+                fold_w=[8,4,2,1]
+                fold_h=[8,4,2,1]
+                heads=[6,6,12,12]
+                head_dim=[32,32,32,32]
+                down_patch_size=3
+                down_pad = 1
+                super().__init__(
+                    layers, embed_dims=embed_dims, norm_layer=norm_layer,
+                    mlp_ratios=mlp_ratios, downsamples=downsamples,
+                    down_patch_size = down_patch_size, down_pad=down_pad,
+                    proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+                    heads=heads, head_dim=head_dim,
+                    fork_feat=True,
+                    **kwargs)
+    @seg_BACKBONES.register_module()
+    @det_BACKBONES.register_module()
+    class context_cluster_medium_feat5(ContextCluster):
+        def __init__(self, **kwargs):
+                layers = [4, 4, 12, 4]
+                norm_layer=GroupNorm
+                embed_dims = [64, 128, 320, 512]
+                mlp_ratios = [8, 8, 4, 4]
+                downsamples = [True, True, True, True]
+                proposal_w=[5, 5, 5, 5]
+                proposal_h=[5, 5, 5, 5]
+                fold_w=[8,4,2,1]
+                fold_h=[8,4,2,1]
+                heads=[6,6,12,12]
+                head_dim=[32,32,32,32]
+                down_patch_size=3
+                down_pad = 1
+                super().__init__(
+                    layers, embed_dims=embed_dims, norm_layer=norm_layer,
+                    mlp_ratios=mlp_ratios, downsamples=downsamples,
+                    down_patch_size = down_patch_size, down_pad=down_pad,
+                    proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+                    heads=heads, head_dim=head_dim,
+                    fork_feat=True,
+                    **kwargs)
+    @seg_BACKBONES.register_module()
+    @det_BACKBONES.register_module()
+    class context_cluster_medium_feat7(ContextCluster):
+        def __init__(self, **kwargs):
+                layers = [4, 4, 12, 4]
+                norm_layer=GroupNorm
+                embed_dims = [64, 128, 320, 512]
+                mlp_ratios = [8, 8, 4, 4]
+                downsamples = [True, True, True, True]
+                proposal_w=[7,7,7,7]
+                proposal_h=[7,7,7,7]
+                fold_w=[8,4,2,1]
+                fold_h=[8,4,2,1]
+                heads=[6,6,12,12]
+                head_dim=[32,32,32,32]
+                down_patch_size=3
+                down_pad = 1
+                super().__init__(
+                    layers, embed_dims=embed_dims, norm_layer=norm_layer,
+                    mlp_ratios=mlp_ratios, downsamples=downsamples,
+                    down_patch_size = down_patch_size, down_pad=down_pad,
+                    proposal_w=proposal_w, proposal_h=proposal_h, fold_w=fold_w, fold_h=fold_h,
+                    heads=heads, head_dim=head_dim,
+                    fork_feat=True,
+                    **kwargs)
+if __name__ == '__main__':
+    input = torch.rand(2, 3, 224, 224)
+    model = coc_base_dim64()
+    out = model(input)
+    print(model)
+    print(out.shape)
+    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print("number of params: {:.2f}M".format(n_parameters/1024**2))