Upload 4 files

Browse files

Files changed (4) hide show

models/__init__.py +10 -0
models/convnext.py +220 -0
models/flexible_unet.py +312 -0
models/flexible_unet_convnext.py +447 -0

models/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar 20 14:23:55 2022
+@author: jma
+"""
+#from .unetr2d import UNETR2D
+#from .swin_unetr import SwinUNETR

models/convnext.py ADDED Viewed

	@@ -0,0 +1,220 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from functools import partial
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.models.layers import trunc_normal_, DropPath
+from timm.models.registry import register_model
+from monai.networks.layers.factories import Act, Conv, Pad, Pool
+from monai.networks.layers.utils import get_norm_layer
+from monai.utils.module import look_up_option
+from typing import List, NamedTuple, Optional, Tuple, Type, Union
+class Block(nn.Module):
+    r""" ConvNeXt Block. There are two equivalent implementations:
+    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+    We use (2) as we find it slightly faster in PyTorch
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+    """
+    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
+        super().__init__()
+        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
+        self.norm = LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(4 * dim, dim)
+        self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
+                                    requires_grad=True) if layer_scale_init_value > 0 else None
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+    def forward(self, x):
+        input = x
+        x = self.dwconv(x)
+        x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
+        x = input + self.drop_path(x)
+        return x
+class ConvNeXt(nn.Module):
+    r""" ConvNeXt
+        A PyTorch impl of : `A ConvNet for the 2020s`  -
+          https://arxiv.org/pdf/2201.03545.pdf
+    Args:
+        in_chans (int): Number of input image channels. Default: 3
+        num_classes (int): Number of classes for classification head. Default: 1000
+        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
+        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
+        drop_path_rate (float): Stochastic depth rate. Default: 0.
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
+    """
+    def __init__(self, in_chans=3, num_classes=21841,
+                 depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0.,
+                 layer_scale_init_value=1e-6, head_init_scale=1., out_indices=[0, 1, 2, 3],
+                 ):
+        super().__init__()
+        # conv_type: Type[Union[nn.Conv1d, nn.Conv2d, nn.Conv3d]] = Conv["conv", 2]
+        # self._conv_stem = conv_type(self.in_channels, self.in_channels, kernel_size=3, stride=stride, bias=False)
+        # self._conv_stem_padding = _make_same_padder(self._conv_stem, current_image_size)
+        self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
+        stem = nn.Sequential(
+            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
+            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
+        )
+        self.downsample_layers.append(stem)
+        for i in range(3):
+            downsample_layer = nn.Sequential(
+                    LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
+                    nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
+            )
+            self.downsample_layers.append(downsample_layer)
+        self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
+        dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+        cur = 0
+        for i in range(4):
+            stage = nn.Sequential(
+                *[Block(dim=dims[i], drop_path=dp_rates[cur + j],
+                layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
+            )
+            self.stages.append(stage)
+            cur += depths[i]
+        self.out_indices = out_indices
+        norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first")
+        for i_layer in range(4):
+            layer = norm_layer(dims[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_module(layer_name, layer)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, (nn.Conv2d, nn.Linear)):
+            trunc_normal_(m.weight, std=.02)
+            nn.init.constant_(m.bias, 0)
+    def forward_features(self, x):
+        outs = []
+        for i in range(4):
+            x = self.downsample_layers[i](x)
+            x = self.stages[i](x)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x)
+                outs.append(x_out)
+        return tuple(outs)
+    def forward(self, x):
+        x = self.forward_features(x)
+        return x
+class LayerNorm(nn.Module):
+    r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
+    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
+    with shape (batch_size, channels, height, width).
+    """
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+    def forward(self, x):
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        elif self.data_format == "channels_first":
+            u = x.mean(1, keepdim=True)
+            s = (x - u).pow(2).mean(1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.eps)
+            x = self.weight[:, None, None] * x + self.bias[:, None, None]
+            return x
+# Download URLs of the ConvNeXt checkpoints, keyed by "<variant>_<1k|22k>".
+# The convnext_* factory functions below pick the "22k" entry when in_22k=True, else the "1k" one;
+# there is no "convnext_xlarge_1k" entry.
+model_urls = {
+    "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
+    "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
+    "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
+    "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
+    "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth",
+    "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth",
+    "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
+    "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
+    "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
+}
+@register_model
+def convnext_tiny(pretrained=False,in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_tiny_22k'] if in_22k else model_urls['convnext_tiny_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
+        model.load_state_dict(checkpoint["model"])
+    return model
+@register_model
+def convnext_small(pretrained=False,in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_small_22k'] if in_22k else model_urls['convnext_small_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    return model
+@register_model
+def convnext_base(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    return model
+@register_model
+def convnext_large(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_large_22k'] if in_22k else model_urls['convnext_large_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"])
+    return model
+@register_model
+def convnext_xlarge(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
+    if pretrained:
+        assert in_22k, "only ImageNet-22K pre-trained ConvNeXt-XL is available; please set in_22k=True"
+        url = model_urls['convnext_xlarge_22k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"])
+    return model

models/flexible_unet.py ADDED Viewed

	@@ -0,0 +1,312 @@

+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional, Sequence, Tuple, Union
+import torch
+from torch import nn
+from monai.networks.blocks import UpSample
+from monai.networks.layers.factories import Conv
+from monai.networks.layers.utils import get_act_layer
+from monai.networks.nets import EfficientNetBNFeatures
+from monai.networks.nets.basic_unet import UpCat
+from monai.utils import InterpolateMode
+__all__ = ["FlexibleUNet"]
+# Channel count of each encoder feature map, per supported backbone name.
+# The last entry is the deepest feature: UNetDecoder feeds encoder_channels[-1] to its first block.
+# FlexibleUNet rejects any backbone name that is not a key of this table.
+encoder_feature_channel = {
+    "efficientnet-b0": (16, 24, 40, 112, 320),
+    "efficientnet-b1": (16, 24, 40, 112, 320),
+    "efficientnet-b2": (16, 24, 48, 120, 352),
+    "efficientnet-b3": (24, 32, 48, 136, 384),
+    "efficientnet-b4": (24, 32, 56, 160, 448),
+    "efficientnet-b5": (24, 40, 64, 176, 512),
+    "efficientnet-b6": (32, 40, 72, 200, 576),
+    "efficientnet-b7": (32, 48, 80, 224, 640),
+    "efficientnet-b8": (32, 56, 88, 248, 704),
+    "efficientnet-l2": (72, 104, 176, 480, 1376),
+}
+def _get_encoder_channels_by_backbone(backbone: str, in_channels: int = 3) -> tuple:
+    """
+    Get the encoder output channels by given backbone name.
+    Args:
+        backbone: name of backbone to generate features, can be from [efficientnet-b0, ..., efficientnet-b7].
+        in_channels: channel of input tensor, default to 3.
+    Returns:
+        A tuple of output feature map channels' length .
+    """
+    encoder_channel_tuple = encoder_feature_channel[backbone]
+    encoder_channel_list = [in_channels] + list(encoder_channel_tuple)
+    encoder_channel = tuple(encoder_channel_list)
+    return encoder_channel
+class UNetDecoder(nn.Module):
+    """
+    UNet Decoder.
+    This class refers to `segmentation_models.pytorch
+    <https://github.com/qubvel/segmentation_models.pytorch>`_.
+    Args:
+        spatial_dims: number of spatial dimensions.
+        encoder_channels: number of output channels for all feature maps in encoder.
+            `len(encoder_channels)` should be no less than 2.
+        decoder_channels: number of output channels for all feature maps in decoder.
+            `len(decoder_channels)` should equal to `len(encoder_channels) - 1`.
+        act: activation type and arguments.
+        norm: feature normalization type and arguments.
+        dropout: dropout ratio.
+        bias: whether to have a bias term in convolution blocks in this decoder.
+        upsample: upsampling mode, available options are
+            ``"deconv"``, ``"pixelshuffle"``, ``"nontrainable"``.
+        pre_conv: a conv block applied before upsampling.
+            Only used in the "nontrainable" or "pixelshuffle" mode.
+        interp_mode: {``"nearest"``, ``"linear"``, ``"bilinear"``, ``"bicubic"``, ``"trilinear"``}
+            Only used in the "nontrainable" mode.
+        align_corners: set the align_corners parameter for upsample. Defaults to True.
+            Only used in the "nontrainable" mode.
+        is_pad: whether to pad upsampling features to fit the encoder spatial dims.
+    """
+    def __init__(
+        self,
+        spatial_dims: int,
+        encoder_channels: Sequence[int],
+        decoder_channels: Sequence[int],
+        act: Union[str, tuple],
+        norm: Union[str, tuple],
+        dropout: Union[float, tuple],
+        bias: bool,
+        upsample: str,
+        pre_conv: Optional[str],
+        interp_mode: str,
+        align_corners: Optional[bool],
+        is_pad: bool,
+    ):
+        super().__init__()
+        if len(encoder_channels) < 2:
+            raise ValueError("the length of `encoder_channels` should be no less than 2.")
+        if len(decoder_channels) != len(encoder_channels) - 1:
+            raise ValueError("`len(decoder_channels)` should equal to `len(encoder_channels) - 1`.")
+        in_channels = [encoder_channels[-1]] + list(decoder_channels[:-1])
+        skip_channels = list(encoder_channels[1:-1][::-1]) + [0]
+        halves = [True] * (len(skip_channels) - 1)
+        halves.append(False)
+        blocks = []
+        for in_chn, skip_chn, out_chn, halve in zip(in_channels, skip_channels, decoder_channels, halves):
+            blocks.append(
+                UpCat(
+                    spatial_dims=spatial_dims,
+                    in_chns=in_chn,
+                    cat_chns=skip_chn,
+                    out_chns=out_chn,
+                    act=act,
+                    norm=norm,
+                    dropout=dropout,
+                    bias=bias,
+                    upsample=upsample,
+                    pre_conv=pre_conv,
+                    interp_mode=interp_mode,
+                    align_corners=align_corners,
+                    halves=halve,
+                    is_pad=is_pad,
+                )
+            )
+        self.blocks = nn.ModuleList(blocks)
+    def forward(self, features: List[torch.Tensor], skip_connect: int = 4):
+        skips = features[:-1][::-1]
+        features = features[1:][::-1]
+        x = features[0]
+        for i, block in enumerate(self.blocks):
+            if i < skip_connect:
+                skip = skips[i]
+            else:
+                skip = None
+            x = block(x, skip)
+        return x
+class SegmentationHead(nn.Sequential):
+    """
+    Segmentation head.
+    This class refers to `segmentation_models.pytorch
+    <https://github.com/qubvel/segmentation_models.pytorch>`_.
+    Args:
+        spatial_dims: number of spatial dimensions.
+        in_channels: number of input channels for the block.
+        out_channels: number of output channels for the block.
+        kernel_size: kernel size for the conv layer.
+        act: activation type and arguments.
+        scale_factor: multiplier for spatial size. Has to match input size if it is a tuple.
+    """
+    def __init__(
+        self,
+        spatial_dims: int,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        act: Optional[Union[Tuple, str]] = None,
+        scale_factor: float = 1.0,
+    ):
+        conv_layer = Conv[Conv.CONV, spatial_dims](
+            in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=kernel_size // 2
+        )
+        up_layer: nn.Module = nn.Identity()
+        if scale_factor > 1.0:
+            up_layer = UpSample(
+                spatial_dims=spatial_dims,
+                scale_factor=scale_factor,
+                mode="nontrainable",
+                pre_conv=None,
+                interp_mode=InterpolateMode.LINEAR,
+            )
+        if act is not None:
+            act_layer = get_act_layer(act)
+        else:
+            act_layer = nn.Identity()
+        super().__init__(conv_layer, up_layer, act_layer)
+class FlexibleUNet(nn.Module):
+    """
+    A flexible implementation of UNet-like encoder-decoder architecture.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        backbone: str,
+        pretrained: bool = False,
+        decoder_channels: Tuple = (256, 128, 64, 32, 16),
+        spatial_dims: int = 2,
+        norm: Union[str, tuple] = ("batch", {"eps": 1e-3, "momentum": 0.1}),
+        act: Union[str, tuple] = ("relu", {"inplace": True}),
+        dropout: Union[float, tuple] = 0.0,
+        decoder_bias: bool = False,
+        upsample: str = "nontrainable",
+        interp_mode: str = "nearest",
+        is_pad: bool = True,
+    ) -> None:
+        """
+        A flexible implement of UNet, in which the backbone/encoder can be replaced with
+        any efficient network. Currently the input must have a 2 or 3 spatial dimension
+        and the spatial size of each dimension must be a multiple of 32 if is pad parameter
+        is False
+        Args:
+            in_channels: number of input channels.
+            out_channels: number of output channels.
+            backbone: name of backbones to initialize, only support efficientnet right now,
+                can be from [efficientnet-b0,..., efficientnet-b8, efficientnet-l2].
+            pretrained: whether to initialize pretrained ImageNet weights, only available
+                for spatial_dims=2 and batch norm is used, default to False.
+            decoder_channels: number of output channels for all feature maps in decoder.
+                `len(decoder_channels)` should equal to `len(encoder_channels) - 1`,default
+                to (256, 128, 64, 32, 16).
+            spatial_dims: number of spatial dimensions, default to 2.
+            norm: normalization type and arguments, default to ("batch", {"eps": 1e-3,
+                "momentum": 0.1}).
+            act: activation type and arguments, default to ("relu", {"inplace": True}).
+            dropout: dropout ratio, default to 0.0.
+            decoder_bias: whether to have a bias term in decoder's convolution blocks.
+            upsample: upsampling mode, available options are``"deconv"``, ``"pixelshuffle"``,
+                ``"nontrainable"``.
+            interp_mode: {``"nearest"``, ``"linear"``, ``"bilinear"``, ``"bicubic"``, ``"trilinear"``}
+                Only used in the "nontrainable" mode.
+            is_pad: whether to pad upsampling features to fit features from encoder. Default to True.
+                If this parameter is set to "True", the spatial dim of network input can be arbitary
+                size, which is not supported by TensorRT. Otherwise, it must be a multiple of 32.
+        """
+        super().__init__()
+        if backbone not in encoder_feature_channel:
+            raise ValueError(f"invalid model_name {backbone} found, must be one of {encoder_feature_channel.keys()}.")
+        if spatial_dims not in (2, 3):
+            raise ValueError("spatial_dims can only be 2 or 3.")
+        adv_prop = "ap" in backbone
+        self.backbone = backbone
+        self.spatial_dims = spatial_dims
+        model_name = backbone
+        encoder_channels = _get_encoder_channels_by_backbone(backbone, in_channels)
+        self.encoder = EfficientNetBNFeatures(
+            model_name=model_name,
+            pretrained=pretrained,
+            in_channels=in_channels,
+            spatial_dims=spatial_dims,
+            norm=norm,
+            adv_prop=adv_prop,
+        )
+        self.decoder = UNetDecoder(
+            spatial_dims=spatial_dims,
+            encoder_channels=encoder_channels,
+            decoder_channels=decoder_channels,
+            act=act,
+            norm=norm,
+            dropout=dropout,
+            bias=decoder_bias,
+            upsample=upsample,
+            interp_mode=interp_mode,
+            pre_conv=None,
+            align_corners=None,
+            is_pad=is_pad,
+        )
+        self.dist_head = SegmentationHead(
+            spatial_dims=spatial_dims,
+            in_channels=decoder_channels[-1],
+            out_channels=32,
+            kernel_size=1,
+            act='relu',
+        )
+        self.prob_head = SegmentationHead(
+            spatial_dims=spatial_dims,
+            in_channels=decoder_channels[-1],
+            out_channels=1,
+            kernel_size=1,
+            act='sigmoid',
+        )
+    def forward(self, inputs: torch.Tensor):
+        """
+        Do a typical encoder-decoder-header inference.
+        Args:
+            inputs: input should have spatially N dimensions ``(Batch, in_channels, dim_0[, dim_1, ..., dim_N])``,
+                N is defined by `dimensions`.
+        Returns:
+            A torch Tensor of "raw" predictions in shape ``(Batch, out_channels, dim_0[, dim_1, ..., dim_N])``.
+        """
+        x = inputs
+        enc_out = self.encoder(x)
+        decoder_out = self.decoder(enc_out)
+        dist = self.dist_head(decoder_out)
+        prob = self.prob_head(decoder_out)
+        return dist,prob

models/flexible_unet_convnext.py ADDED Viewed

	@@ -0,0 +1,447 @@

+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional, Sequence, Tuple, Union
+import torch
+from torch import nn
+from . import convnext
+from monai.networks.blocks import UpSample
+from monai.networks.layers.factories import Conv
+from monai.networks.layers.utils import get_act_layer
+from monai.networks.nets import EfficientNetBNFeatures
+from monai.networks.nets.basic_unet import UpCat
+from monai.utils import InterpolateMode
+__all__ = ["FlexibleUNet"]
+# Channel count of each encoder feature map, per supported backbone name.
+# The last entry is the deepest feature: UNetDecoder feeds encoder_channels[-1] to its first block.
+# EfficientNet entries list five feature maps; the convnext_* and van_* entries list four.
+# NOTE(review): no VAN encoder is visible in this part of the file -- confirm the van_* entries are used.
+encoder_feature_channel = {
+    "efficientnet-b0": (16, 24, 40, 112, 320),
+    "efficientnet-b1": (16, 24, 40, 112, 320),
+    "efficientnet-b2": (16, 24, 48, 120, 352),
+    "efficientnet-b3": (24, 32, 48, 136, 384),
+    "efficientnet-b4": (24, 32, 56, 160, 448),
+    "efficientnet-b5": (24, 40, 64, 176, 512),
+    "efficientnet-b6": (32, 40, 72, 200, 576),
+    "efficientnet-b7": (32, 48, 80, 224, 640),
+    "efficientnet-b8": (32, 56, 88, 248, 704),
+    "efficientnet-l2": (72, 104, 176, 480, 1376),
+    "convnext_small": (96, 192, 384, 768),
+    "convnext_base": (128, 256, 512, 1024),
+    "van_b2": (64, 128, 320, 512),
+    "van_b1": (64, 128, 320, 512),
+}
+def _get_encoder_channels_by_backbone(backbone: str, in_channels: int = 3) -> tuple:
+    """
+    Get the encoder output channels by given backbone name.
+    Args:
+        backbone: name of backbone to generate features, can be from [efficientnet-b0, ..., efficientnet-b7].
+        in_channels: channel of input tensor, default to 3.
+    Returns:
+        A tuple of output feature map channels' length .
+    """
+    encoder_channel_tuple = encoder_feature_channel[backbone]
+    encoder_channel_list = [in_channels] + list(encoder_channel_tuple)
+    encoder_channel = tuple(encoder_channel_list)
+    return encoder_channel
+class UNetDecoder(nn.Module):
+    """
+    UNet Decoder.
+    This class refers to `segmentation_models.pytorch
+    <https://github.com/qubvel/segmentation_models.pytorch>`_.
+    Args:
+        spatial_dims: number of spatial dimensions.
+        encoder_channels: number of output channels for all feature maps in encoder.
+            `len(encoder_channels)` should be no less than 2.
+        decoder_channels: number of output channels for all feature maps in decoder.
+            `len(decoder_channels)` should equal to `len(encoder_channels) - 1`.
+        act: activation type and arguments.
+        norm: feature normalization type and arguments.
+        dropout: dropout ratio.
+        bias: whether to have a bias term in convolution blocks in this decoder.
+        upsample: upsampling mode, available options are
+            ``"deconv"``, ``"pixelshuffle"``, ``"nontrainable"``.
+        pre_conv: a conv block applied before upsampling.
+            Only used in the "nontrainable" or "pixelshuffle" mode.
+        interp_mode: {``"nearest"``, ``"linear"``, ``"bilinear"``, ``"bicubic"``, ``"trilinear"``}
+            Only used in the "nontrainable" mode.
+        align_corners: set the align_corners parameter for upsample. Defaults to True.
+            Only used in the "nontrainable" mode.
+        is_pad: whether to pad upsampling features to fit the encoder spatial dims.
+    """
+    def __init__(
+        self,
+        spatial_dims: int,
+        encoder_channels: Sequence[int],
+        decoder_channels: Sequence[int],
+        act: Union[str, tuple],
+        norm: Union[str, tuple],
+        dropout: Union[float, tuple],
+        bias: bool,
+        upsample: str,
+        pre_conv: Optional[str],
+        interp_mode: str,
+        align_corners: Optional[bool],
+        is_pad: bool,
+    ):
+        super().__init__()
+        if len(encoder_channels) < 2:
+            raise ValueError("the length of `encoder_channels` should be no less than 2.")
+        if len(decoder_channels) != len(encoder_channels) - 1:
+            raise ValueError("`len(decoder_channels)` should equal to `len(encoder_channels) - 1`.")
+        in_channels = [encoder_channels[-1]] + list(decoder_channels[:-1])
+        skip_channels = list(encoder_channels[1:-1][::-1]) + [0]
+        halves = [True] * (len(skip_channels) - 1)
+        halves.append(False)
+        blocks = []
+        for in_chn, skip_chn, out_chn, halve in zip(in_channels, skip_channels, decoder_channels, halves):
+            blocks.append(
+                UpCat(
+                    spatial_dims=spatial_dims,
+                    in_chns=in_chn,
+                    cat_chns=skip_chn,
+                    out_chns=out_chn,
+                    act=act,
+                    norm=norm,
+                    dropout=dropout,
+                    bias=bias,
+                    upsample=upsample,
+                    pre_conv=pre_conv,
+                    interp_mode=interp_mode,
+                    align_corners=align_corners,
+                    halves=halve,
+                    is_pad=is_pad,
+                )
+            )
+        self.blocks = nn.ModuleList(blocks)
+    def forward(self, features: List[torch.Tensor], skip_connect: int = 3):
+        skips = features[:-1][::-1]
+        features = features[1:][::-1]
+        x = features[0]
+        for i, block in enumerate(self.blocks):
+            if i < skip_connect:
+                skip = skips[i]
+            else:
+                skip = None
+            x = block(x, skip)
+        return x
+class SegmentationHead(nn.Sequential):
+    """
+    Segmentation head.
+    This class refers to `segmentation_models.pytorch
+    <https://github.com/qubvel/segmentation_models.pytorch>`_.
+    Args:
+        spatial_dims: number of spatial dimensions.
+        in_channels: number of input channels for the block.
+        out_channels: number of output channels for the block.
+        kernel_size: kernel size for the conv layer.
+        act: activation type and arguments.
+        scale_factor: multiplier for spatial size. Has to match input size if it is a tuple.
+    """
+    def __init__(
+        self,
+        spatial_dims: int,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        act: Optional[Union[Tuple, str]] = None,
+        scale_factor: float = 1.0,
+    ):
+        conv_layer = Conv[Conv.CONV, spatial_dims](
+            in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=kernel_size // 2
+        )
+        up_layer: nn.Module = nn.Identity()
+        # if scale_factor > 1.0:
+        #     up_layer = UpSample(
+        #         in_channels=out_channels,
+        #         spatial_dims=spatial_dims,
+        #         scale_factor=scale_factor,
+        #         mode="deconv",
+        #         pre_conv=None,
+        #         interp_mode=InterpolateMode.LINEAR,
+        #     )
+        if scale_factor > 1.0:
+            up_layer = UpSample(
+                spatial_dims=spatial_dims,
+                scale_factor=scale_factor,
+                mode="nontrainable",
+                pre_conv=None,
+                interp_mode=InterpolateMode.LINEAR,
+            )
+        if act is not None:
+            act_layer = get_act_layer(act)
+        else:
+            act_layer = nn.Identity()
+        super().__init__(conv_layer, up_layer, act_layer)
+class FlexibleUNet_star(nn.Module):
+    """
+    A flexible implementation of UNet-like encoder-decoder architecture.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        backbone: str,
+        pretrained: bool = False,
+        decoder_channels: Tuple = (256, 128, 64, 32),
+        #decoder_channels: Tuple = (1024, 512, 256, 128),
+        spatial_dims: int = 2,
+        norm: Union[str, tuple] = ("batch", {"eps": 1e-3, "momentum": 0.1}),
+        act: Union[str, tuple] = ("relu", {"inplace": True}),
+        dropout: Union[float, tuple] = 0.0,
+        decoder_bias: bool = False,
+        upsample: str = "nontrainable",
+        interp_mode: str = "nearest",
+        is_pad: bool = True,
+        n_rays: int = 32,
+        prob_out_channels: int = 1,
+    ) -> None:
+        """
+        A flexible implement of UNet, in which the backbone/encoder can be replaced with
+        any efficient network. Currently the input must have a 2 or 3 spatial dimension
+        and the spatial size of each dimension must be a multiple of 32 if is pad parameter
+        is False
+        Args:
+            in_channels: number of input channels.
+            out_channels: number of output channels.
+            backbone: name of backbones to initialize, only support efficientnet right now,
+                can be from [efficientnet-b0,..., efficientnet-b8, efficientnet-l2].
+            pretrained: whether to initialize pretrained ImageNet weights, only available
+                for spatial_dims=2 and batch norm is used, default to False.
+            decoder_channels: number of output channels for all feature maps in decoder.
+                `len(decoder_channels)` should equal to `len(encoder_channels) - 1`,default
+                to (256, 128, 64, 32, 16).
+            spatial_dims: number of spatial dimensions, default to 2.
+            norm: normalization type and arguments, default to ("batch", {"eps": 1e-3,
+                "momentum": 0.1}).
+            act: activation type and arguments, default to ("relu", {"inplace": True}).
+            dropout: dropout ratio, default to 0.0.
+            decoder_bias: whether to have a bias term in decoder's convolution blocks.
+            upsample: upsampling mode, available options are``"deconv"``, ``"pixelshuffle"``,
+                ``"nontrainable"``.
+            interp_mode: {``"nearest"``, ``"linear"``, ``"bilinear"``, ``"bicubic"``, ``"trilinear"``}
+                Only used in the "nontrainable" mode.
+            is_pad: whether to pad upsampling features to fit features from encoder. Default to True.
+                If this parameter is set to "True", the spatial dim of network input can be arbitary
+                size, which is not supported by TensorRT. Otherwise, it must be a multiple of 32.
+        """
+        super().__init__()
+        if backbone not in encoder_feature_channel:
+            raise ValueError(f"invalid model_name {backbone} found, must be one of {encoder_feature_channel.keys()}.")
+        if spatial_dims not in (2, 3):
+            raise ValueError("spatial_dims can only be 2 or 3.")
+        adv_prop = "ap" in backbone
+        self.backbone = backbone
+        self.spatial_dims = spatial_dims
+        model_name = backbone
+        encoder_channels = _get_encoder_channels_by_backbone(backbone, in_channels)
+        self.encoder = convnext.convnext_small(pretrained=False,in_22k=True)
+        self.decoder = UNetDecoder(
+            spatial_dims=spatial_dims,
+            encoder_channels=encoder_channels,
+            decoder_channels=decoder_channels,
+            act=act,
+            norm=norm,
+            dropout=dropout,
+            bias=decoder_bias,
+            upsample=upsample,
+            interp_mode=interp_mode,
+            pre_conv=None,
+            align_corners=None,
+            is_pad=is_pad,
+        )
+        self.dist_head = SegmentationHead(
+            spatial_dims=spatial_dims,
+            in_channels=decoder_channels[-1],
+            out_channels=n_rays,
+            kernel_size=1,
+            act='relu',
+            scale_factor = 2,
+        )
+        self.prob_head = SegmentationHead(
+            spatial_dims=spatial_dims,
+            in_channels=decoder_channels[-1],
+            out_channels=prob_out_channels,
+            kernel_size=1,
+            act='sigmoid',
+            scale_factor = 2,
+        )
+    def forward(self, inputs: torch.Tensor):
+        """
+        Do a typical encoder-decoder-header inference.
+        Args:
+            inputs: input should have spatially N dimensions ``(Batch, in_channels, dim_0[, dim_1, ..., dim_N])``,
+                N is defined by `dimensions`.
+        Returns:
+            A torch Tensor of "raw" predictions in shape ``(Batch, out_channels, dim_0[, dim_1, ..., dim_N])``.
+        """
+        x = inputs
+        enc_out = self.encoder(x)
+        decoder_out = self.decoder(enc_out)
+        dist = self.dist_head(decoder_out)
+        prob = self.prob_head(decoder_out)
+        return dist,prob
+class FlexibleUNet_hv(nn.Module):
+    """
+    A flexible implementation of UNet-like encoder-decoder architecture.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        backbone: str,
+        pretrained: bool = False,
+        decoder_channels: Tuple = (1024, 512, 256, 128),
+        spatial_dims: int = 2,
+        norm: Union[str, tuple] = ("batch", {"eps": 1e-3, "momentum": 0.1}),
+        act: Union[str, tuple] = ("relu", {"inplace": True}),
+        dropout: Union[float, tuple] = 0.0,
+        decoder_bias: bool = False,
+        upsample: str = "nontrainable",
+        interp_mode: str = "nearest",
+        is_pad: bool = True,
+        n_rays: int = 32,
+        prob_out_channels: int = 1,
+    ) -> None:
+        """
+        A flexible implement of UNet, in which the backbone/encoder can be replaced with
+        any efficient network. Currently the input must have a 2 or 3 spatial dimension
+        and the spatial size of each dimension must be a multiple of 32 if is pad parameter
+        is False
+        Args:
+            in_channels: number of input channels.
+            out_channels: number of output channels.
+            backbone: name of backbones to initialize, only support efficientnet right now,
+                can be from [efficientnet-b0,..., efficientnet-b8, efficientnet-l2].
+            pretrained: whether to initialize pretrained ImageNet weights, only available
+                for spatial_dims=2 and batch norm is used, default to False.
+            decoder_channels: number of output channels for all feature maps in decoder.
+                `len(decoder_channels)` should equal to `len(encoder_channels) - 1`,default
+                to (256, 128, 64, 32, 16).
+            spatial_dims: number of spatial dimensions, default to 2.
+            norm: normalization type and arguments, default to ("batch", {"eps": 1e-3,
+                "momentum": 0.1}).
+            act: activation type and arguments, default to ("relu", {"inplace": True}).
+            dropout: dropout ratio, default to 0.0.
+            decoder_bias: whether to have a bias term in decoder's convolution blocks.
+            upsample: upsampling mode, available options are``"deconv"``, ``"pixelshuffle"``,
+                ``"nontrainable"``.
+            interp_mode: {``"nearest"``, ``"linear"``, ``"bilinear"``, ``"bicubic"``, ``"trilinear"``}
+                Only used in the "nontrainable" mode.
+            is_pad: whether to pad upsampling features to fit features from encoder. Default to True.
+                If this parameter is set to "True", the spatial dim of network input can be arbitary
+                size, which is not supported by TensorRT. Otherwise, it must be a multiple of 32.
+        """
+        super().__init__()
+        if backbone not in encoder_feature_channel:
+            raise ValueError(f"invalid model_name {backbone} found, must be one of {encoder_feature_channel.keys()}.")
+        if spatial_dims not in (2, 3):
+            raise ValueError("spatial_dims can only be 2 or 3.")
+        adv_prop = "ap" in backbone
+        self.backbone = backbone
+        self.spatial_dims = spatial_dims
+        model_name = backbone
+        encoder_channels = _get_encoder_channels_by_backbone(backbone, in_channels)
+        self.encoder = convnext.convnext_small(pretrained=False,in_22k=True)
+        self.decoder = UNetDecoder(
+            spatial_dims=spatial_dims,
+            encoder_channels=encoder_channels,
+            decoder_channels=decoder_channels,
+            act=act,
+            norm=norm,
+            dropout=dropout,
+            bias=decoder_bias,
+            upsample=upsample,
+            interp_mode=interp_mode,
+            pre_conv=None,
+            align_corners=None,
+            is_pad=is_pad,
+        )
+        self.dist_head = SegmentationHead(
+            spatial_dims=spatial_dims,
+            in_channels=decoder_channels[-1],
+            out_channels=n_rays,
+            kernel_size=1,
+            act=None,
+            scale_factor = 2,
+        )
+        self.prob_head = SegmentationHead(
+            spatial_dims=spatial_dims,
+            in_channels=decoder_channels[-1],
+            out_channels=prob_out_channels,
+            kernel_size=1,
+            act='sigmoid',
+            scale_factor = 2,
+        )
+    def forward(self, inputs: torch.Tensor):
+        """
+        Do a typical encoder-decoder-header inference.
+        Args:
+            inputs: input should have spatially N dimensions ``(Batch, in_channels, dim_0[, dim_1, ..., dim_N])``,
+                N is defined by `dimensions`.
+        Returns:
+            A torch Tensor of "raw" predictions in shape ``(Batch, out_channels, dim_0[, dim_1, ..., dim_N])``.
+        """
+        x = inputs
+        enc_out = self.encoder(x)
+        decoder_out = self.decoder(enc_out)
+        dist = self.dist_head(decoder_out)
+        prob = self.prob_head(decoder_out)
+        return dist,prob