# S-KEY / skey / convnext.py
# 2cylu2's picture
# Upload 13 files
# 77ac75d verified
# Code originally provided by Meta FAIR. https://github.com/facebookresearch/ConvNeXt
import torch
from torch import nn
class DropPath(nn.Module):
    r"""Stochastic depth: randomly zero whole samples of a residual branch.

    While the module is in training mode and ``drop_prob`` is non-zero, one
    Bernoulli draw is made per entry along dimension 0 of the input and
    broadcast over all remaining dimensions. Otherwise the input is returned
    unchanged.

    Args:
        drop_prob (float): Probability of dropping a path. Default: 0.0.
        scale_by_keep (bool): Whether to divide the surviving entries by the
            keep probability ``1 - drop_prob``. Default: True.
    """

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        # Only act when training with a non-zero drop probability.
        active = self.training and self.drop_prob != 0.0
        if not active:
            return x

        keep_prob = 1 - self.drop_prob
        # One mask value per sample, broadcastable against the rest of `x`.
        mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = x.new_empty(mask_shape).bernoulli_(keep_prob)

        # Skip the rescale when keep_prob is 0 to avoid dividing by zero.
        if self.scale_by_keep and keep_prob > 0.0:
            mask.div_(keep_prob)
        return x * mask

    def extra_repr(self):
        # Shown inside the module's repr().
        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
class ConvNeXtBlock(nn.Module):
    r"""ConvNeXt-style residual block.

    The forward pass, for an input laid out as (N, C, H, W), is:
    grouped conv (replicate padding) -> parameter-free layer norm over all
    non-batch dimensions -> permute to (N, H, W, C) -> Linear -> GELU ->
    Linear -> optional per-channel scale (``gamma``) -> permute back to
    (N, C, H, W) -> drop path -> add to the block input.

    The pointwise (1x1) convolutions are implemented with ``nn.Linear`` on the
    channels-last layout.

    NOTE(review): ``gamma`` has ``out_channels`` entries but multiplies the
    output of ``pwconv2``, whose last dimension is ``in_channels``; the two
    only combine when they are equal or broadcastable -- confirm intended use.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels of the grouped conv.
        kernel_size (int): Size of the convolution kernel. Default: 7.
        padding (int): Padding size for the convolution. Default: 3.
        drop_path (float): Stochastic depth rate; values <= 0 disable it. Default: 0.1.
        layer_scale_init_value (float): Initial value for the layer scale
            ``gamma``; values <= 0 disable it. Default: 1e-1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=7,
        padding=3,
        drop_path=0.1,
        layer_scale_init_value=1e-1,
    ):
        super().__init__()

        # Grouped convolution with one group per input channel.
        self.dwconv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=padding,
            groups=in_channels,
            padding_mode="replicate",
        )
        # Functional layer norm: called without weight/bias, so nothing is learned.
        self.norm = nn.functional.layer_norm

        # Pointwise expansion (x4), non-linearity, and projection.
        hidden_channels = 4 * out_channels
        self.pwconv1 = nn.Linear(out_channels, hidden_channels)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(hidden_channels, in_channels)

        # Learnable layer scale, only when a positive initial value is given.
        if layer_scale_init_value > 0:
            self.gamma = nn.Parameter(
                layer_scale_init_value * torch.ones((out_channels)), requires_grad=True
            )
        else:
            self.gamma = None

        # Stochastic depth on the residual branch, or a no-op when disabled.
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        residual = x

        x = self.dwconv(x)
        # Normalise jointly over every non-batch dimension.
        x = self.norm(x, x.shape[1:])

        # (N, C, H, W) -> (N, H, W, C) so the Linear layers act on channels.
        x = x.permute(0, 2, 3, 1)
        x = self.pwconv2(self.act(self.pwconv1(x)))
        if self.gamma is not None:
            x = self.gamma * x
        # (N, H, W, C) -> (N, C, H, W)
        x = x.permute(0, 3, 1, 2)

        return residual + self.drop_path(x)
class TimeDownsamplingBlock(nn.Module):
    r"""Downsampling block: layer norm -> 1x2 strided Conv -> GELU.

    The layer norm is the functional form applied over every non-batch
    dimension of the input, without learnable weight or bias. The convolution
    uses kernel ``(1, 2)`` with stride ``(1, 2)`` and no padding.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        bias (bool): Whether to use bias in the convolution. Default: True.
    """

    def __init__(self, in_channels, out_channels, bias=True):
        super().__init__()
        # Parameter-free normalisation; the normalised shape is chosen per call.
        self.norm = nn.functional.layer_norm
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=(1, 2),
            stride=(1, 2),
            bias=bias,
        )
        self.act = nn.GELU()

    def forward(self, x):
        normalized = self.norm(x, x.shape[1:])
        return self.act(self.conv(normalized))