Spaces:
Running on Zero
Running on Zero
| # Code originally provided by Meta FAIR. https://github.com/facebookresearch/ConvNeXt | |
| import torch | |
| from torch import nn | |
class DropPath(nn.Module):
    r"""Stochastic Depth: randomly zero whole samples of a residual branch.

    One Bernoulli draw is made per entry along the first dimension of the
    input and broadcast over all remaining dimensions. The layer is a no-op
    outside training mode or when ``drop_prob`` is exactly ``0.0``.

    Args:
        drop_prob (float): Probability of dropping a path. Default: 0.0.
        scale_by_keep (bool): If True, surviving samples are divided by the
            keep probability ``1 - drop_prob``. Default: True.
    """

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super().__init__()
        self.scale_by_keep = scale_by_keep
        self.drop_prob = drop_prob

    def forward(self, x):
        """Return ``x`` with whole samples randomly zeroed while training."""
        # Pass-through: the very same tensor object is returned.
        if not self.training or self.drop_prob == 0.0:
            return x

        keep_prob = 1 - self.drop_prob
        # Mask shape (B, 1, ..., 1): one decision per sample, broadcast over
        # every trailing dimension.
        mask_shape = [x.shape[0]] + [1] * (x.ndim - 1)
        mask = x.new_empty(mask_shape).bernoulli_(keep_prob)

        # Skip the rescale when keep_prob is 0 to avoid dividing by zero
        # (the mask is all zeros in that case anyway).
        if self.scale_by_keep and keep_prob > 0.0:
            mask.div_(keep_prob)
        return x * mask

    def extra_repr(self):
        """Show the drop probability (3 decimals) in the module repr."""
        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
class ConvNeXtBlock(nn.Module):
    r"""ConvNeXt-style residual block.

    Pipeline, as implemented here:
        depthwise Conv2d on (N, C, H, W)
        -> parameter-free layer norm over all non-batch dims (C, H, W)
        -> permute to (N, H, W, C)
        -> Linear (expand x4) -> GELU -> Linear (project back to in_channels)
        -> optional learnable per-channel layer scale (``gamma``)
        -> permute back to (N, C, H, W)
        -> stochastic depth -> residual add with the block input.

    The 1x1 convolutions are implemented as ``nn.Linear`` on the
    channels-last layout.

    Args:
        in_channels (int): Number of input channels. Also the number of
            output channels of the block, because the result is added to
            the input.
        out_channels (int): Number of channels produced by the depthwise
            convolution (the hidden width). The convolution uses
            ``groups=in_channels``, so this must be a multiple of
            ``in_channels``.
        kernel_size (int): Size of the convolution kernel. Default: 7.
        padding (int): Padding size for the convolution. Default: 3.
        drop_path (float): Stochastic depth rate. Default: 0.1.
        layer_scale_init_value (float): Initial value for Layer Scale;
            values <= 0 disable Layer Scale. Default: 1e-1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=7,
        padding=3,
        drop_path=0.1,
        layer_scale_init_value=1e-1,
    ):
        super().__init__()
        self.dwconv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=padding,
            groups=in_channels,
            padding_mode="replicate",
        )  # depthwise conv
        # Functional layer norm: no learnable affine parameters.
        self.norm = nn.functional.layer_norm
        # Pointwise/1x1 convs, implemented with linear layers.
        self.pwconv1 = nn.Linear(out_channels, 4 * out_channels)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * out_channels, in_channels)
        # gamma scales the output of pwconv2, whose channel dimension is
        # in_channels, so it must be sized by in_channels. Sizing it by
        # out_channels breaks broadcasting whenever the two differ; the
        # shape is unchanged when in_channels == out_channels.
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones(in_channels), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, x):
        """Apply the block to ``x`` of shape (N, in_channels, H, W).

        Returns a tensor of the same shape as ``x``.
        """
        residual = x
        x = self.dwconv(x)
        # Normalise jointly over (C, H, W) for each sample.
        x = self.norm(x, x.shape[1:])
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.pwconv2(self.act(self.pwconv1(x)))
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
        return residual + self.drop_path(x)
class TimeDownsamplingBlock(nn.Module):
    r"""Downsampling block: layer norm -> (1, 2)-strided Conv2d -> GELU.

    The convolution uses kernel ``(1, 2)`` with stride ``(1, 2)`` and no
    padding, so the last axis of a 4-D input is halved (floor division)
    while the second-to-last axis keeps its size. The layer norm is the
    parameter-free functional form applied over all non-batch dimensions.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        bias (bool): Whether to use bias in the convolution. Default: True.
    """

    def __init__(self, in_channels, out_channels, bias=True):
        super().__init__()
        # Plain function reference: no learnable normalisation parameters.
        self.norm = nn.functional.layer_norm
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=(1, 2),
            stride=(1, 2),
            bias=bias,
        )
        self.act = nn.GELU()

    def forward(self, x):
        """Normalise ``x`` over its non-batch dims, then convolve and activate."""
        normalized = self.norm(x, x.shape[1:])
        return self.act(self.conv(normalized))