# Code originally provided by Meta FAIR. https://github.com/facebookresearch/ConvNeXt
import torch
from torch import nn


class DropPath(nn.Module):
    r"""Stochastic Depth: randomly zero whole samples of a residual branch while training.

    Args:
        drop_prob (float): Probability that a sample's path is dropped. Default: 0.0.
        scale_by_keep (bool): If True, surviving samples are divided by the keep
            probability ``1 - drop_prob``. Default: True.
    """

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        """Apply per-sample path dropping to ``x``.

        Returns ``x`` itself (no copy) in eval mode or when ``drop_prob`` is 0.0;
        otherwise returns ``x`` multiplied by a per-sample random mask.
        """
        # Nothing to do outside training, or when no path can ever be dropped.
        if not self.training or self.drop_prob == 0.0:
            return x

        survival = 1 - self.drop_prob
        # One Bernoulli draw per entry along dim 0; the singleton trailing dims
        # let the mask broadcast over the rest of the tensor.
        mask_shape = (x.shape[0], *((1,) * (x.ndim - 1)))
        mask = x.new_empty(mask_shape).bernoulli_(survival)
        # Skip the rescale when survival == 0 to avoid dividing by zero.
        if self.scale_by_keep and survival > 0.0:
            mask.div_(survival)
        return x * mask

    def extra_repr(self):
        """Return the drop probability formatted to three decimals for ``repr``."""
        rounded = round(self.drop_prob, 3)
        return f"drop_prob={rounded:0.3f}"


class ConvNeXtBlock(nn.Module):
    r"""ConvNeXt-style residual block operating on (N, C, H, W) tensors.

    Pipeline, with input and output both having ``in_channels`` channels:
        depthwise Conv2d (in_channels -> out_channels, replicate padding)
        -> parameter-free layer norm over every non-batch dimension
        -> permute to (N, H, W, C)
        -> Linear (out_channels -> 4 * out_channels) -> GELU
        -> Linear (4 * out_channels -> in_channels)
        -> optional per-channel Layer Scale (``gamma``)
        -> permute back to (N, C, H, W)
        -> stochastic depth -> residual add with the block input

    NOTE: unlike the reference ConvNeXt block, the normalization here is
    ``nn.functional.layer_norm`` applied before the permute with
    ``normalized_shape = x.shape[1:]``, so it has no learnable affine
    parameters and normalizes jointly over channels and spatial positions.

    Because the depthwise conv uses ``groups=in_channels``, ``out_channels``
    must be a multiple of ``in_channels``. For the residual add to be valid the
    convolution must preserve the spatial size, i.e. ``padding`` should equal
    ``(kernel_size - 1) // 2`` for odd kernels (true for the defaults).

    Args:
        in_channels (int): Number of input (and output) channels.
        out_channels (int): Number of channels produced by the depthwise conv,
            i.e. the hidden width fed to the pointwise layers.
        kernel_size (int): Size of the convolution kernel. Default: 7.
        padding (int): Padding size for the convolution. Default: 3.
        drop_path (float): Stochastic depth rate. Default: 0.1.
        layer_scale_init_value (float): Initial value for Layer Scale; a value
            <= 0 disables Layer Scale (``gamma`` is None). Default: 1e-1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=7,
        padding=3,
        drop_path=0.1,
        layer_scale_init_value=1e-1,
    ):
        super().__init__()
        self.dwconv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=padding,
            groups=in_channels,
            padding_mode="replicate",
        )  # depthwise conv
        # Functional layer norm: no learnable weight/bias is registered.
        self.norm = nn.functional.layer_norm
        self.pwconv1 = nn.Linear(out_channels, 4 * out_channels)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * out_channels, in_channels)
        # gamma scales the output of pwconv2, whose last dimension is
        # in_channels, so it must be sized by in_channels. Sizing it by
        # out_channels made forward() fail whenever the two differ.
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones(in_channels), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, x):
        """Run the block on ``x`` of shape (N, in_channels, H, W); the output has the same shape."""
        residual = x
        x = self.dwconv(x)
        # Normalize over all non-batch dims (C, H, W) of the conv output.
        x = self.norm(x, x.shape[1:])
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)

        return residual + self.drop_path(x)


class TimeDownsamplingBlock(nn.Module):
    r"""Downsample the last axis by 2: layer norm -> (1, 2) conv with stride (1, 2) -> GELU.

    The layer norm is the functional form applied over every non-batch
    dimension of the input, so it carries no learnable parameters. The last
    axis is presumably the time axis (per the class name) -- confirm against callers.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        bias (bool): Whether the convolution has a bias term. Default: True.
    """

    def __init__(self, in_channels, out_channels, bias=True):
        super().__init__()
        # Kernel and stride are identical, so windows along the last axis do not overlap.
        window = (1, 2)
        self.norm = nn.functional.layer_norm
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=window,
            stride=window,
            bias=bias,
        )
        self.act = nn.GELU()

    def forward(self, x):
        """Normalize ``x`` over its non-batch dims, apply the strided conv, then GELU."""
        normalized = self.norm(x, x.shape[1:])
        return self.act(self.conv(normalized))