"""
Utils for Common Transformer Blocks used in UniCeption
References:
HuggingFace PyTorch Image Models (Timm)
CroCoV2
"""

import collections.abc
import math
from itertools import repeat
from typing import Callable, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.jit import Final

from uniception.models.utils.config import use_fused_attn

torch.backends.cuda.matmul.allow_tf32 = True


def _ntuple(n):
    "Helper function to create n-tuple."

    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return x
        return tuple(repeat(x, n))

    return parse


to_2tuple = _ntuple(2)


def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)

    return x * random_tensor


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        return f"drop_prob={round(self.drop_prob,3):0.3f}"


class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        bias = to_2tuple(bias)
        drop_probs = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)

        return x


class Attention(nn.Module):
    "Self-Attention Layer"

    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        latent_attn_dim: Optional[int] = None,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = nn.LayerNorm,
        custom_positional_encoding: Callable = None,
    ):
        """
        Initialize the Attention layer.

        Args:
            dim (int): Dimension of input features
            latent_attn_dim (int): Dimension of latent attention features (default: None)
            num_heads (int): Number of attention heads (default: 8)
            qkv_bias (bool): Whether to include bias in qkv projection (default: False)
            qk_norm (bool): Whether to normalize q and k (default: False)
            attn_drop (float): Dropout rate for attention weights (default: 0.)
            proj_drop (float): Dropout rate for output (default: 0.)
            norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
            custom_positional_encoding (Callable): Custom positional encoding function (default: None)
        """
        super().__init__()

        if latent_attn_dim is not None:
            assert latent_attn_dim % num_heads == 0, "latent_attn_dim should be divisible by num_heads"
            self.latent_attn_dim = latent_attn_dim
            self.latent_attn = True
        else:
            self.latent_attn = False
            assert dim % num_heads == 0, "dim should be divisible by num_heads"

        self.num_heads = num_heads
        self.head_dim = dim // num_heads if not self.latent_attn else latent_attn_dim // num_heads
        self.scale = self.head_dim**-0.5
        self.fused_attn = use_fused_attn()

        self.qkv = (
            nn.Linear(dim, dim * 3, bias=qkv_bias)
            if not self.latent_attn
            else nn.Linear(dim, latent_attn_dim * 3, bias=qkv_bias)
        )
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim) if not self.latent_attn else nn.Linear(latent_attn_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.custom_positional_encoding = custom_positional_encoding

    def forward(self, x: torch.Tensor, xpos: torch.Tensor = None) -> torch.Tensor:
        """
        Forward pass of the Attention layer.

        Args:
            x (torch.Tensor): Input features
            xpos (torch.Tensor): Positions of tokens (required when using custom positional encoding)

        Returns:
            torch.Tensor: Output features of same shape as input
        """
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        q, k = self.q_norm(q), self.k_norm(k)

        if self.custom_positional_encoding is not None:
            assert (
                xpos is not None
            ), "Positions of tokens (xpos) are a required input when using custom positional encoding"
            q = self.custom_positional_encoding(q, xpos)
            k = self.custom_positional_encoding(k, xpos)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v, dropout_p=(self.attn_drop.p if self.training else 0.0), scale=self.scale
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class CrossAttention(nn.Module):
    "Cross-Attention Layer"

    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = nn.LayerNorm,
        custom_positional_encoding: Callable = None,
    ):
        """
        Initialize the Cross-Attention layer.

        Args:
            dim (int): Dimension of input features
            num_heads (int): Number of attention heads (default: 8)
            qkv_bias (bool): Whether to include bias in qkv projection (default: False)
            qk_norm (bool): Whether to normalize q and k (default: False)
            attn_drop (float): Dropout rate for attention weights (default: 0.)
            proj_drop (float): Dropout rate for output (default: 0.)
            norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
            custom_positional_encoding (Callable): Custom positional encoding function (default: None)
        """
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.fused_attn = use_fused_attn()

        self.projq = nn.Linear(dim, dim, bias=qkv_bias)
        self.projk = nn.Linear(dim, dim, bias=qkv_bias)
        self.projv = nn.Linear(dim, dim, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.custom_positional_encoding = custom_positional_encoding

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        qpos: torch.Tensor = None,
        kpos: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        Forward pass of the Cross-Attention layer.

        Args:
            query (torch.Tensor): Query features
            key (torch.Tensor): Key features
            value (torch.Tensor): Value features
            qpos (torch.Tensor): Positions of queries (required when using custom positional encoding)
            kpos (torch.Tensor): Positions of keys (required when using custom positional encoding)

        Returns:
            torch.Tensor: Output features of same shape as input
        """
        B, Nq, C = query.shape
        Nk = key.shape[1]
        Nv = value.shape[1]

        q = self.projq(query).reshape(B, Nq, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        k = self.projk(key).reshape(B, Nk, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        v = self.projv(value).reshape(B, Nv, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.custom_positional_encoding is not None:
            assert (
                qpos is not None
            ), "Positions of queries (qpos) are a required input when using custom positional encoding"
            assert (
                kpos is not None
            ), "Positions of keys (kpos) are a required input when using custom positional encoding"
            q = self.custom_positional_encoding(q, qpos)
            k = self.custom_positional_encoding(k, kpos)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v, dropout_p=(self.attn_drop.p if self.training else 0.0), scale=self.scale
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.transpose(1, 2).reshape(B, Nq, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class LayerScale(nn.Module):
    "Layer Scale Layer"

    def __init__(
        self,
        dim: int,
        init_values: float = 1e-5,
        inplace: bool = False,
    ):
        """
        Initialize the Layer Scale layer

        Args:
            dim (int): Dimension of input features
            init_values (float): Initial value for LayerScale gamma (default: 1e-5)
            inplace (bool): Whether to perform inplace operations (default: False)
        """
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        "Forward pass of the Layer Scale layer"
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


class SelfAttentionBlock(nn.Module):
    "Self-Attention Block"

    def __init__(
        self,
        dim: int,
        num_heads: int,
        latent_attn_dim: Optional[int] = None,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        proj_drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values: Optional[float] = None,
        drop_path: float = 0.0,
        act_layer: nn.Module = nn.GELU,
        norm_layer: nn.Module = nn.LayerNorm,
        mlp_layer: nn.Module = Mlp,
        custom_positional_encoding: Callable = None,
    ):
        """
        Initialize the Self-Attention Block.

        Args:
            dim (int): Dimension of input features
            num_heads (int): Number of attention heads
            mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
            qkv_bias (bool): Whether to include bias in qkv projection (default: False)
            qk_norm (bool): Whether to normalize q and k (default: False)
            proj_drop (float): Dropout rate for output (default: 0.)
            attn_drop (float): Dropout rate for attention weights (default: 0.)
            init_values (float): Initial value for LayerScale gamma (default: None)
            drop_path (float): Dropout rate for stochastic depth (default: 0.)
            act_layer (nn.Module): Activation layer (default: nn.GELU)
            norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
            mlp_layer (nn.Module): MLP layer (default: Mlp)
            custom_positional_encoding (Callable): Custom positional encoding function (default: None)
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            latent_attn_dim=latent_attn_dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            custom_positional_encoding=custom_positional_encoding,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        self.mlp = mlp_layer(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=proj_drop,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.custom_positional_encoding = custom_positional_encoding

    def forward(self, x: torch.Tensor, xpos: torch.Tensor = None) -> torch.Tensor:
        """
        Forward pass of the Self-Attention Block.

        Args:
            x (torch.Tensor): Input features
            xpos (torch.Tensor): Positions of tokens (required when using custom positional encoding)

        Returns:
            torch.Tensor: Output features of same shape as input
        """
        if self.custom_positional_encoding is not None:
            assert (
                xpos is not None
            ), "Positions of tokens (xpos) are a required input when using custom positional encoding"
        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), xpos)))
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        return x


class CrossAttentionBlock(nn.Module):
    "Cross-Attention Block"

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        proj_drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values: Optional[float] = None,
        drop_path: float = 0.0,
        act_layer: nn.Module = nn.GELU,
        norm_layer: nn.Module = nn.LayerNorm,
        mlp_layer: nn.Module = Mlp,
        custom_positional_encoding: Callable = None,
        norm_cross_tokens: bool = True,
    ):
        """
        Initialize the Cross-Attention Block.

        Args:
            dim (int): Dimension of input features
            num_heads (int): Number of attention heads
            mlp_ratio (float): Ratio of hidden to input dimension in MLP (default: 4.)
            qkv_bias (bool): Whether to include bias in qkv projection (default: False)
            qk_norm (bool): Whether to normalize q and k (default: False)
            proj_drop (float): Dropout rate for output (default: 0.)
            attn_drop (float): Dropout rate for attention weights (default: 0.)
            init_values (float): Initial value for LayerScale gamma (default: None)
            drop_path (float): Dropout rate for stochastic depth (default: 0.)
            act_layer (nn.Module): Activation layer (default: nn.GELU)
            norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
            mlp_layer (nn.Module): MLP layer (default: Mlp)
            custom_positional_encoding (Callable): Custom positional encoding function (default: None)
            norm_cross_tokens (bool): Whether to normalize cross tokens (default: True)

        Returns:
            torch.Tensor: Output features of same shape as input
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            custom_positional_encoding=custom_positional_encoding,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm_y = norm_layer(dim) if norm_cross_tokens else nn.Identity()
        self.custom_positional_encoding = custom_positional_encoding
        self.norm2 = norm_layer(dim)
        self.cross_attn = CrossAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            custom_positional_encoding=custom_positional_encoding,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm3 = norm_layer(dim)
        self.mlp = mlp_layer(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=proj_drop,
        )
        self.ls3 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path3 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(
        self,
        x: torch.Tensor,
        y: torch.Tensor,
        xpos: torch.Tensor = None,
        ypos: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        Forward pass of the Cross-Attention Block.

        Args:
            x (torch.Tensor): Input features
            y (torch.Tensor): Cross features
            xpos (torch.Tensor): Positions of tokens (required when using custom positional encoding)
            ypos (torch.Tensor): Positions of cross tokens (required when using custom positional encoding)

        Returns:
            torch.Tensor: Output features of same shape as input
        """
        if self.custom_positional_encoding is not None:
            assert (
                xpos is not None
            ), "Positions of tokens (xpos) are a required input when using custom positional encoding"
            assert (
                ypos is not None
            ), "Positions of cross tokens (ypos) are a required input when using custom positional encoding"
        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), xpos)))
        y_ = self.norm_y(y)
        x = x + self.drop_path2(self.ls2(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos)))
        x = x + self.drop_path3(self.ls3(self.mlp(self.norm3(x))))
        return x


def dummy_positional_encoding(x, xpos):
    "Dummy function for positional encoding of tokens"
    x = x
    xpos = xpos
    return x


# copied from DiffTrsformer
class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine=True, memory_efficient=False):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            self.weight = nn.Parameter(torch.ones(dim))
        else:
            self.register_parameter("weight", None)

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        if self.weight is not None:
            output = output * self.weight
        return output

    def extra_repr(self) -> str:
        return f"dim={self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}"


def lambda_init_fn(depth):
    return 0.8 - 0.6 * math.exp(-0.3 * depth)  # copied from DiffTrsformer


class DiffAttention(nn.Module):
    "Differential Self-Attention Layer"

    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        depth: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = nn.LayerNorm,
        custom_positional_encoding: Callable = None,
    ):
        """
        Initialize the DiffAttention layer.

        Args:
            dim (int): Dimension of input features
            depth (int): Depth of the current layer, used in lambda initialization (default: 0)
            num_heads (int): Number of attention heads (default: 8)
            qkv_bias (bool): Whether to include bias in qkv projection (default: False)
            qk_norm (bool): Whether to normalize q and k (default: False)
            attn_drop (float): Dropout rate for attention weights (default: 0.)
            proj_drop (float): Dropout rate for output (default: 0.)
            norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
            custom_positional_encoding (Callable): Custom positional encoding function (default: None)
        """
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = dim // num_heads // 2
        self.scale = self.head_dim**-0.5
        self.fused_attn = use_fused_attn()

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.custom_positional_encoding = custom_positional_encoding

        # DiffTransformer specific
        self.lambda_init = lambda_init_fn(depth)
        self.lambda_q1 = nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1))
        self.lambda_k1 = nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1))
        self.lambda_q2 = nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1))
        self.lambda_k2 = nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1))

        self.subln = RMSNorm(2 * self.head_dim, eps=1e-5, elementwise_affine=True)

    def forward(self, x: torch.Tensor, xpos: torch.Tensor = None) -> torch.Tensor:
        """
        Forward pass of the Attention layer.

        Args:
            x (torch.Tensor): Input features
            xpos (torch.Tensor): Positions of tokens (required when using custom positional encoding)

        Returns:
            torch.Tensor: Output features of same shape as input
        """
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim * 2)
        q, k, v = torch.chunk(qkv, 3, dim=2)  # B, N, Nh, Dh

        q = q.view(B, N, 2 * self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(B, N, 2 * self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        v = v.view(B, N, self.num_heads, 2 * self.head_dim).permute(0, 2, 1, 3)

        q, k = self.q_norm(q), self.k_norm(k)

        if self.custom_positional_encoding is not None:
            assert (
                xpos is not None
            ), "Positions of tokens (xpos) are a required input when using custom positional encoding"
            q = self.custom_positional_encoding(q, xpos)
            k = self.custom_positional_encoding(k, xpos)

        q1, q2 = q.chunk(2, dim=1)  # split heads dimension into two
        k1, k2 = k.chunk(2, dim=1)  # split heads dimension into two

        if self.fused_attn:
            attn1 = F.scaled_dot_product_attention(
                q1, k1, v, dropout_p=(self.attn_drop.p if self.training else 0.0), scale=self.scale
            )
            attn2 = F.scaled_dot_product_attention(
                q2, k2, v, dropout_p=(self.attn_drop.p if self.training else 0.0), scale=self.scale
            )
        else:
            q1 = q1 * self.scale
            attn = q1 @ k1.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            attn1 = attn @ v

            q2 = q2 * self.scale
            attn = q2 @ k2.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            attn2 = attn @ v

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(q)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(q)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn = attn1 - lambda_full * attn2

        attn = self.subln(attn)
        attn = attn * (1 - self.lambda_init)
        attn = attn.reshape(B, N, self.num_heads * 2 * self.head_dim)

        x = self.proj(attn)
        x = self.proj_drop(x)
        return x


class DiffCrossAttention(nn.Module):
    "Differential Cross-Attention Layer, following https://arxiv.org/abs/2410.05258"

    fused_attn: Final[bool]

    def __init__(
        self,
        dim: int,
        depth: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = nn.LayerNorm,
        custom_positional_encoding: Callable = None,
    ):
        """
        Initialize the Cross-Attention layer.

        Args:
            dim (int): Dimension of input features
            depth (int): Depth of the current layer, used in lambda initialization (default: 0)
            num_heads (int): Number of attention heads (default: 8)
            qkv_bias (bool): Whether to include bias in qkv projection (default: False)
            qk_norm (bool): Whether to normalize q and k (default: False)
            attn_drop (float): Dropout rate for attention weights (default: 0.)
            proj_drop (float): Dropout rate for output (default: 0.)
            norm_layer (nn.Module): Normalization layer (default: nn.LayerNorm)
            custom_positional_encoding (Callable): Custom positional encoding function (default: None)
        """
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = dim // num_heads // 2
        self.scale = self.head_dim**-0.5
        self.fused_attn = use_fused_attn()

        self.projq = nn.Linear(dim, dim, bias=qkv_bias)
        self.projk = nn.Linear(dim, dim, bias=qkv_bias)
        self.projv = nn.Linear(dim, dim, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # DiffTransformer specific
        self.lambda_init = lambda_init_fn(depth)
        self.lambda_q1 = nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1))
        self.lambda_k1 = nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1))
        self.lambda_q2 = nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1))
        self.lambda_k2 = nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0, std=0.1))

        self.subln = RMSNorm(2 * self.head_dim, eps=1e-5, elementwise_affine=True)

        self.custom_positional_encoding = custom_positional_encoding

    def lambda_init_fn(self, depth):
        return 0.8 - 0.6 * math.exp(-0.3 * depth)  # copied from DiffTrsformer

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        qpos: torch.Tensor = None,
        kpos: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        Forward pass of the Cross-Attention layer.

        Args:
            query (torch.Tensor): Query features
            key (torch.Tensor): Key features
            value (torch.Tensor): Value features
            qpos (torch.Tensor): Positions of queries (required when using custom positional encoding)
            kpos (torch.Tensor): Positions of keys (required when using custom positional encoding)

        Returns:
            torch.Tensor: Output features of same shape as input
        """
        B, Nq, C = query.shape
        Nk = key.shape[1]
        Nv = value.shape[1]

        q = self.projq(query).reshape(B, Nq, 2 * self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        k = self.projk(key).reshape(B, Nk, 2 * self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        v = self.projv(value).reshape(B, Nv, self.num_heads, 2 * self.head_dim).permute(0, 2, 1, 3)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.custom_positional_encoding is not None:
            assert (
                qpos is not None
            ), "Positions of queries (qpos) are a required input when using custom positional encoding"
            assert (
                kpos is not None
            ), "Positions of keys (kpos) are a required input when using custom positional encoding"
            q = self.custom_positional_encoding(q, qpos)
            k = self.custom_positional_encoding(k, kpos)

        q1, q2 = q.chunk(2, dim=1)  # split heads dimension into two
        k1, k2 = k.chunk(2, dim=1)  # split heads dimension into two

        if self.fused_attn:
            attn1 = F.scaled_dot_product_attention(
                q1, k1, v, dropout_p=(self.attn_drop.p if self.training else 0.0), scale=self.scale
            )
            attn2 = F.scaled_dot_product_attention(
                q2, k2, v, dropout_p=(self.attn_drop.p if self.training else 0.0), scale=self.scale
            )
        else:
            q1 = q1 * self.scale
            attn = q1 @ k1.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            attn1 = attn @ v

            q2 = q2 * self.scale
            attn = q2 @ k2.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            attn2 = attn @ v

        attn1 = attn1.transpose(1, 2)  # B, Nq, Nh, Dh
        attn2 = attn2.transpose(1, 2)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(q)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(q)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn = attn1 - lambda_full * attn2

        attn = self.subln(attn)
        attn = attn * (1 - self.lambda_init)
        attn = attn.reshape(B, Nq, self.num_heads * 2 * self.head_dim)

        x = self.proj(attn)
        x = self.proj_drop(x)
        return x


class DiffSelfAttentionBlock(SelfAttentionBlock):
    "Differential Self-Attention Block"

    def __init__(
        self,
        dim: int,
        depth: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        proj_drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values: Optional[float] = None,
        drop_path: float = 0.0,
        act_layer: nn.Module = nn.GELU,
        norm_layer: nn.Module = nn.LayerNorm,
        mlp_layer: nn.Module = Mlp,
        custom_positional_encoding: Callable = None,
    ):
        super().__init__(
            dim=dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            proj_drop=proj_drop,
            attn_drop=attn_drop,
            init_values=init_values,
            drop_path=drop_path,
            act_layer=act_layer,
            norm_layer=norm_layer,
            mlp_layer=mlp_layer,
            custom_positional_encoding=custom_positional_encoding,
        )

        self.attn = DiffAttention(
            dim,
            depth,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            custom_positional_encoding=custom_positional_encoding,
        )


class DiffCrossAttentionBlock(CrossAttentionBlock):
    "Differential Cross-Attention Block"

    def __init__(
        self,
        dim: int,
        depth: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        proj_drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values: Optional[float] = None,
        drop_path: float = 0.0,
        act_layer: nn.Module = nn.GELU,
        norm_layer: nn.Module = nn.LayerNorm,
        mlp_layer: nn.Module = Mlp,
        custom_positional_encoding: Callable = None,
        norm_cross_tokens: bool = True,
    ):
        super().__init__(
            dim=dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            proj_drop=proj_drop,
            attn_drop=attn_drop,
            init_values=init_values,
            drop_path=drop_path,
            act_layer=act_layer,
            norm_layer=norm_layer,
            mlp_layer=mlp_layer,
            custom_positional_encoding=custom_positional_encoding,
            norm_cross_tokens=norm_cross_tokens,
        )

        self.cross_attn = DiffCrossAttention(
            dim,
            depth,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            custom_positional_encoding=custom_positional_encoding,
        )


if __name__ == "__main__":
    # Init Attention & CrossAttention classes
    self_attn = Attention(dim=768, custom_positional_encoding=dummy_positional_encoding)
    cross_attn = CrossAttention(dim=768, custom_positional_encoding=dummy_positional_encoding)

    # Perform dummy inference with the Attention classes
    dummy_input = torch.randn((1, 256, 768))
    dummy_x = torch.arange(16)
    dummy_y = torch.arange(16)
    dummy_xpos = torch.cartesian_prod(dummy_y, dummy_x).view(1, 256, 2).expand(1, -1, 2).clone()
    self_attn_output = self_attn(dummy_input, dummy_xpos)
    cross_attn_output = cross_attn(dummy_input, dummy_input, dummy_input, dummy_xpos, dummy_xpos)
    print("Init of Attention & CrossAttention classes is successful!")

    # Init SelfAttentionBlock & CrossAttentionBlock
    self_attn_block = SelfAttentionBlock(dim=768, num_heads=16, custom_positional_encoding=dummy_positional_encoding)
    cross_attn_block = CrossAttentionBlock(dim=768, num_heads=16, custom_positional_encoding=dummy_positional_encoding)

    # Perform dummy inference with the Attention blocks
    self_attn_block_output = self_attn_block(dummy_input, dummy_xpos)
    cross_attn_block_output = cross_attn_block(dummy_input, dummy_input, dummy_xpos, dummy_xpos)
    print("Init of SelfAttentionBlock & CrossAttentionBlock is successful!")

    # Init DiffAttention & DiffCrossAttention classes
    diff_self_attn = DiffAttention(dim=768, depth=0, custom_positional_encoding=dummy_positional_encoding)
    diff_cross_attn = DiffCrossAttention(dim=768, depth=0, custom_positional_encoding=dummy_positional_encoding)

    # Perform dummy inference with the DiffAttention classes
    diff_self_attn_output = diff_self_attn(dummy_input, dummy_xpos)
    diff_cross_attn_output = diff_cross_attn(dummy_input, dummy_input, dummy_input, dummy_xpos, dummy_xpos)
    print("Init of DiffAttention & DiffCrossAttention classes is successful!")

    # Init DiffSelfAttentionBlock & DiffCrossAttentionBlock
    diff_self_attn_block = DiffSelfAttentionBlock(
        dim=768, depth=0, num_heads=8, custom_positional_encoding=dummy_positional_encoding
    )
    diff_cross_attn_block = DiffCrossAttentionBlock(
        dim=768, depth=0, num_heads=8, custom_positional_encoding=dummy_positional_encoding
    )

    # Perform dummy inference with the DiffAttention blocks
    diff_self_attn_block_output = diff_self_attn_block(dummy_input, dummy_xpos)
    diff_cross_attn_block_output = diff_cross_attn_block(dummy_input, dummy_input, dummy_xpos, dummy_xpos)

    print("Init of DiffSelfAttentionBlock & DiffCrossAttentionBlock is successful!")