# Copyright 2025 Dhruv Nair. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
RFDiffusion3 Transformer model.

This module provides a diffusers-compatible implementation of the RFD3
architecture for protein structure prediction and generation.

The module structure matches the foundry checkpoint format for direct
weight loading.
"""

import math
from dataclasses import dataclass
from functools import partial
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin


@dataclass
class RFDiffusionTransformerOutput:
    """Output class for RFDiffusion transformer."""

    # xyz: denoised atom coordinates; single/pair: final representations.
    xyz: torch.Tensor
    single: torch.Tensor
    pair: torch.Tensor
    sequence_logits: Optional[torch.Tensor] = None
    sequence_indices: Optional[torch.Tensor] = None


# Convenience alias: nn.Linear without a bias term.
linearNoBias = partial(nn.Linear, bias=False)


class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize by the RMS over the last dim, then apply learned scale.
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        return x / rms * self.weight


class FourierEmbedding(nn.Module):
    """Fourier feature embedding for timesteps with learned weights."""

    def __init__(self, c: int):
        super().__init__()
        self.c = c
        # Frequencies `w` and phases `b` are buffers (not trained), but are
        # drawn from a normal distribution in reset_parameters().
        self.register_buffer("w", torch.zeros(c, dtype=torch.float32))
        self.register_buffer("b", torch.zeros(c, dtype=torch.float32))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        nn.init.normal_(self.w)
        nn.init.normal_(self.b)

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        # Random Fourier features: cos(2*pi*(t*w + b)), one feature per channel.
        return torch.cos(2 * math.pi * (t[..., None] * self.w + self.b))


class LinearBiasInit(nn.Linear):
    """Linear layer with custom bias initialization."""

    def __init__(self, *args, biasinit: float = -2.0, **kwargs):
        # Must be set before super().__init__, which calls reset_parameters().
        self.biasinit = biasinit
        super().__init__(*args, **kwargs)

    def reset_parameters(self) -> None:
        super().reset_parameters()
        if self.bias is not None:
            self.bias.data.fill_(self.biasinit)


class RMSNormNoWeight(nn.Module):
    """RMSNorm without learnable weight (elementwise_affine=False)."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        return x / rms


class AdaLN(nn.Module):
    """Adaptive Layer Normalization.

    Normalizes `a` without an affine term, then applies a gain and bias
    that are predicted from the conditioning signal `s`.
    """

    def __init__(self, c_a: int, c_s: int):
        super().__init__()
        self.ln_a = RMSNormNoWeight(c_a)
        self.ln_s = RMSNorm(c_s)
        self.to_gain = nn.Sequential(nn.Linear(c_s, c_a), nn.Sigmoid())
        self.to_bias = linearNoBias(c_s, c_a)

    def forward(self, a: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
        a = self.ln_a(a)
        s = self.ln_s(s)
        return self.to_gain(s) * a + self.to_bias(s)


class Transition(nn.Module):
    """SwiGLU-style transition block matching foundry naming."""

    def __init__(self, c: int, n: int = 4):
        super().__init__()
        self.layer_norm_1 = RMSNorm(c)
        self.linear_1 = linearNoBias(c, n * c)
        self.linear_2 = linearNoBias(c, n * c)
        self.linear_3 = linearNoBias(n * c, c)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.layer_norm_1(x)
        # SwiGLU: silu(W1 x) * (W2 x), projected back down by W3.
        return self.linear_3(F.silu(self.linear_1(x)) * self.linear_2(x))


class ConditionedTransitionBlock(nn.Module):
    """SwiGLU transition with adaptive layer norm conditioning."""

    def __init__(self, c_token: int, c_s: int, n: int = 2):
        super().__init__()
        self.ada_ln = AdaLN(c_a=c_token, c_s=c_s)
        self.linear_1 = linearNoBias(c_token, c_token * n)
        self.linear_2 = linearNoBias(c_token, c_token * n)
        # Output gate; bias init of -2 makes the sigmoid gate start mostly closed.
        self.linear_output_project = nn.Sequential(
            LinearBiasInit(c_s, c_token, biasinit=-2.0),
            nn.Sigmoid(),
        )
        self.linear_3 = linearNoBias(c_token * n, c_token)

    def forward(self, a: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
        a = self.ada_ln(a, s)
        b = F.silu(self.linear_1(a)) * self.linear_2(a)
        # Gate predicted from the conditioning signal `s`.
        return self.linear_output_project(s) * self.linear_3(b)


class MultiDimLinear(nn.Linear):
    """Linear layer that reshapes output to multi-dimensional shape."""

    def __init__(self, in_features: int, out_shape: Tuple[int, ...], norm: bool = False, **kwargs):
        self.out_shape = out_shape
        # Flattened output size = product of out_shape dims.
        out_features = 1
        for d in out_shape:
            out_features *= d
        super().__init__(in_features, out_features, **kwargs)
        if norm:
            self.ln = RMSNorm(out_features)
            self.use_ln = True
        else:
            self.use_ln = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = super().forward(x)
        if self.use_ln:
            # Norm is applied over the flattened output before reshaping.
            out = self.ln(out)
        return out.reshape(x.shape[:-1] + self.out_shape)


class AttentionPairBias(nn.Module):
    """Attention with pairwise bias for Pairformer."""

    def __init__(
        self,
        c_a: int,
        c_s: int,
        c_pair: int,
        n_head: int = 8,
        kq_norm: bool = False,
    ):
        super().__init__()
        self.n_head = n_head
        self.c_a = c_a
        self.c_pair = c_pair
        self.c = c_a // n_head  # per-head channel dim
        self.to_q = MultiDimLinear(c_a, (n_head, self.c))
        self.to_k = MultiDimLinear(c_a, (n_head, self.c), bias=False, norm=True)
        self.to_v = MultiDimLinear(c_a, (n_head, self.c), bias=False, norm=True)
        self.to_b = linearNoBias(c_pair, n_head)
        self.to_g = nn.Sequential(
            MultiDimLinear(c_a, (n_head, self.c), bias=False),
            nn.Sigmoid(),
        )
        self.to_a = linearNoBias(c_a, c_a)
        self.ln_0 = RMSNorm(c_pair)
        self.ln_1 = RMSNorm(c_a)

    def forward(
        self,
        a: torch.Tensor,
        s: Optional[torch.Tensor],
        z: torch.Tensor,
        beta: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # NOTE(review): `s` is accepted but unused in this forward — presumably
        # kept for interface parity with conditioned attention; confirm.
        a = self.ln_1(a)
        q = self.to_q(a)
        k = self.to_k(a)
        v = self.to_v(a)
        # Pairwise bias per head, derived from the pair representation z.
        b = self.to_b(self.ln_0(z))
        if beta is not None:
            b = b + beta[..., None]
        g = self.to_g(a)
        q = q / math.sqrt(self.c)
        # attn has dims (..., i, j, h); softmax over the key axis j (dim=-2).
        attn = torch.einsum("...ihd,...jhd->...ijh", q, k) + b
        attn = F.softmax(attn, dim=-2)
        out = torch.einsum("...ijh,...jhc->...ihc", attn, v)
        out = g * out
        out = out.flatten(start_dim=-2)  # merge (head, channel) back to c_a
        out = self.to_a(out)
        return out


class LocalAttentionPairBias(nn.Module):
    """Local attention with pairwise bias for diffusion transformer blocks."""

    def __init__(
        self,
        c_a: int,
        c_s: int,
        c_pair: int,
        n_head: int = 16,
        kq_norm: bool = True,
    ):
        super().__init__()
        self.n_head = n_head
        self.c = c_a
        self.c_head = c_a // n_head
        self.c_s = c_s
        self.use_checkpointing = False
        self.to_q = linearNoBias(c_a, c_a)
        self.to_k = linearNoBias(c_a, c_a)
        self.to_v = linearNoBias(c_a, c_a)
        self.to_b = linearNoBias(c_pair, n_head)
        self.to_g = nn.Sequential(linearNoBias(c_a, c_a), nn.Sigmoid())
        self.to_o = linearNoBias(c_a, c_a)
        self.kq_norm = kq_norm
        if kq_norm:
            self.ln_q = RMSNorm(c_a)
            self.ln_k = RMSNorm(c_a)
        # With a conditioning channel, use AdaLN + gated output projection;
        # otherwise plain RMSNorm on the input.
        if c_s is not None and c_s > 0:
            self.ada_ln_1 = AdaLN(c_a=c_a, c_s=c_s)
            self.linear_output_project = nn.Sequential(
                LinearBiasInit(c_s, c_a, biasinit=-2.0),
                nn.Sigmoid(),
            )
        else:
            self.ln_1 = RMSNorm(c_a)

    def forward(
        self,
        a: torch.Tensor,
        s: Optional[torch.Tensor],
        z: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        if self.c_s is not None and self.c_s > 0:
            a = self.ada_ln_1(a, s)
        else:
            a = self.ln_1(a)
        q = self.to_q(a)
        k = self.to_k(a)
        v = self.to_v(a)
        g = self.to_g(a)
        if self.kq_norm:
            # QK norm applied on the full c_a dim, before the head split.
            q = self.ln_q(q)
            k = self.ln_k(k)
        batch_dims = a.shape[:-2]
        L = a.shape[-2]
        # Split heads and move the head axis in front of the sequence axis.
        q = q.view(*batch_dims, L, self.n_head, self.c_head).transpose(-2, -3)
        k = k.view(*batch_dims, L, self.n_head, self.c_head).transpose(-2, -3)
        v = v.view(*batch_dims, L, self.n_head, self.c_head).transpose(-2, -3)
        g = g.view(*batch_dims, L, self.n_head, self.c_head).transpose(-2, -3)
        # Pair bias: (..., L, L, n_head) -> (..., n_head, L, L).
        b = self.to_b(z).permute(*range(len(batch_dims)), -1, -3, -2)
        attn = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.c_head)
        attn = attn + b
        attn = F.softmax(attn, dim=-1)
        out = torch.matmul(attn, v)
        out = out * g
        out = out.transpose(-2, -3).contiguous()
        out = out.view(*batch_dims, L, self.c)
        out = self.to_o(out)
        if self.c_s is not None and self.c_s > 0:
            # Conditioned output gate.
            out = self.linear_output_project(s) * out
        return out


class PairformerBlock(nn.Module):
    """Pairformer block with attention and transitions."""

    def __init__(
        self,
        c_s: int,
        c_z: int,
        attention_pair_bias: dict,
        n_transition: int = 4,
        p_drop: float = 0.1,
        **kwargs,
    ):
        super().__init__()
        self.z_transition = Transition(c=c_z, n=n_transition)
        # NOTE(review): s_transition only exists when c_s > 0, but forward
        # uses it whenever `s is not None` — a c_s == 0 block called with a
        # non-None `s` would raise AttributeError; confirm intended usage.
        if c_s > 0:
            self.s_transition = Transition(c=c_s, n=n_transition)
        self.attention_pair_bias = AttentionPairBias(
            c_a=c_s, c_s=0, c_pair=c_z, **attention_pair_bias
        )

    def forward(
        self,
        s: torch.Tensor,
        z: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        z = z + self.z_transition(z)
        if s is not None:
            # beta=0 leaves the pair bias unchanged (no extra additive bias).
            beta = torch.tensor([0.0], device=z.device)
            s = s + self.attention_pair_bias(s, None, z, beta=beta)
            s = s + self.s_transition(s)
        return s, z


class StructureLocalAtomTransformerBlock(nn.Module):
    """Single block for atom/token transformer."""

    def __init__(
        self,
        c_atom: int,
        c_s: Optional[int],
        c_atompair: int,
        n_head: int = 4,
        dropout: float = 0.0,
        kq_norm: bool = True,
        **kwargs,
    ):
        super().__init__()
        self.c_s = c_s
        self.dropout = nn.Dropout(dropout)
        self.attention_pair_bias = LocalAttentionPairBias(
            c_a=c_atom, c_s=c_s, c_pair=c_atompair, n_head=n_head, kq_norm=kq_norm
        )
        # Conditioned transition when a conditioning channel exists.
        if c_s is not None and c_s > 0:
            self.transition_block = ConditionedTransitionBlock(c_token=c_atom, c_s=c_s)
        else:
            self.transition_block = Transition(c=c_atom, n=4)

    def forward(
        self,
        q: torch.Tensor,
        c: Optional[torch.Tensor],
        p: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        # Residual attention (pair-biased by p) followed by residual transition.
        q = q + self.dropout(self.attention_pair_bias(q, c, p, **kwargs))
        if self.c_s is not None and self.c_s > 0:
            q = q + self.transition_block(q, c)
        else:
            q = q + self.transition_block(q)
        return q


class GatedCrossAttention(nn.Module):
    """Gated cross attention for upcast/downcast."""
    def __init__(
        self,
        c_query: int,
        c_kv: int,
        c_model: int = 128,
        n_head: int = 4,
        kq_norm: bool = True,
        dropout: float = 0.0,
        **kwargs,
    ):
        super().__init__()
        self.n_head = n_head
        self.scale = 1 / math.sqrt(c_model // n_head)
        self.ln_q = RMSNorm(c_query)
        self.ln_kv = RMSNorm(c_kv)
        self.to_q = linearNoBias(c_query, c_model)
        self.to_k = linearNoBias(c_kv, c_model)
        self.to_v = linearNoBias(c_kv, c_model)
        self.to_g = nn.Sequential(linearNoBias(c_query, c_model), nn.Sigmoid())
        self.to_out = nn.Sequential(nn.Linear(c_model, c_query), nn.Dropout(dropout))
        self.kq_norm = kq_norm
        if kq_norm:
            self.k_norm = RMSNorm(c_model)
            self.q_norm = RMSNorm(c_model)

    def forward(
        self,
        q: torch.Tensor,
        kv: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Supports both 3D (B, L, C) and 4D (B, n_tok, L, C) inputs.
        q_in = self.ln_q(q)
        kv = self.ln_kv(kv)
        q_proj = self.to_q(q_in)
        k = self.to_k(kv)
        v = self.to_v(kv)
        g = self.to_g(q_in)
        if self.kq_norm:
            k = self.k_norm(k)
            q_proj = self.q_norm(q_proj)
        B = q.shape[0]
        n_tok = q.shape[1] if q.ndim == 4 else 1
        L_q = q.shape[-2]
        L_kv = kv.shape[-2]
        c_head = q_proj.shape[-1] // self.n_head
        # Split heads; head axis moves to position 1 in both layouts.
        if q.ndim == 4:
            q_proj = q_proj.view(B, n_tok, L_q, self.n_head, c_head).permute(0, 3, 1, 2, 4)
            k = k.view(B, n_tok, L_kv, self.n_head, c_head).permute(0, 3, 1, 2, 4)
            v = v.view(B, n_tok, L_kv, self.n_head, c_head).permute(0, 3, 1, 2, 4)
            g = g.view(B, n_tok, L_q, self.n_head, c_head).permute(0, 3, 1, 2, 4)
        else:
            q_proj = q_proj.view(B, L_q, self.n_head, c_head).permute(0, 2, 1, 3)
            k = k.view(B, L_kv, self.n_head, c_head).permute(0, 2, 1, 3)
            v = v.view(B, L_kv, self.n_head, c_head).permute(0, 2, 1, 3)
            g = g.view(B, L_q, self.n_head, c_head).permute(0, 2, 1, 3)
        attn = torch.matmul(q_proj, k.transpose(-1, -2)) * self.scale
        if attn_mask is not None:
            # Broadcast the boolean mask up to attn's rank; True = keep.
            if q.ndim == 4:
                while attn_mask.ndim < attn.ndim:
                    attn_mask = attn_mask.unsqueeze(0)
                if attn_mask.shape[1] != self.n_head and attn_mask.shape[1] != 1:
                    attn_mask = attn_mask.unsqueeze(1)
            else:
                attn_mask = attn_mask.unsqueeze(-3)
            attn = attn.masked_fill(~attn_mask, float("-inf"))
        attn = F.softmax(attn, dim=-1)
        out = torch.matmul(attn, v)
        out = out * g
        # Merge heads back and project to the query channel size.
        if q.ndim == 4:
            out = out.permute(0, 2, 3, 1, 4).contiguous()
            out = out.view(B, n_tok, L_q, -1)
        else:
            out = out.permute(0, 2, 1, 3).contiguous()
            out = out.view(B, L_q, -1)
        out = self.to_out(out)
        return out


class Upcast(nn.Module):
    """Upcast from token level to atom level."""

    def __init__(
        self,
        c_atom: int,
        c_token: int,
        method: str = "cross_attention",
        cross_attention_block: Optional[dict] = None,
        n_split: int = 6,
        **kwargs,
    ):
        super().__init__()
        self.method = method
        self.n_split = n_split
        if method == "broadcast":
            self.project = nn.Sequential(RMSNorm(c_token), linearNoBias(c_token, c_atom))
        elif method == "cross_attention":
            # Token channels are split into n_split key/value chunks.
            self.gca = GatedCrossAttention(
                c_query=c_atom,
                c_kv=c_token // n_split,
                **(cross_attention_block or {}),
            )

    def forward(self, q: torch.Tensor, a: torch.Tensor, tok_idx: torch.Tensor) -> torch.Tensor:
        """Update atom features `q` from token features `a` via tok_idx mapping."""
        if self.method == "broadcast":
            # Each atom receives the projection of its own token's feature.
            q = q + self.project(a)[..., tok_idx, :]
        elif self.method == "cross_attention":
            B, L, C = q.shape
            I = int(tok_idx.max().item()) + 1
            a_split = a.view(B, I, self.n_split, -1)
            q_grouped = self._group_atoms(q, tok_idx, I)
            valid_mask = self._build_valid_mask(tok_idx, I, q.device)
            attn_mask = torch.ones(I, q_grouped.shape[2], self.n_split, device=q.device, dtype=torch.bool)
            attn_mask[~valid_mask] = False  # padded atom slots attend to nothing
            q_update = self.gca(q_grouped, a_split, attn_mask=attn_mask)
            q = q + self._ungroup_atoms(q_update, valid_mask, L)
        return q

    def _group_atoms(self, q: torch.Tensor, tok_idx: torch.Tensor, I: int) -> torch.Tensor:
        # Pack atoms into per-token slots, capped at 14 atoms per token;
        # atoms past the cap are dropped.
        # NOTE(review): the matching _ungroup_atoms assumes tok_idx lists
        # atoms in contiguous token order — confirm against callers.
        B, L, C = q.shape
        max_atoms_per_token = 14
        grouped = torch.zeros(B, I, max_atoms_per_token, C, device=q.device, dtype=q.dtype)
        counts = torch.zeros(I, dtype=torch.long, device=q.device)
        for i in range(L):
            t = tok_idx[i].item()
            if counts[t] < max_atoms_per_token:
                grouped[:, t, counts[t]] = q[:, i]
                counts[t] += 1
        return grouped

    def _build_valid_mask(self, tok_idx: torch.Tensor, I: int, device: torch.device) -> torch.Tensor:
        # Boolean (I, 14) mask marking which packed slots hold a real atom.
        max_atoms_per_token = 14
        valid_mask = torch.zeros(I, max_atoms_per_token, dtype=torch.bool, device=device)
        counts = torch.zeros(I, dtype=torch.long, device=device)
        for i in range(len(tok_idx)):
            t = tok_idx[i].item()
            if counts[t] < max_atoms_per_token:
                valid_mask[t, counts[t]] = True
                counts[t] += 1
        return valid_mask

    def _ungroup_atoms(self, grouped: torch.Tensor, valid_mask: torch.Tensor, L: int) -> torch.Tensor:
        # Inverse of _group_atoms: scatter valid slots back to a flat atom axis,
        # relying on atoms having been packed in order.
        B, I, n_atoms, C = grouped.shape
        out = torch.zeros(B, L, C, device=grouped.device, dtype=grouped.dtype)
        idx = 0
        for t in range(I):
            for a in range(n_atoms):
                if valid_mask[t, a] and idx < L:
                    out[:, idx] = grouped[:, t, a]
                    idx += 1
        return out


class Downcast(nn.Module):
    """Downcast from atom level to token level."""

    def __init__(
        self,
        c_atom: int,
        c_token: int,
        c_s: Optional[int] = None,
        method: str = "mean",
        cross_attention_block: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__()
        self.method = method
        self.c_token = c_token
        self.c_atom = c_atom
        if c_s is not None:
            self.process_s = nn.Sequential(RMSNorm(c_s), linearNoBias(c_s, c_token))
        else:
            self.process_s = None
        if method == "mean":
            # In "mean" mode `gca` is (despite the name) a plain projection.
            self.gca = linearNoBias(c_atom, c_token)
        elif method == "cross_attention":
            self.gca = GatedCrossAttention(
                c_query=c_token,
                c_kv=c_atom,
                **(cross_attention_block or {}),
            )

    def forward(
        self,
        q: torch.Tensor,
        a: Optional[torch.Tensor] = None,
        s: Optional[torch.Tensor] = None,
        tok_idx: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Pool atom features `q` to token features, residually updating `a`."""
        if q.ndim == 2:
            q = q.unsqueeze(0)
            squeeze = True
        else:
            squeeze = False
        B, L, _ = q.shape
        I = int(tok_idx.max().item()) + 1
        if self.method == "mean":
            # Mean of projected atom features per token.
            projected = self.gca(q)
            a_update = torch.zeros(B, I, self.c_token, device=q.device, dtype=q.dtype)
            counts = torch.zeros(B, I, 1, device=q.device, dtype=q.dtype)
            for i in range(L):
                t = tok_idx[i]
                a_update[:, t] += projected[:, i]
                counts[:, t] += 1
            a_update = a_update / (counts + 1e-8)
        elif self.method == "cross_attention":
            if a is None:
                a = torch.zeros(B, I, self.c_token, device=q.device, dtype=q.dtype)
            elif a.ndim == 2:
                a = a.unsqueeze(0)
            # One token query attends over its (up to 14) grouped atoms.
            q_grouped, valid_mask = self._group_atoms(q, tok_idx, I)
            attn_mask = valid_mask.unsqueeze(-2)
            a_update = self.gca(a.unsqueeze(-2), q_grouped, attn_mask=attn_mask).squeeze(-2)
        else:
            a_update = torch.zeros(B, I, self.c_token, device=q.device, dtype=q.dtype)
        if a is not None:
            if a.ndim == 2:
                a = a.unsqueeze(0)
            a = a + a_update
        else:
            a = a_update
        if self.process_s is not None and s is not None:
            if s.ndim == 2:
                s = s.unsqueeze(0)
            a = a + self.process_s(s)
        if squeeze:
            a = a.squeeze(0)
        return a

    def _group_atoms(
        self, q: torch.Tensor, tok_idx: torch.Tensor, I: int
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Same packing scheme as Upcast._group_atoms (cap 14 atoms/token),
        # additionally returning the valid-slot mask.
        B, L, C = q.shape
        max_atoms_per_token = 14
        grouped = torch.zeros(B, I, max_atoms_per_token, C, device=q.device, dtype=q.dtype)
        valid_mask = torch.zeros(I, max_atoms_per_token, dtype=torch.bool, device=q.device)
        counts = torch.zeros(I, dtype=torch.long, device=q.device)
        for i in range(L):
            t = tok_idx[i].item()
            if counts[t] < max_atoms_per_token:
                grouped[:, t, counts[t]] = q[:, i]
                valid_mask[t, counts[t]] = True
                counts[t] += 1
        return grouped, valid_mask


class LinearEmbedWithPool(nn.Module):
    """Linear embedding with pooling to token level."""

    def __init__(self, c_token: int):
        super().__init__()
        self.c_token = c_token
        self.linear = linearNoBias(3, c_token)  # embeds 3D coordinates

    def forward(self, r: torch.Tensor, tok_idx: torch.Tensor) -> torch.Tensor:
        # Embed per-atom coordinates, then mean-pool within each token.
        B = r.shape[0]
        I = int(tok_idx.max().item()) + 1
        q = self.linear(r)
        a = torch.zeros(B, I, self.c_token, device=r.device, dtype=q.dtype)
        counts = torch.zeros(B, I, 1, device=r.device, dtype=q.dtype)
        for i in range(r.shape[1]):
            t = tok_idx[i]
            a[:, t] += q[:, i]
            counts[:, t] += 1
        return a / (counts + 1e-8)


class LinearSequenceHead(nn.Module):
    """Sequence prediction head."""

    def __init__(self, c_token: int):
        super().__init__()
        n_tok_all = 32  # size of the output token vocabulary
        # All-ones mask as registered here; masking logic is a no-op unless
        # the buffer is overwritten externally (e.g. by a loaded checkpoint).
        mask = torch.ones(n_tok_all, dtype=torch.bool)
        self.register_buffer("valid_out_mask", mask)
        self.linear = nn.Linear(c_token, n_tok_all)

    def forward(self, a: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        logits = self.linear(a)
        probs = F.softmax(logits, dim=-1)
        # Zero out invalid output classes, then renormalize before argmax.
        probs = probs * self.valid_out_mask[None, None, :].to(probs.device)
        probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
        indices = probs.argmax(dim=-1)
        return logits, indices


class LocalAtomTransformer(nn.Module):
    """Atom-level transformer encoder."""

    def __init__(
        self,
        c_atom: int,
        c_s: Optional[int],
        c_atompair: int,
        atom_transformer_block: dict,
        n_blocks: int,
    ):
        super().__init__()
        self.blocks = nn.ModuleList([
            StructureLocalAtomTransformerBlock(
                c_atom=c_atom,
                c_s=c_s,
                c_atompair=c_atompair,
                **atom_transformer_block,
            )
            for _ in range(n_blocks)
        ])

    def forward(
        self,
        q: torch.Tensor,
        c: Optional[torch.Tensor],
        p: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        for block in self.blocks:
            q = block(q, c, p, **kwargs)
        return q


class LocalTokenTransformer(nn.Module):
    """Token-level transformer for diffusion."""

    def __init__(
        self,
        c_token: int,
        c_tokenpair: int,
        c_s: int,
        diffusion_transformer_block: dict,
        n_block: int,
        **kwargs,
    ):
        super().__init__()
        # Reuses the atom-transformer block type at token granularity.
        self.blocks = nn.ModuleList([
            StructureLocalAtomTransformerBlock(
                c_atom=c_token,
                c_s=c_s,
                c_atompair=c_tokenpair,
                **diffusion_transformer_block,
            )
            for _ in range(n_block)
        ])

    def forward(
        self,
        a: torch.Tensor,
        s: torch.Tensor,
        z: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        for block in self.blocks:
            a = block(a, s, z, **kwargs)
        return a


class CompactStreamingDecoder(nn.Module):
    """Decoder with upcast, atom transformer, and downcast."""

    def __init__(
        self,
        c_atom: int,
        c_atompair: int,
        c_token: int,
        c_s: int,
        c_tokenpair: int,
        atom_transformer_block: dict,
        upcast: dict,
        downcast: dict,
        n_blocks: int,
        **kwargs,
    ):
        super().__init__()
        self.n_blocks = n_blocks
        self.upcast = nn.ModuleList([
            Upcast(c_atom=c_atom, c_token=c_token, **upcast)
            for _ in range(n_blocks)
        ])
        self.atom_transformer = nn.ModuleList([
            StructureLocalAtomTransformerBlock(
                c_atom=c_atom,
                c_s=c_atom,
                c_atompair=c_atompair,
                **atom_transformer_block,
            )
            for _ in range(n_blocks)
        ])
        # A single shared downcast applied once after all blocks.
        self.downcast = Downcast(c_atom=c_atom, c_token=c_token, c_s=c_s, **downcast)

    def forward(
        self,
        a: torch.Tensor,
        s: torch.Tensor,
        z: torch.Tensor,
        q: torch.Tensor,
        c: torch.Tensor,
        p: torch.Tensor,
        tok_idx: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor, dict]:
        for i in range(self.n_blocks):
            q = self.upcast[i](q, a, tok_idx=tok_idx)
            q = self.atom_transformer[i](q, c, p, **kwargs)
        # Inputs are detached: the final token update does not backprop
        # into the decoder stack through this path.
        a = self.downcast(q.detach(), a.detach(), s.detach(), tok_idx=tok_idx)
        return a, q, {}


class DiffusionTokenEncoder(nn.Module):
    """Token encoder with pairformer stack for diffusion."""

    def __init__(
        self,
        c_s: int,
        c_z: int,
        c_token: int,
        c_atompair: int,
        n_pairformer_blocks: int,
        pairformer_block: dict,
        use_distogram: bool = True,
        use_self: bool = True,
        n_bins_distogram: int = 65,
        **kwargs,
    ):
        super().__init__()
        self.use_distogram = use_distogram
        self.use_self = use_self
        self.n_bins_distogram = n_bins_distogram
        self.transition_1 = nn.ModuleList([
            Transition(c=c_s, n=2),
            Transition(c=c_s, n=2),
        ])
        # Input to process_z is z concatenated with optional distogram channels.
        n_bins_noise = n_bins_distogram
        cat_c_z = (
            c_z
            + int(use_distogram) * n_bins_noise
            + int(use_self) * n_bins_distogram
        )
        self.process_z = nn.Sequential(
            RMSNorm(cat_c_z),
            linearNoBias(cat_c_z, c_z),
        )
        self.transition_2 = nn.ModuleList([
            Transition(c=c_z, n=2),
            Transition(c=c_z, n=2),
        ])
        self.pairformer_stack = nn.ModuleList([
            PairformerBlock(c_s=c_s, c_z=c_z, **pairformer_block)
            for _ in range(n_pairformer_blocks)
        ])

    def forward(
        self,
        s_init: torch.Tensor,
        z_init: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        B = z_init.shape[0] if z_init.ndim == 4 else 1
        s = s_init
        for b in range(2):
            s = s + self.transition_1[b](s)
        z = z_init
        if z.ndim == 3:
            z = z.unsqueeze(0).expand(B, -1, -1, -1)
        # Distogram features default to zeros when not supplied.
        z_list = [z]
        if self.use_distogram:
            d_noise = torch.zeros(
                z.shape[:-1] + (self.n_bins_distogram,), device=z.device, dtype=z.dtype
            )
            z_list.append(d_noise)
        if self.use_self:
            d_self = kwargs.get("d_self")
            if d_self is None:
                d_self = torch.zeros(
                    z.shape[:-1] + (self.n_bins_distogram,), device=z.device, dtype=z.dtype
                )
            z_list.append(d_self)
        z = torch.cat(z_list, dim=-1)
        z = self.process_z(z)
        for b in range(2):
            z = z + self.transition_2[b](z)
        for block in self.pairformer_stack:
            s, z = block(s, z)
        return s, z


class EmbeddingLayer(nn.Module):
    """Embedding layer for 1D features - simple linear projection."""

    def __init__(self, n_channels: int, output_channels: int):
        super().__init__()
        # Zero-initialized bias-free projection (weights come from checkpoint).
        self.weight = nn.Parameter(torch.zeros(output_channels, n_channels))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.linear(x, self.weight)


class OneDFeatureEmbedder(nn.Module):
    """Embeds 1D features into a single vector."""

    def __init__(self, features: dict, output_channels: int):
        super().__init__()
        self.features = {k: v for k, v in features.items() if v is not None}
        self.embedders = nn.ModuleDict({
            feature: EmbeddingLayer(n_channels, output_channels)
            for feature, n_channels in self.features.items()
        })

    def forward(self, f: dict) -> torch.Tensor:
        # Sum the embeddings of every feature present in `f`; features
        # missing from the dict are silently skipped.
        result = None
        for feature in self.features:
            x = f.get(feature)
            if x is not None:
                emb = self.embedders[feature](x.float())
                result = emb if result is None else result + emb
        return result if result is not None else torch.zeros(1)


class PositionPairDistEmbedder(nn.Module):
    """Embeds pairwise position distances."""

    def __init__(self, c_atompair: int, embed_frame: bool = True):
        super().__init__()
        self.embed_frame = embed_frame
        # NOTE(review): process_d is created when embed_frame=True but is not
        # used in this forward — presumably consumed elsewhere or vestigial.
        if embed_frame:
            self.process_d = linearNoBias(3, c_atompair)
        self.process_inverse_dist = linearNoBias(1, c_atompair)
        self.process_valid_mask = linearNoBias(1, c_atompair)

    def forward(self, ref_pos: torch.Tensor, valid_mask: torch.Tensor) -> torch.Tensor:
        D_LL = ref_pos.unsqueeze(-2) - ref_pos.unsqueeze(-3)
        # Squared pairwise distance, clamped away from zero.
        norm = torch.linalg.norm(D_LL, dim=-1, keepdim=True) ** 2
        norm = torch.clamp(norm, min=1e-6)
        inv_dist = 1 / (1 + norm)
        P_LL = self.process_inverse_dist(inv_dist) * valid_mask
        P_LL = P_LL + self.process_valid_mask(valid_mask.float()) * valid_mask
        return P_LL


class SinusoidalDistEmbed(nn.Module):
    """Sinusoidal embedding for pairwise distances."""

    def __init__(self, c_atompair: int, n_freqs: int = 32):
        super().__init__()
        self.n_freqs = n_freqs
        self.c_atompair = c_atompair
        self.output_proj = linearNoBias(2 * n_freqs, c_atompair)
        self.process_valid_mask = linearNoBias(1, c_atompair)

    def forward(self, pos: torch.Tensor, valid_mask: torch.Tensor) -> torch.Tensor:
        D_LL = pos.unsqueeze(-2) - pos.unsqueeze(-3)
        dist_matrix = torch.linalg.norm(D_LL, dim=-1)
        # Geometric frequency ladder, as in standard sinusoidal encodings.
        freq = torch.exp(
            -math.log(10000.0)
            * torch.arange(0, self.n_freqs, dtype=torch.float32)
            / self.n_freqs
        ).to(dist_matrix.device)
        angles = dist_matrix.unsqueeze(-1) * freq
        sincos_embed = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
        P_LL = self.output_proj(sincos_embed) * valid_mask
        P_LL = P_LL + self.process_valid_mask(valid_mask.float()) * valid_mask
        return P_LL


class RelativePositionEncoding(nn.Module):
    """Relative position encoding."""

    def __init__(self, r_max: int, s_max: int, c_z: int):
        super().__init__()
        self.r_max = r_max
        self.s_max = s_max
        num_tok_pos_bins = 2 * r_max + 3
        self.linear = linearNoBias(2 * num_tok_pos_bins + (2 * s_max + 2) + 1, c_z)

    def forward(self, f: dict) -> torch.Tensor:
        # NOTE(review): stub — returns zeros of the right shape; the binned
        # relative-position featurization is not implemented here.
        I = f.get("residue_index", torch.zeros(1)).shape[-1]
        device = f.get("residue_index", torch.zeros(1)).device
        return torch.zeros(I, I, self.linear.out_features, device=device)


class TokenInitializer(nn.Module):
    """Token embedding module for RFD3 matching foundry checkpoint structure."""

    def __init__(
        self,
        c_s: int = 384,
        c_z: int = 128,
        c_atom: int = 128,
        c_atompair: int = 16,
        r_max: int = 32,
        s_max: int = 2,
        n_pairformer_blocks: int = 2,
        atom_1d_features: Optional[dict] = None,
        token_1d_features: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__()
        # Default feature schemas: name -> channel count.
        if atom_1d_features is None:
            atom_1d_features = {
                "ref_atom_name_chars": 256,
                "ref_element": 128,
                "ref_charge": 1,
                "ref_mask": 1,
                "ref_is_motif_atom_with_fixed_coord": 1,
                "ref_is_motif_atom_unindexed": 1,
                "has_zero_occupancy": 1,
                "ref_pos": 3,
                "ref_atomwise_rasa": 3,
                "active_donor": 1,
                "active_acceptor": 1,
                "is_atom_level_hotspot": 1,
            }
        if token_1d_features is None:
            token_1d_features = {
                "ref_motif_token_type": 3,
                "restype": 32,
                "ref_plddt": 1,
                "is_non_loopy": 1,
            }
        cross_attention_block = {"n_head": 4, "c_model": c_atom, "dropout": 0.0, "kq_norm": True}
        self.atom_1d_embedder_1 = OneDFeatureEmbedder(atom_1d_features, c_s)
        self.atom_1d_embedder_2 = OneDFeatureEmbedder(atom_1d_features, c_atom)
        self.token_1d_embedder = OneDFeatureEmbedder(token_1d_features, c_s)
        self.downcast_atom = Downcast(
            c_atom=c_s, c_token=c_s, c_s=None, method="cross_attention",
            cross_attention_block=cross_attention_block
        )
        self.transition_post_token = Transition(c=c_s, n=2)
        self.transition_post_atom = Transition(c=c_s, n=2)
        self.process_s_init = nn.Sequential(RMSNorm(c_s), linearNoBias(c_s, c_s))
        self.to_z_init_i = linearNoBias(c_s, c_z)
        self.to_z_init_j = linearNoBias(c_s, c_z)
        self.relative_position_encoding = RelativePositionEncoding(r_max=r_max, s_max=s_max, c_z=c_z)
        self.relative_position_encoding2 = RelativePositionEncoding(r_max=r_max, s_max=s_max, c_z=c_z)
        self.process_token_bonds = linearNoBias(1, c_z)
        self.process_z_init = nn.Sequential(RMSNorm(c_z * 2), linearNoBias(c_z * 2, c_z))
        self.transition_1 = nn.ModuleList([Transition(c=c_z, n=2), Transition(c=c_z, n=2)])
        self.ref_pos_embedder_tok = PositionPairDistEmbedder(c_z, embed_frame=False)
        pairformer_block = {"attention_pair_bias": {"n_head": 16, "kq_norm": True}, "n_transition": 4}
        self.transformer_stack = nn.ModuleList([
            PairformerBlock(c_s=c_s, c_z=c_z, **pairformer_block)
            for _ in range(n_pairformer_blocks)
        ])
        self.process_s_trunk = nn.Sequential(RMSNorm(c_s), linearNoBias(c_s, c_atom))
        self.process_single_l = nn.Sequential(nn.ReLU(), linearNoBias(c_atom, c_atompair))
        self.process_single_m = nn.Sequential(nn.ReLU(), linearNoBias(c_atom, c_atompair))
        self.process_z = nn.Sequential(RMSNorm(c_z), linearNoBias(c_z, c_atompair))
        self.motif_pos_embedder = SinusoidalDistEmbed(c_atompair=c_atompair)
        self.ref_pos_embedder = PositionPairDistEmbedder(c_atompair, embed_frame=False)
        self.pair_mlp = nn.Sequential(
            nn.ReLU(),
            linearNoBias(c_atompair, c_atompair),
            nn.ReLU(),
            linearNoBias(c_atompair, c_atompair),
            nn.ReLU(),
            linearNoBias(c_atompair, c_atompair),
        )
        self.process_pll = linearNoBias(c_atompair, c_atompair)
        self.project_pll = linearNoBias(c_atompair, c_z)

    def forward(self, f: dict) -> dict:
        """Compute initial representations from input features."""
        # NOTE(review): stub — returns zero-valued single/pair representations
        # of the configured sizes; the feature embedders above are not invoked
        # here. Confirm whether the real initialization lives elsewhere.
        I = f.get("num_tokens", 100)
        device = next(self.parameters()).device
        dtype = next(self.parameters()).dtype
        s_init = torch.zeros(I, self.process_s_init[1].out_features, device=device, dtype=dtype)
        z_init = torch.zeros(I, I, self.process_z_init[1].out_features, device=device, dtype=dtype)
        return {"S_I": s_init, "Z_II": z_init}


class RFD3DiffusionModule(nn.Module):
    """
    RFD3 Diffusion Module matching foundry checkpoint structure.

    This module structure matches `model.diffusion_module.*` keys in the checkpoint.
    """

    def __init__(
        self,
        c_s: int = 384,
        c_z: int = 128,
        c_atom: int = 128,
        c_atompair: int = 16,
        c_token: int = 768,
        c_t_embed: int = 256,
        sigma_data: float = 16.0,
        n_pairformer_blocks: int = 2,
        n_diffusion_blocks: int = 18,
        n_atom_encoder_blocks: int = 3,
        n_atom_decoder_blocks: int = 3,
        n_head: int = 16,
        n_pairformer_head: int = 16,
        n_recycle: int = 2,
        p_drop: float = 0.0,
    ):
        super().__init__()
        self.sigma_data = sigma_data
        self.n_recycle = n_recycle
        self.process_r = linearNoBias(3, c_atom)
        self.to_r_update = nn.Sequential(RMSNorm(c_atom), linearNoBias(c_atom, 3))
        self.sequence_head = LinearSequenceHead(c_token)
        # Two independent timestep embeddings: index 0 -> atom channel,
        # index 1 -> single/token channel (see process_time).
        self.fourier_embedding = nn.ModuleList([
            FourierEmbedding(c_t_embed),
            FourierEmbedding(c_t_embed),
        ])
        self.process_n = nn.ModuleList([
            nn.Sequential(RMSNorm(c_t_embed), linearNoBias(c_t_embed, c_atom)),
            nn.Sequential(RMSNorm(c_t_embed), linearNoBias(c_t_embed, c_s)),
        ])
        cross_attention_block = {
            "n_head": 4,
            "c_model": c_atom,
            "dropout": p_drop,
            "kq_norm": True,
        }
        self.downcast_c = Downcast(
            c_atom=c_atom, c_token=c_s, c_s=None, method="cross_attention",
            cross_attention_block=cross_attention_block
        )
        self.downcast_q = Downcast(
            c_atom=c_atom, c_token=c_token, c_s=c_s, method="cross_attention",
            cross_attention_block=cross_attention_block
        )
        self.process_a = LinearEmbedWithPool(c_token)
        self.process_c = nn.Sequential(RMSNorm(c_atom), linearNoBias(c_atom, c_atom))
        atom_transformer_block = {
            "n_head": 4,
            "dropout": p_drop,
            "kq_norm": True,
        }
        self.encoder = LocalAtomTransformer(
            c_atom=c_atom,
            c_s=c_atom,
            c_atompair=c_atompair,
            atom_transformer_block=atom_transformer_block,
            n_blocks=n_atom_encoder_blocks,
        )
        pairformer_block = {
            "attention_pair_bias": {"n_head": n_pairformer_head, "kq_norm": False},
            "n_transition": 4,
        }
        self.diffusion_token_encoder = DiffusionTokenEncoder(
            c_s=c_s,
            c_z=c_z,
            c_token=c_token,
            c_atompair=c_atompair,
            n_pairformer_blocks=n_pairformer_blocks,
            pairformer_block=pairformer_block,
        )
        diffusion_transformer_block = {
            "n_head": n_head,
            "dropout": p_drop,
            "kq_norm": True,
        }
        self.diffusion_transformer = LocalTokenTransformer(
            c_token=c_token,
            c_tokenpair=c_z,
            c_s=c_s,
            diffusion_transformer_block=diffusion_transformer_block,
            n_block=n_diffusion_blocks,
        )
        decoder_upcast = {
            "method": "cross_attention",
            "n_split": 3,
            "cross_attention_block": cross_attention_block,
        }
        decoder_downcast = {
            "method": "cross_attention",
            "cross_attention_block": cross_attention_block,
        }
        self.decoder = CompactStreamingDecoder(
            c_atom=c_atom,
            c_atompair=c_atompair,
            c_token=c_token,
            c_s=c_s,
            c_tokenpair=c_z,
            atom_transformer_block=atom_transformer_block,
            upcast=decoder_upcast,
            downcast=decoder_downcast,
            n_blocks=n_atom_decoder_blocks,
        )

    def scale_positions_in(self, x_noisy: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        """Scale noisy coordinates to unit variance: x / sqrt(t^2 + sigma_data^2)."""
        if t.ndim == 1:
            t = t[..., None, None]
        elif t.ndim == 2:
            t = t[..., None]
        return x_noisy / torch.sqrt(t**2 + self.sigma_data**2)

    def scale_positions_out(
        self, r_update: torch.Tensor, x_noisy: torch.Tensor, t: torch.Tensor
    ) -> torch.Tensor:
        """Combine network output with the noisy input (EDM-style skip/out scaling)."""
        if t.ndim == 1:
            t = t[..., None, None]
        elif t.ndim == 2:
            t = t[..., None]
        sigma2 = self.sigma_data**2
        # c_skip * x_noisy + c_out * r_update with
        # c_skip = sigma_d^2/(sigma_d^2+t^2), c_out = sigma_d*t/sqrt(sigma_d^2+t^2).
        return (sigma2 / (sigma2 + t**2)) * x_noisy + (
            self.sigma_data * t / torch.sqrt(sigma2 + t**2)
        ) * r_update

    def process_time(self, t: torch.Tensor, idx: int) -> torch.Tensor:
        """Embed noise level t via Fourier features; zeroed where t == 0."""
        # Clamp avoids log(0); embedding input is 0.25*log(t/sigma_data).
        t_clamped = torch.clamp(t, min=1e-20)
        t_log = 0.25 * torch.log(t_clamped / self.sigma_data)
        emb = self.process_n[idx](self.fourier_embedding[idx](t_log))
        emb = emb * (t > 0).float()[..., None]
        return emb

    def compute_pair_features(self, xyz: torch.Tensor, c_atompair: int) -> torch.Tensor:
        """Inverse-squared-distance pair features, broadcast over c_atompair channels."""
        dist = torch.cdist(xyz, xyz)
        inv_dist = 1 / (1 + dist**2)
        return inv_dist.unsqueeze(-1).expand(-1, -1, -1, c_atompair)


class RFDiffusionTransformerModel(ModelMixin, ConfigMixin):
    """
    RFDiffusion3 transformer for protein structure prediction.

    This wraps the diffusion module to provide the full model interface.
    The state dict keys match the foundry checkpoint format.
""" config_name = "config.json" _supports_gradient_checkpointing = True @register_to_config def __init__( self, c_s: int = 384, c_z: int = 128, c_atom: int = 128, c_atompair: int = 16, c_token: int = 768, c_t_embed: int = 256, sigma_data: float = 16.0, n_pairformer_block: int = 2, n_diffusion_block: int = 18, n_atom_encoder_block: int = 3, n_atom_decoder_block: int = 3, n_head: int = 16, n_pairformer_head: int = 16, n_recycle: int = 2, p_drop: float = 0.0, ): super().__init__() self.token_initializer = TokenInitializer( c_s=c_s, c_z=c_z, c_atom=c_atom, c_atompair=c_atompair, ) self.diffusion_module = RFD3DiffusionModule( c_s=c_s, c_z=c_z, c_atom=c_atom, c_atompair=c_atompair, c_token=c_token, c_t_embed=c_t_embed, sigma_data=sigma_data, n_pairformer_blocks=n_pairformer_block, n_diffusion_blocks=n_diffusion_block, n_atom_encoder_blocks=n_atom_encoder_block, n_atom_decoder_blocks=n_atom_decoder_block, n_head=n_head, n_pairformer_head=n_pairformer_head, n_recycle=n_recycle, p_drop=p_drop, ) @property def sigma_data(self) -> float: return self.diffusion_module.sigma_data def forward( self, xyz_noisy: torch.Tensor, t: torch.Tensor, atom_to_token_map: Optional[torch.Tensor] = None, motif_mask: Optional[torch.Tensor] = None, s_init: Optional[torch.Tensor] = None, z_init: Optional[torch.Tensor] = None, n_recycle: Optional[int] = None, **kwargs, ) -> RFDiffusionTransformerOutput: """ Forward pass of the diffusion module. 
Args: xyz_noisy: Noisy atom coordinates [B, L, 3] t: Noise level / timestep [B] atom_to_token_map: Mapping from atoms to tokens [L] motif_mask: Mask for fixed motif atoms [L] s_init: Initial single representation [I, c_s] z_init: Initial pair representation [I, I, c_z] n_recycle: Number of recycling iterations Returns: RFDiffusionTransformerOutput with denoised coordinates """ dm = self.diffusion_module B, L, _ = xyz_noisy.shape if atom_to_token_map is None: atom_to_token_map = torch.arange(L, device=xyz_noisy.device) I = int(atom_to_token_map.max().item()) + 1 if motif_mask is None: motif_mask = torch.zeros(L, dtype=torch.bool, device=xyz_noisy.device) t_L = t[:, None].expand(B, L) * (~motif_mask).float() t_I = t[:, None].expand(B, I) r_scaled = dm.scale_positions_in(xyz_noisy, t) r_noisy = dm.scale_positions_in(xyz_noisy, t_L) if s_init is None or z_init is None: init_output = self.token_initializer({"num_tokens": I}) if s_init is None: s_init = init_output["S_I"] if z_init is None: z_init = init_output["Z_II"] assert s_init is not None and z_init is not None p = dm.compute_pair_features(r_scaled, self.config.c_atompair) a_I = dm.process_a(r_noisy, tok_idx=atom_to_token_map) s_init_expanded = s_init.unsqueeze(0).expand(B, -1, -1) if s_init.ndim == 2 else s_init s_I = dm.downcast_c(torch.zeros(B, L, self.config.c_atom, device=xyz_noisy.device), s_init_expanded, tok_idx=atom_to_token_map) q = dm.process_r(r_noisy) c = dm.process_time(t_L, idx=0) q = q + c s_I = s_I + dm.process_time(t_I, idx=1) c = c + dm.process_c(c) q = dm.encoder(q, c, p) a_I = dm.downcast_q(q, a_I, s_I, tok_idx=atom_to_token_map) if n_recycle is None: n_recycle = dm.n_recycle if not self.training else 1 n_recycle = max(1, n_recycle) z_II = z_init for _ in range(n_recycle): s_I, z_II = dm.diffusion_token_encoder(s_init=s_I, z_init=z_II) a_I = dm.diffusion_transformer(a_I, s_I, z_II) a_I, q, _ = dm.decoder(a_I, s_I, z_II, q, c, p, tok_idx=atom_to_token_map) r_update = dm.to_r_update(q) xyz_out = 
dm.scale_positions_out(r_update, xyz_noisy, t_L) sequence_logits, sequence_indices = dm.sequence_head(a_I) return RFDiffusionTransformerOutput( xyz=xyz_out, single=s_I, pair=z_II, sequence_logits=sequence_logits, sequence_indices=sequence_indices, )