dn6 (HF Staff) committed
Commit 7e6b4cd · verified · 1 Parent(s): 288a16c

Upload transformer/model.py with huggingface_hub

Files changed (1): transformer/model.py (+1183, -0)
transformer/model.py ADDED
@@ -0,0 +1,1183 @@
# Copyright 2025 Dhruv Nair. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
RFDiffusion3 Transformer model.

This module provides a diffusers-compatible implementation of the RFD3
architecture for protein structure prediction and generation. The module
structure matches the foundry checkpoint format for direct weight loading.
"""

import math
from dataclasses import dataclass
from functools import partial
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin


@dataclass
class RFDiffusionTransformerOutput:
    """Output class for RFDiffusion transformer."""

    xyz: torch.Tensor
    single: torch.Tensor
    pair: torch.Tensor
    sequence_logits: Optional[torch.Tensor] = None
    sequence_indices: Optional[torch.Tensor] = None


linearNoBias = partial(nn.Linear, bias=False)


class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        return x / rms * self.weight


class FourierEmbedding(nn.Module):
    """Fourier feature embedding for timesteps with learned weights."""

    def __init__(self, c: int):
        super().__init__()
        self.c = c
        self.register_buffer("w", torch.zeros(c, dtype=torch.float32))
        self.register_buffer("b", torch.zeros(c, dtype=torch.float32))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        nn.init.normal_(self.w)
        nn.init.normal_(self.b)

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        return torch.cos(2 * math.pi * (t[..., None] * self.w + self.b))


class LinearBiasInit(nn.Linear):
    """Linear layer with custom bias initialization."""

    def __init__(self, *args, biasinit: float = -2.0, **kwargs):
        self.biasinit = biasinit
        super().__init__(*args, **kwargs)

    def reset_parameters(self) -> None:
        super().reset_parameters()
        if self.bias is not None:
            self.bias.data.fill_(self.biasinit)


class RMSNormNoWeight(nn.Module):
    """RMSNorm without learnable weight (elementwise_affine=False)."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.dim = dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        return x / rms


class AdaLN(nn.Module):
    """Adaptive Layer Normalization."""

    def __init__(self, c_a: int, c_s: int):
        super().__init__()
        self.ln_a = RMSNormNoWeight(c_a)
        self.ln_s = RMSNorm(c_s)
        self.to_gain = nn.Sequential(nn.Linear(c_s, c_a), nn.Sigmoid())
        self.to_bias = linearNoBias(c_s, c_a)

    def forward(self, a: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
        a = self.ln_a(a)
        s = self.ln_s(s)
        return self.to_gain(s) * a + self.to_bias(s)
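
# Note: AdaLN above implements AF3-style adaptive conditioning,
#   AdaLN(a, s) = sigmoid(Linear(s)) * RMSNorm_noaffine(a) + LinearNoBias(s),
# i.e. the gain and bias of the normalization are predicted from the
# conditioning signal s rather than learned as free parameters.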

class Transition(nn.Module):
    """SwiGLU-style transition block matching foundry naming."""

    def __init__(self, c: int, n: int = 4):
        super().__init__()
        self.layer_norm_1 = RMSNorm(c)
        self.linear_1 = linearNoBias(c, n * c)
        self.linear_2 = linearNoBias(c, n * c)
        self.linear_3 = linearNoBias(n * c, c)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.layer_norm_1(x)
        return self.linear_3(F.silu(self.linear_1(x)) * self.linear_2(x))


class ConditionedTransitionBlock(nn.Module):
    """SwiGLU transition with adaptive layer norm conditioning."""

    def __init__(self, c_token: int, c_s: int, n: int = 2):
        super().__init__()
        self.ada_ln = AdaLN(c_a=c_token, c_s=c_s)
        self.linear_1 = linearNoBias(c_token, c_token * n)
        self.linear_2 = linearNoBias(c_token, c_token * n)
        self.linear_output_project = nn.Sequential(
            LinearBiasInit(c_s, c_token, biasinit=-2.0),
            nn.Sigmoid(),
        )
        self.linear_3 = linearNoBias(c_token * n, c_token)

    def forward(self, a: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
        a = self.ada_ln(a, s)
        b = F.silu(self.linear_1(a)) * self.linear_2(a)
        return self.linear_output_project(s) * self.linear_3(b)


class MultiDimLinear(nn.Linear):
    """Linear layer that reshapes output to a multi-dimensional shape."""

    def __init__(self, in_features: int, out_shape: Tuple[int, ...], norm: bool = False, **kwargs):
        self.out_shape = out_shape
        out_features = 1
        for d in out_shape:
            out_features *= d
        super().__init__(in_features, out_features, **kwargs)
        if norm:
            self.ln = RMSNorm(out_features)
            self.use_ln = True
        else:
            self.use_ln = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = super().forward(x)
        if self.use_ln:
            out = self.ln(out)
        return out.reshape(x.shape[:-1] + self.out_shape)


class AttentionPairBias(nn.Module):
    """Attention with pairwise bias for Pairformer."""

    def __init__(
        self,
        c_a: int,
        c_s: int,
        c_pair: int,
        n_head: int = 8,
        kq_norm: bool = False,
    ):
        super().__init__()
        self.n_head = n_head
        self.c_a = c_a
        self.c_pair = c_pair
        self.c = c_a // n_head

        self.to_q = MultiDimLinear(c_a, (n_head, self.c))
        self.to_k = MultiDimLinear(c_a, (n_head, self.c), bias=False, norm=True)
        self.to_v = MultiDimLinear(c_a, (n_head, self.c), bias=False, norm=True)
        self.to_b = linearNoBias(c_pair, n_head)
        self.to_g = nn.Sequential(
            MultiDimLinear(c_a, (n_head, self.c), bias=False),
            nn.Sigmoid(),
        )
        self.to_a = linearNoBias(c_a, c_a)
        self.ln_0 = RMSNorm(c_pair)
        self.ln_1 = RMSNorm(c_a)

    def forward(
        self,
        a: torch.Tensor,
        s: Optional[torch.Tensor],
        z: torch.Tensor,
        beta: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        a = self.ln_1(a)

        q = self.to_q(a)
        k = self.to_k(a)
        v = self.to_v(a)
        b = self.to_b(self.ln_0(z))
        if beta is not None:
            b = b + beta[..., None]
        g = self.to_g(a)

        q = q / math.sqrt(self.c)
        attn = torch.einsum("...ihd,...jhd->...ijh", q, k) + b
        attn = F.softmax(attn, dim=-2)
        out = torch.einsum("...ijh,...jhc->...ihc", attn, v)
        out = g * out
        out = out.flatten(start_dim=-2)
        out = self.to_a(out)

        return out


class LocalAttentionPairBias(nn.Module):
    """Local attention with pairwise bias for diffusion transformer blocks."""

    def __init__(
        self,
        c_a: int,
        c_s: int,
        c_pair: int,
        n_head: int = 16,
        kq_norm: bool = True,
    ):
        super().__init__()
        self.n_head = n_head
        self.c = c_a
        self.c_head = c_a // n_head
        self.c_s = c_s
        self.use_checkpointing = False

        self.to_q = linearNoBias(c_a, c_a)
        self.to_k = linearNoBias(c_a, c_a)
        self.to_v = linearNoBias(c_a, c_a)
        self.to_b = linearNoBias(c_pair, n_head)
        self.to_g = nn.Sequential(linearNoBias(c_a, c_a), nn.Sigmoid())
        self.to_o = linearNoBias(c_a, c_a)

        self.kq_norm = kq_norm
        if kq_norm:
            self.ln_q = RMSNorm(c_a)
            self.ln_k = RMSNorm(c_a)

        if c_s is not None and c_s > 0:
            self.ada_ln_1 = AdaLN(c_a=c_a, c_s=c_s)
            self.linear_output_project = nn.Sequential(
                LinearBiasInit(c_s, c_a, biasinit=-2.0),
                nn.Sigmoid(),
            )
        else:
            self.ln_1 = RMSNorm(c_a)

    def forward(
        self,
        a: torch.Tensor,
        s: Optional[torch.Tensor],
        z: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        if self.c_s is not None and self.c_s > 0:
            a = self.ada_ln_1(a, s)
        else:
            a = self.ln_1(a)

        q = self.to_q(a)
        k = self.to_k(a)
        v = self.to_v(a)
        g = self.to_g(a)

        if self.kq_norm:
            q = self.ln_q(q)
            k = self.ln_k(k)

        batch_dims = a.shape[:-2]
        L = a.shape[-2]

        q = q.view(*batch_dims, L, self.n_head, self.c_head).transpose(-2, -3)
        k = k.view(*batch_dims, L, self.n_head, self.c_head).transpose(-2, -3)
        v = v.view(*batch_dims, L, self.n_head, self.c_head).transpose(-2, -3)
        g = g.view(*batch_dims, L, self.n_head, self.c_head).transpose(-2, -3)

        b = self.to_b(z).permute(*range(len(batch_dims)), -1, -3, -2)

        attn = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.c_head)
        attn = attn + b
        attn = F.softmax(attn, dim=-1)

        out = torch.matmul(attn, v)
        out = out * g
        out = out.transpose(-2, -3).contiguous()
        out = out.view(*batch_dims, L, self.c)
        out = self.to_o(out)

        if self.c_s is not None and self.c_s > 0:
            out = self.linear_output_project(s) * out

        return out


class PairformerBlock(nn.Module):
    """Pairformer block with attention and transitions."""

    def __init__(
        self,
        c_s: int,
        c_z: int,
        attention_pair_bias: dict,
        n_transition: int = 4,
        p_drop: float = 0.1,
        **kwargs,
    ):
        super().__init__()
        self.z_transition = Transition(c=c_z, n=n_transition)

        if c_s > 0:
            self.s_transition = Transition(c=c_s, n=n_transition)
            self.attention_pair_bias = AttentionPairBias(
                c_a=c_s, c_s=0, c_pair=c_z, **attention_pair_bias
            )

    def forward(
        self,
        s: torch.Tensor,
        z: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        z = z + self.z_transition(z)

        if s is not None:
            beta = torch.tensor([0.0], device=z.device)
            s = s + self.attention_pair_bias(s, None, z, beta=beta)
            s = s + self.s_transition(s)

        return s, z


class StructureLocalAtomTransformerBlock(nn.Module):
    """Single block for atom/token transformer."""

    def __init__(
        self,
        c_atom: int,
        c_s: Optional[int],
        c_atompair: int,
        n_head: int = 4,
        dropout: float = 0.0,
        kq_norm: bool = True,
        **kwargs,
    ):
        super().__init__()
        self.c_s = c_s
        self.dropout = nn.Dropout(dropout)
        self.attention_pair_bias = LocalAttentionPairBias(
            c_a=c_atom, c_s=c_s, c_pair=c_atompair, n_head=n_head, kq_norm=kq_norm
        )
        if c_s is not None and c_s > 0:
            self.transition_block = ConditionedTransitionBlock(c_token=c_atom, c_s=c_s)
        else:
            self.transition_block = Transition(c=c_atom, n=4)

    def forward(
        self,
        q: torch.Tensor,
        c: Optional[torch.Tensor],
        p: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        q = q + self.dropout(self.attention_pair_bias(q, c, p, **kwargs))
        if self.c_s is not None and self.c_s > 0:
            q = q + self.transition_block(q, c)
        else:
            q = q + self.transition_block(q)
        return q


class GatedCrossAttention(nn.Module):
    """Gated cross attention for upcast/downcast."""

    def __init__(
        self,
        c_query: int,
        c_kv: int,
        c_model: int = 128,
        n_head: int = 4,
        kq_norm: bool = True,
        dropout: float = 0.0,
        **kwargs,
    ):
        super().__init__()
        self.n_head = n_head
        self.scale = 1 / math.sqrt(c_model // n_head)

        self.ln_q = RMSNorm(c_query)
        self.ln_kv = RMSNorm(c_kv)

        self.to_q = linearNoBias(c_query, c_model)
        self.to_k = linearNoBias(c_kv, c_model)
        self.to_v = linearNoBias(c_kv, c_model)
        self.to_g = nn.Sequential(linearNoBias(c_query, c_model), nn.Sigmoid())
        self.to_out = nn.Sequential(nn.Linear(c_model, c_query), nn.Dropout(dropout))

        self.kq_norm = kq_norm
        if kq_norm:
            self.k_norm = RMSNorm(c_model)
            self.q_norm = RMSNorm(c_model)

    def forward(
        self,
        q: torch.Tensor,
        kv: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        q_in = self.ln_q(q)
        kv = self.ln_kv(kv)

        q_proj = self.to_q(q_in)
        k = self.to_k(kv)
        v = self.to_v(kv)
        g = self.to_g(q_in)

        if self.kq_norm:
            k = self.k_norm(k)
            q_proj = self.q_norm(q_proj)

        B = q.shape[0]
        n_tok = q.shape[1] if q.ndim == 4 else 1
        L_q = q.shape[-2]
        L_kv = kv.shape[-2]
        c_head = q_proj.shape[-1] // self.n_head

        if q.ndim == 4:
            q_proj = q_proj.view(B, n_tok, L_q, self.n_head, c_head).permute(0, 3, 1, 2, 4)
            k = k.view(B, n_tok, L_kv, self.n_head, c_head).permute(0, 3, 1, 2, 4)
            v = v.view(B, n_tok, L_kv, self.n_head, c_head).permute(0, 3, 1, 2, 4)
            g = g.view(B, n_tok, L_q, self.n_head, c_head).permute(0, 3, 1, 2, 4)
        else:
            q_proj = q_proj.view(B, L_q, self.n_head, c_head).permute(0, 2, 1, 3)
            k = k.view(B, L_kv, self.n_head, c_head).permute(0, 2, 1, 3)
            v = v.view(B, L_kv, self.n_head, c_head).permute(0, 2, 1, 3)
            g = g.view(B, L_q, self.n_head, c_head).permute(0, 2, 1, 3)

        attn = torch.matmul(q_proj, k.transpose(-1, -2)) * self.scale
        if attn_mask is not None:
            if q.ndim == 4:
                while attn_mask.ndim < attn.ndim:
                    attn_mask = attn_mask.unsqueeze(0)
                if attn_mask.shape[1] != self.n_head and attn_mask.shape[1] != 1:
                    attn_mask = attn_mask.unsqueeze(1)
            else:
                attn_mask = attn_mask.unsqueeze(-3)
            attn = attn.masked_fill(~attn_mask, float("-inf"))
        attn = F.softmax(attn, dim=-1)

        out = torch.matmul(attn, v)
        out = out * g

        if q.ndim == 4:
            out = out.permute(0, 2, 3, 1, 4).contiguous()
            out = out.view(B, n_tok, L_q, -1)
        else:
            out = out.permute(0, 2, 1, 3).contiguous()
            out = out.view(B, L_q, -1)

        out = self.to_out(out)
        return out
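
# Shape contract for GatedCrossAttention (as implemented above): queries are
# [B, L_q, c_query] or grouped [B, n_tok, L_q, c_query]; keys/values follow
# the same pattern with L_kv. The optional boolean attn_mask (True = attend)
# is broadcast against the [..., n_head, L_q, L_kv] attention logits.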

class Upcast(nn.Module):
    """Upcast from token level to atom level."""

    def __init__(
        self,
        c_atom: int,
        c_token: int,
        method: str = "cross_attention",
        cross_attention_block: Optional[dict] = None,
        n_split: int = 6,
        **kwargs,
    ):
        super().__init__()
        self.method = method
        self.n_split = n_split
        if method == "broadcast":
            self.project = nn.Sequential(RMSNorm(c_token), linearNoBias(c_token, c_atom))
        elif method == "cross_attention":
            self.gca = GatedCrossAttention(
                c_query=c_atom,
                c_kv=c_token // n_split,
                c_model=c_atom,
                **(cross_attention_block or {}),
            )

    def forward(self, q: torch.Tensor, a: torch.Tensor, tok_idx: torch.Tensor) -> torch.Tensor:
        if self.method == "broadcast":
            q = q + self.project(a)[..., tok_idx, :]
        elif self.method == "cross_attention":
            B, L, C = q.shape
            I = int(tok_idx.max().item()) + 1

            a_split = a.view(B, I, self.n_split, -1)

            q_grouped = self._group_atoms(q, tok_idx, I)
            valid_mask = self._build_valid_mask(tok_idx, I, q.device)

            attn_mask = torch.ones(I, q_grouped.shape[2], self.n_split, device=q.device, dtype=torch.bool)
            attn_mask[~valid_mask] = False

            q_update = self.gca(q_grouped, a_split, attn_mask=attn_mask)
            q = q + self._ungroup_atoms(q_update, valid_mask, L)

        return q

    def _group_atoms(self, q: torch.Tensor, tok_idx: torch.Tensor, I: int) -> torch.Tensor:
        B, L, C = q.shape
        max_atoms_per_token = 14
        grouped = torch.zeros(B, I, max_atoms_per_token, C, device=q.device, dtype=q.dtype)
        counts = torch.zeros(I, dtype=torch.long, device=q.device)

        for i in range(L):
            t = tok_idx[i].item()
            if counts[t] < max_atoms_per_token:
                grouped[:, t, counts[t]] = q[:, i]
                counts[t] += 1

        return grouped

    def _build_valid_mask(self, tok_idx: torch.Tensor, I: int, device: torch.device) -> torch.Tensor:
        max_atoms_per_token = 14
        valid_mask = torch.zeros(I, max_atoms_per_token, dtype=torch.bool, device=device)
        counts = torch.zeros(I, dtype=torch.long, device=device)

        for i in range(len(tok_idx)):
            t = tok_idx[i].item()
            if counts[t] < max_atoms_per_token:
                valid_mask[t, counts[t]] = True
                counts[t] += 1

        return valid_mask

    def _ungroup_atoms(self, grouped: torch.Tensor, valid_mask: torch.Tensor, L: int) -> torch.Tensor:
        B, I, n_atoms, C = grouped.shape
        out = torch.zeros(B, L, C, device=grouped.device, dtype=grouped.dtype)

        idx = 0
        for t in range(I):
            for a in range(n_atoms):
                if valid_mask[t, a] and idx < L:
                    out[:, idx] = grouped[:, t, a]
                    idx += 1

        return out
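
# Note: the grouping helpers above assume atoms belonging to one token are
# contiguous (tok_idx non-decreasing) and cap occupancy at 14 atoms per token,
# which appears to follow the atom14 residue representation used by
# RoseTTAFold-family models; atoms beyond the cap are left out of the
# cross-attention update.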

class Downcast(nn.Module):
    """Downcast from atom level to token level."""

    def __init__(
        self,
        c_atom: int,
        c_token: int,
        c_s: Optional[int] = None,
        method: str = "mean",
        cross_attention_block: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__()
        self.method = method
        self.c_token = c_token
        self.c_atom = c_atom

        if c_s is not None:
            self.process_s = nn.Sequential(RMSNorm(c_s), linearNoBias(c_s, c_token))
        else:
            self.process_s = None

        if method == "mean":
            self.gca = linearNoBias(c_atom, c_token)
        elif method == "cross_attention":
            self.gca = GatedCrossAttention(
                c_query=c_token,
                c_kv=c_atom,
                c_model=c_token,
                **(cross_attention_block or {}),
            )

    def forward(
        self,
        q: torch.Tensor,
        a: Optional[torch.Tensor] = None,
        s: Optional[torch.Tensor] = None,
        tok_idx: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if q.ndim == 2:
            q = q.unsqueeze(0)
            squeeze = True
        else:
            squeeze = False

        B, L, _ = q.shape
        I = int(tok_idx.max().item()) + 1

        if self.method == "mean":
            projected = self.gca(q)
            a_update = torch.zeros(B, I, self.c_token, device=q.device, dtype=q.dtype)
            counts = torch.zeros(B, I, 1, device=q.device, dtype=q.dtype)
            for i in range(L):
                t = tok_idx[i]
                a_update[:, t] += projected[:, i]
                counts[:, t] += 1
            a_update = a_update / (counts + 1e-8)
        elif self.method == "cross_attention":
            if a is None:
                a = torch.zeros(B, I, self.c_token, device=q.device, dtype=q.dtype)
            elif a.ndim == 2:
                a = a.unsqueeze(0)

            q_grouped, valid_mask = self._group_atoms(q, tok_idx, I)
            attn_mask = valid_mask.unsqueeze(-2)
            a_update = self.gca(a.unsqueeze(-2), q_grouped, attn_mask=attn_mask).squeeze(-2)
        else:
            a_update = torch.zeros(B, I, self.c_token, device=q.device, dtype=q.dtype)

        if a is not None:
            if a.ndim == 2:
                a = a.unsqueeze(0)
            a = a + a_update
        else:
            a = a_update

        if self.process_s is not None and s is not None:
            if s.ndim == 2:
                s = s.unsqueeze(0)
            a = a + self.process_s(s)

        if squeeze:
            a = a.squeeze(0)

        return a

    def _group_atoms(
        self, q: torch.Tensor, tok_idx: torch.Tensor, I: int
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        B, L, C = q.shape
        max_atoms_per_token = 14
        grouped = torch.zeros(B, I, max_atoms_per_token, C, device=q.device, dtype=q.dtype)
        valid_mask = torch.zeros(I, max_atoms_per_token, dtype=torch.bool, device=q.device)
        counts = torch.zeros(I, dtype=torch.long, device=q.device)

        for i in range(L):
            t = tok_idx[i].item()
            if counts[t] < max_atoms_per_token:
                grouped[:, t, counts[t]] = q[:, i]
                valid_mask[t, counts[t]] = True
                counts[t] += 1

        return grouped, valid_mask


class LinearEmbedWithPool(nn.Module):
    """Linear embedding with pooling to token level."""

    def __init__(self, c_token: int):
        super().__init__()
        self.c_token = c_token
        self.linear = linearNoBias(3, c_token)

    def forward(self, r: torch.Tensor, tok_idx: torch.Tensor) -> torch.Tensor:
        B = r.shape[0]
        I = int(tok_idx.max().item()) + 1
        q = self.linear(r)

        a = torch.zeros(B, I, self.c_token, device=r.device, dtype=q.dtype)
        counts = torch.zeros(B, I, 1, device=r.device, dtype=q.dtype)

        for i in range(r.shape[1]):
            t = tok_idx[i]
            a[:, t] += q[:, i]
            counts[:, t] += 1

        return a / (counts + 1e-8)


class LinearSequenceHead(nn.Module):
    """Sequence prediction head."""

    def __init__(self, c_token: int):
        super().__init__()
        n_tok_all = 32
        mask = torch.ones(n_tok_all, dtype=torch.bool)
        self.register_buffer("valid_out_mask", mask)
        self.linear = nn.Linear(c_token, n_tok_all)

    def forward(self, a: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        logits = self.linear(a)
        probs = F.softmax(logits, dim=-1)
        probs = probs * self.valid_out_mask[None, None, :].to(probs.device)
        probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
        indices = probs.argmax(dim=-1)
        return logits, indices


class LocalAtomTransformer(nn.Module):
    """Atom-level transformer encoder."""

    def __init__(
        self,
        c_atom: int,
        c_s: Optional[int],
        c_atompair: int,
        atom_transformer_block: dict,
        n_blocks: int,
    ):
        super().__init__()
        self.blocks = nn.ModuleList([
            StructureLocalAtomTransformerBlock(
                c_atom=c_atom,
                c_s=c_s,
                c_atompair=c_atompair,
                **atom_transformer_block,
            )
            for _ in range(n_blocks)
        ])

    def forward(
        self,
        q: torch.Tensor,
        c: Optional[torch.Tensor],
        p: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        for block in self.blocks:
            q = block(q, c, p, **kwargs)
        return q


class LocalTokenTransformer(nn.Module):
    """Token-level transformer for diffusion."""

    def __init__(
        self,
        c_token: int,
        c_tokenpair: int,
        c_s: int,
        diffusion_transformer_block: dict,
        n_block: int,
        **kwargs,
    ):
        super().__init__()
        self.blocks = nn.ModuleList([
            StructureLocalAtomTransformerBlock(
                c_atom=c_token,
                c_s=c_s,
                c_atompair=c_tokenpair,
                **diffusion_transformer_block,
            )
            for _ in range(n_block)
        ])

    def forward(
        self,
        a: torch.Tensor,
        s: torch.Tensor,
        z: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        for block in self.blocks:
            a = block(a, s, z, **kwargs)
        return a


class CompactStreamingDecoder(nn.Module):
    """Decoder with upcast, atom transformer, and downcast."""

    def __init__(
        self,
        c_atom: int,
        c_atompair: int,
        c_token: int,
        c_s: int,
        c_tokenpair: int,
        atom_transformer_block: dict,
        upcast: dict,
        downcast: dict,
        n_blocks: int,
        **kwargs,
    ):
        super().__init__()
        self.n_blocks = n_blocks

        self.upcast = nn.ModuleList([
            Upcast(c_atom=c_atom, c_token=c_token, **upcast)
            for _ in range(n_blocks)
        ])
        self.atom_transformer = nn.ModuleList([
            StructureLocalAtomTransformerBlock(
                c_atom=c_atom,
                c_s=c_atom,
                c_atompair=c_atompair,
                **atom_transformer_block,
            )
            for _ in range(n_blocks)
        ])
        self.downcast = Downcast(c_atom=c_atom, c_token=c_token, c_s=c_s, **downcast)

    def forward(
        self,
        a: torch.Tensor,
        s: torch.Tensor,
        z: torch.Tensor,
        q: torch.Tensor,
        c: torch.Tensor,
        p: torch.Tensor,
        tok_idx: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor, dict]:
        for i in range(self.n_blocks):
            q = self.upcast[i](q, a, tok_idx=tok_idx)
            q = self.atom_transformer[i](q, c, p, **kwargs)

        a = self.downcast(q.detach(), a.detach(), s.detach(), tok_idx=tok_idx)

        return a, q, {}


class DiffusionTokenEncoder(nn.Module):
    """Token encoder with pairformer stack for diffusion."""

    def __init__(
        self,
        c_s: int,
        c_z: int,
        c_token: int,
        c_atompair: int,
        n_pairformer_blocks: int,
        pairformer_block: dict,
        **kwargs,
    ):
        super().__init__()

        self.transition_1 = nn.ModuleList([
            Transition(c=c_s, n=2),
            Transition(c=c_s, n=2),
        ])

        self.process_z = nn.Sequential(
            RMSNorm(c_z),
            linearNoBias(c_z, c_z),
        )

        self.transition_2 = nn.ModuleList([
            Transition(c=c_z, n=2),
            Transition(c=c_z, n=2),
        ])

        self.pairformer_stack = nn.ModuleList([
            PairformerBlock(c_s=c_s, c_z=c_z, **pairformer_block)
            for _ in range(n_pairformer_blocks)
        ])

    def forward(
        self,
        s_init: torch.Tensor,
        z_init: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        B = z_init.shape[0] if z_init.ndim == 4 else 1

        s = s_init
        for b in range(2):
            s = s + self.transition_1[b](s)

        z = z_init
        if z.ndim == 3:
            z = z.unsqueeze(0).expand(B, -1, -1, -1)

        z = self.process_z(z)

        for b in range(2):
            z = z + self.transition_2[b](z)

        for block in self.pairformer_stack:
            s, z = block(s, z)

        return s, z

class RFD3DiffusionModule(nn.Module):
    """
    RFD3 Diffusion Module matching foundry checkpoint structure.

    This module structure matches `model.diffusion_module.*` keys in the checkpoint.
    """

    def __init__(
        self,
        c_s: int = 384,
        c_z: int = 128,
        c_atom: int = 128,
        c_atompair: int = 16,
        c_token: int = 768,
        c_t_embed: int = 256,
        sigma_data: float = 16.0,
        n_pairformer_blocks: int = 2,
        n_diffusion_blocks: int = 18,
        n_atom_encoder_blocks: int = 3,
        n_atom_decoder_blocks: int = 3,
        n_head: int = 16,
        n_recycle: int = 2,
        p_drop: float = 0.0,
    ):
        super().__init__()

        self.sigma_data = sigma_data
        self.n_recycle = n_recycle

        self.process_r = linearNoBias(3, c_atom)
        self.to_r_update = nn.Sequential(RMSNorm(c_atom), linearNoBias(c_atom, 3))
        self.sequence_head = LinearSequenceHead(c_token)

        self.fourier_embedding = nn.ModuleList([
            FourierEmbedding(c_t_embed),
            FourierEmbedding(c_t_embed),
        ])
        self.process_n = nn.ModuleList([
            nn.Sequential(RMSNorm(c_t_embed), linearNoBias(c_t_embed, c_atom)),
            nn.Sequential(RMSNorm(c_t_embed), linearNoBias(c_t_embed, c_s)),
        ])

        self.downcast_c = Downcast(c_atom=c_atom, c_token=c_s, c_s=None, method="cross_attention")
        self.downcast_q = Downcast(c_atom=c_atom, c_token=c_token, c_s=c_s, method="cross_attention")
        self.process_a = LinearEmbedWithPool(c_token)
        self.process_c = nn.Sequential(RMSNorm(c_atom), linearNoBias(c_atom, c_atom))

        atom_transformer_block = {
            "n_head": 4,
            "dropout": p_drop,
            "kq_norm": True,
        }

        self.encoder = LocalAtomTransformer(
            c_atom=c_atom,
            c_s=c_atom,
            c_atompair=c_atompair,
            atom_transformer_block=atom_transformer_block,
            n_blocks=n_atom_encoder_blocks,
        )

        pairformer_block = {
            "attention_pair_bias": {"n_head": 4, "kq_norm": False},
            "n_transition": 4,
        }

        self.diffusion_token_encoder = DiffusionTokenEncoder(
            c_s=c_s,
            c_z=c_z,
            c_token=c_token,
            c_atompair=c_atompair,
            n_pairformer_blocks=n_pairformer_blocks,
            pairformer_block=pairformer_block,
        )

        diffusion_transformer_block = {
            "n_head": n_head,
            "dropout": p_drop,
            "kq_norm": True,
        }

        self.diffusion_transformer = LocalTokenTransformer(
            c_token=c_token,
            c_tokenpair=c_z,
            c_s=c_s,
            diffusion_transformer_block=diffusion_transformer_block,
            n_block=n_diffusion_blocks,
        )

        decoder_upcast = {"method": "cross_attention"}
        decoder_downcast = {"method": "cross_attention"}

        self.decoder = CompactStreamingDecoder(
            c_atom=c_atom,
            c_atompair=c_atompair,
            c_token=c_token,
            c_s=c_s,
            c_tokenpair=c_z,
            atom_transformer_block=atom_transformer_block,
            upcast=decoder_upcast,
            downcast=decoder_downcast,
            n_blocks=n_atom_decoder_blocks,
        )

    def scale_positions_in(self, x_noisy: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        if t.ndim == 1:
            t = t[..., None, None]
        elif t.ndim == 2:
            t = t[..., None]
        return x_noisy / torch.sqrt(t**2 + self.sigma_data**2)

    def scale_positions_out(
        self, r_update: torch.Tensor, x_noisy: torch.Tensor, t: torch.Tensor
    ) -> torch.Tensor:
        if t.ndim == 1:
            t = t[..., None, None]
        elif t.ndim == 2:
            t = t[..., None]
        sigma2 = self.sigma_data**2
        return (sigma2 / (sigma2 + t**2)) * x_noisy + (
            self.sigma_data * t / torch.sqrt(sigma2 + t**2)
        ) * r_update
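
    # The two scaling helpers above implement EDM-style preconditioning
    # (Karras et al., 2022): the network sees c_in(t) * x_noisy with
    # c_in = 1 / sqrt(t^2 + sigma_data^2), and the denoised output is
    # recombined as c_skip * x_noisy + c_out * r_update, where
    # c_skip = sigma_data^2 / (sigma_data^2 + t^2) and
    # c_out = sigma_data * t / sqrt(sigma_data^2 + t^2).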

    def process_time(self, t: torch.Tensor, idx: int) -> torch.Tensor:
        t_clamped = torch.clamp(t, min=1e-20)
        t_log = 0.25 * torch.log(t_clamped / self.sigma_data)
        emb = self.process_n[idx](self.fourier_embedding[idx](t_log))
        emb = emb * (t > 0).float()[..., None]
        return emb

    def compute_pair_features(self, xyz: torch.Tensor, c_atompair: int) -> torch.Tensor:
        dist = torch.cdist(xyz, xyz)
        inv_dist = 1 / (1 + dist**2)
        return inv_dist.unsqueeze(-1).expand(-1, -1, -1, c_atompair)


class RFDiffusionTransformerModel(ModelMixin, ConfigMixin):
    """
    RFDiffusion3 transformer for protein structure prediction.

    This wraps the diffusion module to provide the full model interface.
    The state dict keys match the foundry checkpoint format.
    """

    config_name = "config.json"
    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        c_s: int = 384,
        c_z: int = 128,
        c_atom: int = 128,
        c_atompair: int = 16,
        c_token: int = 768,
        c_t_embed: int = 256,
        sigma_data: float = 16.0,
        n_pairformer_block: int = 2,
        n_diffusion_block: int = 18,
        n_atom_encoder_block: int = 3,
        n_atom_decoder_block: int = 3,
        n_head: int = 16,
        n_recycle: int = 2,
        p_drop: float = 0.0,
    ):
        super().__init__()

        self.diffusion_module = RFD3DiffusionModule(
            c_s=c_s,
            c_z=c_z,
            c_atom=c_atom,
            c_atompair=c_atompair,
            c_token=c_token,
            c_t_embed=c_t_embed,
            sigma_data=sigma_data,
            n_pairformer_blocks=n_pairformer_block,
            n_diffusion_blocks=n_diffusion_block,
            n_atom_encoder_blocks=n_atom_encoder_block,
            n_atom_decoder_blocks=n_atom_decoder_block,
            n_head=n_head,
            n_recycle=n_recycle,
            p_drop=p_drop,
        )

        self.s_init = nn.Parameter(torch.zeros(1, 1, c_s))
        self.z_init = nn.Parameter(torch.zeros(1, 1, 1, c_z))

    @property
    def sigma_data(self) -> float:
        return self.diffusion_module.sigma_data

    def forward(
        self,
        xyz_noisy: torch.Tensor,
        t: torch.Tensor,
        atom_to_token_map: Optional[torch.Tensor] = None,
        motif_mask: Optional[torch.Tensor] = None,
        s_init: Optional[torch.Tensor] = None,
        z_init: Optional[torch.Tensor] = None,
        n_recycle: Optional[int] = None,
        **kwargs,
    ) -> RFDiffusionTransformerOutput:
        """
        Forward pass of the diffusion module.

        Args:
            xyz_noisy: Noisy atom coordinates [B, L, 3]
            t: Noise level / timestep [B]
            atom_to_token_map: Mapping from atoms to tokens [L]
            motif_mask: Mask for fixed motif atoms [L]
            s_init: Initial single representation [I, c_s]
            z_init: Initial pair representation [I, I, c_z]
            n_recycle: Number of recycling iterations

        Returns:
            RFDiffusionTransformerOutput with denoised coordinates
        """
        dm = self.diffusion_module
        B, L, _ = xyz_noisy.shape

        if atom_to_token_map is None:
            atom_to_token_map = torch.arange(L, device=xyz_noisy.device)
        I = int(atom_to_token_map.max().item()) + 1

        if motif_mask is None:
            motif_mask = torch.zeros(L, dtype=torch.bool, device=xyz_noisy.device)

        t_L = t[:, None].expand(B, L) * (~motif_mask).float()
        t_I = t[:, None].expand(B, I)

        r_scaled = dm.scale_positions_in(xyz_noisy, t)
        r_noisy = dm.scale_positions_in(xyz_noisy, t_L)

        if s_init is None:
            s_init = self.s_init.squeeze(0).expand(I, -1)
        if z_init is None:
            z_init = self.z_init.squeeze(0).expand(I, I, -1)

        p = dm.compute_pair_features(r_scaled, self.config.c_atompair)

        a_I = dm.process_a(r_noisy, tok_idx=atom_to_token_map)
        s_I = dm.downcast_c(
            torch.zeros(B, L, self.config.c_atom, device=xyz_noisy.device),
            s_init.unsqueeze(0).expand(B, -1, -1) if s_init.ndim == 2 else s_init,
            tok_idx=atom_to_token_map,
        )

        q = dm.process_r(r_noisy)
        c = dm.process_time(t_L, idx=0)
        q = q + c
        s_I = s_I + dm.process_time(t_I, idx=1)
        c = c + dm.process_c(c)

        q = dm.encoder(q, c, p)
        a_I = dm.downcast_q(q, a_I, s_I, tok_idx=atom_to_token_map)

        if n_recycle is None:
            n_recycle = dm.n_recycle if not self.training else 1

        for _ in range(n_recycle):
            s_I, z_II = dm.diffusion_token_encoder(s_init=s_I, z_init=z_init)
            a_I = dm.diffusion_transformer(a_I, s_I, z_II)

        a_I, q, _ = dm.decoder(a_I, s_I, z_II, q, c, p, tok_idx=atom_to_token_map)

        r_update = dm.to_r_update(q)
        xyz_out = dm.scale_positions_out(r_update, xyz_noisy, t_L)

        sequence_logits, sequence_indices = dm.sequence_head(a_I)

        return RFDiffusionTransformerOutput(
            xyz=xyz_out,
            single=s_I,
            pair=z_II,
            sequence_logits=sequence_logits,
            sequence_indices=sequence_indices,
        )
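

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (illustrative, not part of the checkpoint): builds
# a deliberately tiny model and runs one denoising forward pass on random
# coordinates. The channel sizes below are assumptions chosen for speed; the
# released weights use the defaults defined above.
if __name__ == "__main__":
    model = RFDiffusionTransformerModel(
        c_s=32, c_z=16, c_atom=16, c_atompair=16, c_token=48, c_t_embed=32,
        n_pairformer_block=1, n_diffusion_block=1,
        n_atom_encoder_block=1, n_atom_decoder_block=1, n_head=4,
    )
    model.eval()

    B, L = 1, 28  # two tokens of 14 atoms each
    xyz_noisy = torch.randn(B, L, 3) * model.sigma_data  # noise at sigma_data scale
    t = torch.full((B,), 16.0)  # one noise level per batch element
    atom_to_token_map = torch.arange(L) // 14  # contiguous atom -> token mapping

    with torch.no_grad():
        out = model(xyz_noisy, t, atom_to_token_map=atom_to_token_map)

    # Denoised coords [B, L, 3], single [B, I, c_s], pair [B, I, I, c_z].
    print(out.xyz.shape, out.single.shape, out.pair.shape)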