ChrisMcCormick committed on
Commit cd2099c · verified · 1 Parent(s): c732da1

Adding source code

layers/__init__.py ADDED
@@ -0,0 +1,20 @@
+ # -*- coding: utf-8 -*-
+
+ """
+ Subspace Decoder Layers
+
+ This module contains the layer implementations for the Shared Subspace Decoder,
+ including Multi-Head Latent Attention (MLA) and decomposed MLP layers.
+ """
+
+ # Import the main layer classes
+ from .mla import MultiheadLatentAttention, RotaryEmbedding
+ from .feedforward import SubspaceFeedForward
+ from .task_heads import SharedSpaceDecoderForCausalLM
+
+ __all__ = [
+     "MultiheadLatentAttention",
+     "RotaryEmbedding",
+     "SubspaceFeedForward",
+     "SharedSpaceDecoderForCausalLM",
+ ]
layers/feedforward.py ADDED
@@ -0,0 +1,196 @@
+ """# ▂▂▂▂▂▂▂▂▂▂▂▂
+
+ # `feedforward.py`
+
+ Regarding dropout:
+
+ - I don't see it applied to the MoE in DeepSeek-V3, [here](https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py).
+
+ - I don't see it applied in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L140)
+
+ Norms:
+
+ * nn.RMSNorm [here](https://docs.pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html)
+
+ ## FFN
+ """
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from ..models.shared_space_config import SharedSpaceDecoderConfig
+
+
+ def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+     """
+     Create a normalization layer based on the config norm_type.
+
+     Args:
+         hidden_size: The dimension to normalize over
+         config: Configuration containing norm_type and epsilon values
+
+     Returns:
+         Either a LayerNorm or RMSNorm layer
+     """
+     if config.norm_type == "layernorm":
+         return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+     elif config.norm_type == "rmsnorm":
+         return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+     else:
+         # This should be caught by config validation, but being defensive
+         raise ValueError(f"Unknown norm_type: {config.norm_type}")
+
+
+ # TODO - Find a shared place to put this.
+ class DeepseekV3RMSNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-6):
+         """
+         DeepseekV3RMSNorm is equivalent to T5LayerNorm
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward(self, hidden_states):
+         input_dtype = hidden_states.dtype
+         hidden_states = hidden_states.to(torch.float32)
+         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+         return self.weight * hidden_states.to(input_dtype)
+
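+
+ # Quick numeric check of the RMSNorm math above (comment only): for
+ # x = [3.0, 4.0] and weight = 1, mean(x^2) = 12.5, so
+ # y = x / sqrt(12.5 + eps) ≈ [0.8485, 1.1314].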
+ class SubspaceFeedForward(nn.Module):
+     """
+     Feed-forward block for SharedSpaceDecoder.
+
+     Implements SwiGLU:
+         FFN(x) = W_out( SiLU(W_in(x)) ⊙ W_gate(x) )
+
+     Supports both dense and decomposed MLP variants.
+
+     Dense:
+       - W_in: Linear(hidden_dim → intermediate_dim)
+       - W_gate: Linear(hidden_dim → intermediate_dim)
+       - W_out: Linear(intermediate_dim → hidden_dim)
+
+     Decomposed:
+       - W_in_shared: Linear(hidden_dim → rank, bias=False)
+       - W_in_shared_norm: RMSNorm
+       - W_in: Linear(rank → intermediate_dim)
+       - W_gate_shared: Linear(hidden_dim → rank, bias=False)
+       - W_gate_shared_norm: RMSNorm
+       - W_gate: Linear(rank → intermediate_dim)
+       - W_out: Linear(intermediate_dim → rank, bias=False)
+       - W_out_shared: Linear(rank → hidden_dim)
+
+     The residual connection and post-norm are handled by the surrounding
+     decoder layer, not inside this block, and dropout is omitted (see the
+     notes in the module docstring).
+     """
+
+     def __init__(self, config, layer_idx):
+         super().__init__()
+
+         #dropout_prob = config.hidden_dropout_prob  # TODO - Style -- don't define variables if only used once.
+
+         # Determine whether this is a dense or decomposed layer.
+         # It's dense if either:
+         #   - ffn_decompose is disabled (no decomposed layers at all), or
+         #   - ffn_decompose is enabled, but this is one of the early dense layers.
+         self.is_dense = (not config.ffn_decompose) or (layer_idx < config.num_dense_layers)
+
+         hidden_dim = config.hidden_size
+         intermediate_dim = config.intermediate_size  # TODO - Find something shorter, and use the same name.
+
+         # If it's one of the dense layers,
+         if self.is_dense:
+             # === Dense FFN Projections ===
+             self.W_in = nn.Linear(hidden_dim, intermediate_dim)
+             self.W_gate = nn.Linear(hidden_dim, intermediate_dim)
+             self.W_out = nn.Linear(intermediate_dim, hidden_dim)
+
+         # Define weights for the decomposed version.
+         else:
+             rank = config.ffn_rank
+
+             print("hidden_dim:", hidden_dim)
+             print("rank:", rank)
+
+             # === Input Projections ===
+             self.W_in_shared = nn.Linear(hidden_dim, rank, bias=False)
+             self.W_in_shared_norm = create_norm_layer(rank, config)
+             self.W_in = nn.Linear(rank, intermediate_dim, bias=True)
+
+             # === Gate Projections ===
+             self.W_gate_shared = nn.Linear(hidden_dim, rank, bias=False)
+             self.W_gate_shared_norm = create_norm_layer(rank, config)
+             self.W_gate = nn.Linear(rank, intermediate_dim, bias=True)
+
+             # === Output Projection ===
+             self.W_out = nn.Linear(intermediate_dim, rank, bias=False)
+             # TODO - Could experiment with this:
+             #self.W_out_shared_norm = DeepseekV3RMSNorm(rank, eps=config.rms_norm_eps)
+             self.W_out_shared = nn.Linear(rank, hidden_dim, bias=True)
+
+         # See notes on dropout in the module docstring.
+         #self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # === Tensor Dimension Symbols ===
+         # B:    batch_size        — number of samples in the batch
+         # T:    seq_len           — number of tokens per sample
+         # D:    hidden_dim        — model embedding size
+         # R:    ffn_rank          — latent shared subspace dimension
+         # D_ff: intermediate_size — FFN hidden dimension
+
+         # =========================
+         #    Gated Feedforward
+         # =========================
+
+         if self.is_dense:
+             # =============
+             #     Dense
+             # =============
+
+             # Input:  x      [B, T, D]
+             # Output: x_proj [B, T, D_ff]
+             x_proj = self.W_in(x)
+
+             # Output: gate [B, T, D_ff]
+             gate = self.W_gate(x)
+
+             # SwiGLU nonlinearity
+             x = F.silu(x_proj) * gate  # [B, T, D_ff]
+
+             # See notes on dropout
+             #x = self.dropout(x)
+
+             # Output: x [B, T, D]
+             x = self.W_out(x)
+
+         else:
+             # ==================
+             #     Decomposed
+             # ==================
+
+             # Input:  x      [B, T, D]
+             # Output: x_proj [B, T, D_ff]
+             x_proj = self.W_in(self.W_in_shared_norm(self.W_in_shared(x)))
+
+             # Input:  x    [B, T, D]
+             # Output: gate [B, T, D_ff]
+             gate = self.W_gate(self.W_gate_shared_norm(self.W_gate_shared(x)))
+
+             # SwiGLU nonlinearity
+             x = F.silu(x_proj) * gate  # [B, T, D_ff]
+
+             # See notes on dropout
+             #x = self.dropout(x)
+
+             # Output: x [B, T, D]
+             x = self.W_out_shared(self.W_out(x))
+
+         return x
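+
+
+ if __name__ == "__main__":
+     # Minimal shape smoke test -- an illustrative sketch, not part of the
+     # original module. It uses a stand-in config with just the fields read
+     # above; the real SharedSpaceDecoderConfig is assumed to expose the same
+     # attributes. (Because of the relative import at the top, run this as a
+     # module from the package root, e.g. `python -m <package>.layers.feedforward`.)
+     from types import SimpleNamespace
+
+     cfg = SimpleNamespace(
+         ffn_decompose=True, num_dense_layers=1,
+         hidden_size=64, intermediate_size=256, ffn_rank=16,
+         norm_type="rmsnorm", rms_norm_eps=1e-6, layer_norm_eps=1e-12,
+     )
+
+     x = torch.randn(2, 8, cfg.hidden_size)              # [B, T, D]
+     dense_ffn = SubspaceFeedForward(cfg, layer_idx=0)   # layer 0 < num_dense_layers -> dense
+     decomp_ffn = SubspaceFeedForward(cfg, layer_idx=2)  # past the dense layers -> decomposed
+     assert dense_ffn(x).shape == x.shape
+     assert decomp_ffn(x).shape == x.shape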
layers/gla.py ADDED
@@ -0,0 +1,721 @@
+ """# ▂▂▂▂▂▂▂▂▂▂▂▂
+
+ # `gla.py`
+
+ Based on: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+
+ """
+
+ import math
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ # Relative import, matching the sibling modules (`feedforward.py`, `mla.py`).
+ from ..models.shared_space_config import SharedSpaceDecoderConfig
+
+
+ def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+     """
+     Create a normalization layer based on the config norm_type.
+
+     If `hidden_size` is `None`, this returns an identity layer.
+
+     Args:
+         hidden_size: The dimension to normalize over
+         config: Configuration containing norm_type and epsilon values
+
+     Returns:
+         Either a LayerNorm or RMSNorm layer
+     """
+     if hidden_size is None:
+         return nn.Identity()
+     elif config.norm_type == "layernorm":
+         return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+     elif config.norm_type == "rmsnorm":
+         return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+     else:
+         # This should be caught by config validation, but being defensive
+         raise ValueError(f"Unknown norm_type: {config.norm_type}")
+
+
+ # TODO - Find a shared place to put this.
+ class DeepseekV3RMSNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-6):
+         """
+         DeepseekV3RMSNorm is equivalent to T5LayerNorm
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward(self, hidden_states):
+         input_dtype = hidden_states.dtype
+         hidden_states = hidden_states.to(torch.float32)
+         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+         return self.weight * hidden_states.to(input_dtype)
+
+
+ # Helper used twice during RoPE (once for the queries, once for the keys);
+ # the step-by-step explanation lives in the comments at the call site.
+ def rotate_half(x):
+     """Rotates half the hidden dims of the input."""
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return torch.cat((-x2, x1), dim=-1)
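+
+ # Example (comment only): for x = [1, 2, 3, 4], the halves are [1, 2] and
+ # [3, 4], so rotate_half(x) = [-3, -4, 1, 2]. Combined with the cos/sin tables
+ # below, this applies a 2-D rotation to each (x_i, x_{i + d/2}) coordinate pair.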
+
+ class RotaryEmbedding(nn.Module):
+     """Precompute RoPE embeddings and store them as buffers."""
+
+     def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+         super().__init__()
+
+         dim = config.rope_dims
+         seq_len = config.max_position_embeddings
+
+         # ------------------------------
+         #  Compute inverse frequencies
+         # ------------------------------
+         # Shape: [dim // 2]
+         # inv_freq[i] = 1 / (theta^(2i / dim))
+         inv_freq = 1.0 / (
+             config.rope_theta
+             ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
+         )
+
+         # ------------------------------
+         #  Apply RoPE scaling if configured
+         # ------------------------------
+         if config.rope_scaling is not None:
+             scaling_type = config.rope_scaling.get("type", "linear")
+             scaling_factor = config.rope_scaling.get("factor", 1.0)
+
+             if scaling_type == "linear":
+                 # Linear scaling: divide frequencies by the scaling factor
+                 inv_freq = inv_freq / scaling_factor
+             elif scaling_type == "dynamic":
+                 # Dynamic scaling: adjust based on sequence length.
+                 # This is a simplified implementation.
+                 inv_freq = inv_freq / scaling_factor
+             else:
+                 print(f"Warning: Unknown RoPE scaling type '{scaling_type}', using linear scaling")
+                 inv_freq = inv_freq / scaling_factor
+
+         # ------------------------------
+         #  Compute position indices
+         # ------------------------------
+         # Shape: [seq_len]
+         t = torch.arange(seq_len, dtype=torch.float32)
+
+         # ------------------------------
+         #  Outer product: [seq_len, dim // 2]
+         #  Each row i contains: t[i] * inv_freq
+         # ------------------------------
+         freqs = torch.outer(t, inv_freq)
+
+         # ------------------------------
+         #  Duplicate the angles: [seq_len, dim]
+         #  Note: this is the block layout [θ_0..θ_{d/2-1}, θ_0..θ_{d/2-1}]
+         #  that pairs with `rotate_half` above, not the interleaved
+         #  [sin_0, cos_0, sin_1, cos_1, ...] layout.
+         # ------------------------------
+         emb = torch.cat((freqs, freqs), dim=-1)
+
+         # ------------------------------
+         #  Register cos/sin as buffers
+         #  - Stored in float32
+         #  - Will be moved to correct device/dtype via model.to(...)
+         #  - Not saved with state_dict (persistent=False)
+         # ------------------------------
+         self.register_buffer("cos", emb.cos(), persistent=False)
+         self.register_buffer("sin", emb.sin(), persistent=False)
+
+     def forward(self, position_ids: torch.LongTensor) -> tuple[torch.Tensor, torch.Tensor]:
+         """Kept for API compatibility; the model reads the `cos`/`sin` buffers directly."""
+         return self.cos, self.sin
+
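+ # Why the (cos, sin) tables give relative positions (comment sketch): rotating
+ # the pair (x1, x2) at position p by angle p*θ means the dot product between a
+ # query rotated by i*θ and a key rotated by j*θ depends only on the offset
+ # (i - j), since R(i·θ)ᵀ R(j·θ) = R((j - i)·θ).
+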
+ """## GLA"""
+
+ class GroupedLatentAttention(nn.Module):
+     """
+     This version of Multihead Latent Attention applies the re-ordering trick from DeepSeekV3.
+     Instead of comparing the queries and keys in the query-key space, we compare them in the
+     kv-shared space.
+
+     For clarity, I've re-interpreted the naming of the heads, and am framing it as MQA.
+     What were previously labeled the query and key heads are now treated as a low-rank
+     decomposition of the query heads.
+     What we considered the "shared key/value space" is now a single key head that is also used
+     as the value head.
+     Finally, what we previously labeled the value and output heads are now treated as a
+     low-rank decomposition of the output heads.
+
+     This interpretation / implementation is designed to leverage the performance benefits of
+     GQA. The trade-off is that the query-key matching space is now larger--it will require a
+     greater number of calculations to match the queries to the keys. The hope is that the
+     memory bandwidth savings will outweigh the increased computational cost.
+
+     The same applies to the value-output space.
+
+     Note that, although the query-key and value-output spaces are now large, the low-rank
+     decomposition of the query heads and output heads ensures that the heads are still
+     effectively low rank / not over-parameterized.
+
+     Finally, note that this implementation also supports the optional use of shared spaces on
+     the query and output sides.
+
+     I've named the class "GroupedLatentAttention" because I may expand it to support multiple
+     key/value heads (i.e., multiple groups of query heads) in the future.
+
+     ==== Adding RoPE to VO ====
+
+     ### **Attempt**
+
+     We're extending Rotary Position Embeddings (RoPE) beyond the query-key interaction to the
+     **value-output path** in Multihead Latent Attention (MLA).
+
+     * In DeepSeek-V3's MLA framing, the same **full-rank key/value head** provides both the
+       keys (for patterns) and the values (for messages).
+     * Queries and output heads are low-rank bottlenecks, effectively serving as vocabularies
+       of **pattern directions** (Q) and **message directions** (O).
+     * Standard RoPE only modulates the Q-K dot product. Our attempt is to also apply RoPE
+       phases consistently in the V-O pathway, so that **positional dependence is preserved in
+       both the matching (QK) and messaging (VO) sides**.
+
+     ---
+
+     ### **Hypothesis**
+
+     If we rotate value vectors by their **source position phase** and then apply the
+     **inverse rotation at the destination** before output projection, the model gains a clean
+     **relative-position equivariance** in the message path, mirroring the property RoPE
+     provides for queries and keys.
+
+     This should:
+
+     1. Make the 1-to-1 correspondence between "pattern templates" (Q) and "message templates"
+        (O) more consistent.
+     2. Reduce the burden on output heads to learn ad-hoc positional compensation.
+     3. Improve long-context generalization, since both attention matching *and* message
+        passing would share the same relative-position geometry.
+     """
+
+     def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int):
+         super().__init__()
+
+         self.config = config
+
+         # Used to determine if this layer is dense or uses latents.
+         self.layer_idx = layer_idx
+         self.attention_dropout_prob = config.attention_dropout_prob
+
+         self.num_heads = config.num_attention_heads
+
+         self.rope_theta = config.rope_theta
+         self.rope_dims = config.rope_dims
+         self.nope_dims = config.nope_dims
+
+         self.q_shared_dim = config.q_shared_dim
+         # What was previously considered the key/value shared dimension is now
+         # the size of the MQA-style single key/value head.
+         self.kv_head_dim = config.kv_shared_dim
+         self.o_shared_dim = config.o_shared_dim
+
+         # What was previously the query/key head size is now the size of
+         # the query head decomposition.
+         self.q_inner_dim = config.qk_private_dim
+
+         # What was previously the value/output head size is now the size of
+         # the output head decomposition.
+         self.o_inner_dim = config.vo_private_dim
+
+         self.hidden_size = config.hidden_size
+
+         # =========================
+         #    Input Projections
+         # =========================
+
+         # If this is one of the dense layers,
+         if self.layer_idx < config.num_dense_layers:
+
+             # =========================
+             #     Dense Attention
+             # =========================
+
+             # No latent projections.
+             self.latent_spaces = False
+
+             # Define the standard QKV projection.
+             # (Note: read the head sizes from the config here -- this class
+             # stores them under different names, q_inner_dim / o_inner_dim.)
+             self.qkv_proj = nn.Linear(
+                 config.hidden_size,
+                 self.num_heads * (config.qk_private_dim * 2 + config.vo_private_dim),
+                 bias=config.attention_bias,
+             )
+
+             # Dense output projection
+             self.o_proj = nn.Linear(
+                 self.num_heads * config.vo_private_dim,
+                 config.hidden_size,
+                 bias=config.attention_bias,
+             )
+
+         # If we're past the dense layers,
+         else:
+
+             # =========================
+             #     Latent Attention
+             # =========================
+
+             # Use latent projections.
+             self.latent_spaces = True
+
+             # Input latent projections
+
+             print("config.q_shared_dim", config.q_shared_dim)
+
+             # ==========================
+             #    Shared Query Space
+             # ==========================
+
+             # If we're using a shared query subspace,
+             if config.q_shared_dim is not None:
+                 # Set a flag that we'll check in `forward`.
+                 self.query_shared = True
+
+                 self.q_shared_proj = nn.Linear(
+                     config.hidden_size,
+                     self.q_shared_dim,
+                     bias=config.attention_bias,
+                 )
+
+                 self.q_shared_norm = create_norm_layer(self.q_shared_dim, config)
+
+             else:
+                 print("Using identity for shared projection.")
+                 # Set a flag that we'll check in `forward`.
+                 self.query_shared = False
+
+                 self.q_shared_dim = config.hidden_size
+
+                 #print("Updated self.q_shared_dim to", self.q_shared_dim)
+
+                 # Use identity.
+                 self.q_shared_proj = nn.Identity()
+                 self.q_shared_norm = nn.Identity()
+
+             # ==========================
+             #    Shared Output Space
+             # ==========================
+
+             # If we're using a shared output space,
+             if config.o_shared_dim is not None:
+                 # Set a flag that we'll check in `forward`.
+                 self.output_shared = True
+
+                 # Shared output projection.
+                 # The per-head outputs are combined (summed across heads) in the
+                 # latent space by `o_priv_b_proj`, then their combined output (a
+                 # single vector per token) is projected back to model space via
+                 # `o_shared_proj`.
+                 self.o_shared_proj = nn.Linear(
+                     self.o_shared_dim,
+                     self.hidden_size,
+                     bias=config.attention_bias
+                 )
+
+                 self.o_shared_norm = create_norm_layer(self.o_shared_dim, config)
+
+             else:
+                 # Set a flag that we'll check in `forward`.
+                 self.output_shared = False
+                 self.o_shared_dim = config.hidden_size
+
+                 # Use identity.
+                 self.o_shared_proj = nn.Identity()
+                 self.o_shared_norm = nn.Identity()
+
+             # ================================
+             #    Decomposed Query Heads
+             # ================================
+
+             # Query down projections.
+             # The query head inner dimension makes the head low rank, as usual.
+             self.q_priv_a_proj = nn.Linear(
+                 self.q_shared_dim,
+                 self.num_heads * self.q_inner_dim,
+                 bias=False
+             )
+
+             # Query up projections.
+             # We project back to the larger key/value space.
+             # Rather than create a Linear and break it apart, we can create the
+             # desired shapes directly:
+             # per-head Dq_c -> Dkv (stored as [H, Dq_c, Dkv])
+             self.q_priv_b_weight = nn.Parameter(
+                 torch.empty(self.num_heads, self.q_inner_dim, self.kv_head_dim)
+             )
+             nn.init.kaiming_uniform_(self.q_priv_b_weight, a=math.sqrt(5))
+
+             # ====================================
+             #    Single Joint Key/Value Head
+             # ====================================
+
+             # The single joint key/value head.
+             self.kv_priv_proj = nn.Linear(
+                 self.hidden_size,
+                 self.kv_head_dim,
+                 bias=False,
+             )
+
+             self.kv_priv_norm = create_norm_layer(self.kv_head_dim, config)
+
+             # ================================
+             #    Decomposed Output Heads
+             # ================================
+
+             # Down: values [B, H, T, Dkv] -> per-head Do_c using weights [H, Dkv, Do_c]
+             self.o_priv_a_weight = nn.Parameter(
+                 torch.empty(self.num_heads, self.kv_head_dim, self.o_inner_dim)
+             )
+             nn.init.kaiming_uniform_(self.o_priv_a_weight, a=math.sqrt(5))
+
+             # Output up projections.
+
+             # We project back to the larger output subspace (or the model space,
+             # if no subspace is used).
+             self.o_priv_b_proj = nn.Linear(
+                 self.num_heads * self.o_inner_dim,
+                 self.o_shared_dim,
+                 bias=False
+             )
+
+         # Let SDPA choose 1/sqrt(E). If you want it explicit: self.kv_head_dim ** -0.5
+         self.softmax_scale = None
+
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         position_embeddings: tuple[torch.Tensor, torch.Tensor],
+         attention_mask: Optional[torch.Tensor],
+         #past_key_value: Optional[Cache] = None,             # TODO - Can I remove this?
+         #cache_position: Optional[torch.LongTensor] = None,  # TODO - Can I remove this?
+         **kwargs,
+     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+         # === Tensor Dimension Symbols ===
+         # B:    batch_size   — number of samples in the batch
+         # T:    seq_len      — number of tokens per sample
+         # H:    n_heads      — number of attention heads
+         # D:    hidden_dim   — model embedding size
+         # Dq_c: q_inner_dim  — per-head decomposition dim for Q
+         Dq_c = self.q_inner_dim
+         # Do_c: o_inner_dim  — per-head decomposition dim for O
+         Do_c = self.o_inner_dim
+         # Dkv:  kv_head_dim  — head size of the joint key/value head
+         Dkv = self.kv_head_dim
+         # Dr:   rope_dims    — the first Dr dimensions receive RoPE
+         # Dq_s: q_shared_dim — query shared subspace size
+         Dq_s = self.q_shared_dim
+         # Do_s: o_shared_dim — output shared subspace size
+         Do_s = self.o_shared_dim
+
+         # Input token embeddings
+         # hidden_states: [B, T, D]
+         B, T = hidden_states.shape[:2]
+         H = self.num_heads
+
+         # =============================
+         #     Shared Query Space
+         # =============================
+         # These are set to identity if no shared query space is used.
+
+         # Project token embeddings into the shared query latent.
+         # Input:
+         #     hidden_states  [B, T, D]
+         #     q_shared_proj  [D, Dq_s]
+         # Output:
+         #     q_shared       [B, T, Dq_s]
+         q_shared = self.q_shared_proj(hidden_states)
+
+         # Normalize latent vectors, shapes unchanged.
+         q_shared = self.q_shared_norm(q_shared)
+
+         # ================================
+         #    Decomposed Query Heads
+         # ================================
+
+         # Project query latents onto decomposed query heads.
+         #
+         # Down projection ('a')
+         # Input:
+         #     q_shared       [B, T, Dq_s]
+         #     q_priv_a_proj  [Dq_s, H*Dq_c]
+         # Output:
+         #     queries_c      [B, T, H*Dq_c]
+         queries_c = self.q_priv_a_proj(q_shared)
+
+         # Split the vectors by head
+         # Input:
+         #     queries_c  [B, T, H*Dq_c]
+         # Output:
+         #     queries_c  [B, T, H, Dq_c]
+         queries_c = queries_c.view(B, T, H, Dq_c)
+
+         # Up projection ('b')
+         # Input:
+         #     queries_c        [B, T, H, Dq_c]
+         #     q_priv_b_weight  [H, Dq_c, Dkv]
+         # Output:
+         #     queries          [B, H, T, Dkv]
+         queries = torch.einsum("bthd,hdc->bhtc", queries_c, self.q_priv_b_weight)
+
+         # ===================================
+         #    Single Joint Key/Value Head
+         # ===================================
+
+         # Project token embeddings into the single joint key/value head.
+         # Input:
+         #     hidden_states  [B, T, D]
+         #     kv_priv_proj   [D, Dkv]
+         # Output:
+         #     keyvalue       [B, T, Dkv]
+         keyvalue = self.kv_priv_proj(hidden_states)
+
+         # Normalize the joint key/value vectors (the analogue of QK-norm here).
+         keyvalue = self.kv_priv_norm(keyvalue)
+
+         # Prepare the queries and keyvalue vectors for RoPE and flash attention.
+         # We have multiple query heads, and the queries are in `queries`.
+         # We have a single key head, and the key vector is in `keyvalue`.
+
+         # Add a singleton head dimension, so the single key/value head has
+         # a series of vectors for each token in the sequence.
+         #
+         # Input:
+         #     keyvalue  [B, T, Dkv]
+         # Output:
+         #     keyvalue  [B, 1, T, Dkv]
+         keyvalue = keyvalue.unsqueeze(1)
+
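+         # (Recap of the 'b' up-projection einsum above: it is a batched
+         #  per-head matmul -- for each head h,
+         #      queries[:, h] = queries_c[:, :, h, :] @ q_priv_b_weight[h]
+         #  which yields [B, H, T, Dkv].)
+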
+         # ==================
+         #        RoPE
+         # ==================
+         # Apply rotary position embeddings to the first `self.rope_dims` of
+         # each head.
+         # The slice operations are free, but the concatenation is not, because
+         # the outputs of the rotation operation are new data occupying
+         # different memory. Still considered the best option, though.
+
+         # 1. Unpack the precomputed cosine and sine embeddings.
+         #    Position embeddings is a tuple of
+         #        (cos [seq_len, rope_dims],
+         #         sin [seq_len, rope_dims])
+         cos, sin = position_embeddings
+
+         # 2. Split the query and key heads into the part to rotate and the part
+         #    to pass through (early columns get position info, later ones don't).
+         #
+         #    (Using queries as the example)
+         #    Inputs:
+         #        queries  [B, H, T, Dkv]    Dkv = rope_dims + not_rope_dims
+         #    Outputs:
+         #        q_rope   [B, H, T, Dr]
+         #        q_pass   [B, H, T, Dkv-Dr]
+         q_rope, q_pass = queries[..., :self.rope_dims], queries[..., self.rope_dims:]
+         k_rope, k_pass = keyvalue[..., :self.rope_dims], keyvalue[..., self.rope_dims:]
+
+         # 3. Apply the rotary embedding to the designated slice.
+         #
+         #    To broadcast cos and sin across the batch and head dimensions, we
+         #    unsqueeze them.
+         #    Shape change: [T, Dr] -> [1, 1, T, Dr]
+         cos = cos.unsqueeze(0).unsqueeze(0)
+         sin = sin.unsqueeze(0).unsqueeze(0)
+
+         # Example shapes, with batch_size = 2, num_heads = 8, seq_len = 65,
+         # rope_dims = 16:
+         #
+         #     q_rope.shape[-1] // 2:                 8
+         #     x1 = x[..., :x.shape[-1] // 2].shape:  torch.Size([2, 8, 65, 8])
+         #
+         #     sin/cos.shape:                         torch.Size([1, 1, 65, 16])  # After double unsqueeze.
+         #     q_rope.shape:                          torch.Size([2, 8, 65, 16])
+         #
+         #     (q_rope * cos).shape:                  torch.Size([2, 8, 65, 16])
+         #
+         #     rotate_half(q_rope).shape:             torch.Size([2, 8, 65, 16])
+         #     (rotate_half(q_rope) * sin).shape:     torch.Size([2, 8, 65, 16])
+
+
+         # Let's walk through the queries as the example.
+         # What does rotate_half do? (dim -1 holds the row vectors, the queries)
+         #
+         # Step 1: Split the vector in half.
+         #     x1 = x[..., :x.shape[-1] // 2]   # Select the first half.
+         #     x2 = x[..., x.shape[-1] // 2:]   # Select the second half.
+         #
+         # Step 2: Negate the values in the second half, and reverse the order
+         #         of the halves:
+         #     return torch.cat((-x2, x1), dim=-1)
+         #
+         # ---- (q_rope * cos) ----
+         # Element-wise multiply the values in each `cos` vector with the
+         # corresponding (i.e., same sequence position) `q_rope` vector.
+         #
+         # Inputs:
+         #     q_rope  [B, H, T, Dr]
+         #     cos     [1, 1, T, Dr]
+         # Outputs:
+         #     x       [B, H, T, Dr]
+         #
+         # ---- (rotate_half(q_rope) * sin) ----
+         # The rotated copy pairs dimension i with dimension i + Dr/2; scaling
+         # it by sin and adding it to (q_rope * cos) applies the 2-D rotation
+         #     (x_i, x_{i+Dr/2}) -> (x_i cos θ - x_{i+Dr/2} sin θ,
+         #                           x_{i+Dr/2} cos θ + x_i sin θ)
+         # to each coordinate pair, where θ depends on position and dimension.
+         q_rotated = (q_rope * cos) + (rotate_half(q_rope) * sin)
+         k_rotated = (k_rope * cos) + (rotate_half(k_rope) * sin)
+
+         # 4. Concatenate the rotated and pass-through parts back together.
+         #    Input (each):  [B, h, T, Dr] and [B, h, T, Dkv-Dr]
+         #    Output (each): [B, h, T, Dkv]
+         #    (Where h = 1 for the key head and h = num_heads for the query heads.)
+         queries = torch.cat((q_rotated, q_pass), dim=-1)
+         keyvalue = torch.cat((k_rotated, k_pass), dim=-1)
+
+         # ====================
+         #      GQA / MQA
+         # ====================
+         # SDPA broadcasts the singleton key/value head dimension across the
+         # query heads, so an explicit `expand` is not needed (newer PyTorch
+         # also offers an `enable_gqa` flag for this). Kept for reference:
+         #
+         # Input:
+         #     keyvalue  [B, 1, T, Dkv]
+         # (Broadcast) Output:
+         #     keyvalue  [B, H, T, Dkv]
+         #keyvalue = keyvalue.expand(-1, H, -1, -1)
+
+         # ===================
+         #      Attention
+         # ===================
+         # We're ready for the attention score calculation.
+
+         # Only apply dropout during training.
+         # (`self.training` is a PyTorch flag.)
+         if self.training:
+             dropout_p = self.attention_dropout_prob
+         else:
+             dropout_p = 0.0
+
+         # Call SDPA / Flash Attention
+         # https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+         # Apply MQA / GQA: we have a single key head and multiple query heads.
+         values = F.scaled_dot_product_attention(
+             queries,
+             keyvalue,  # Single key vector (joint with value) for GQA / MQA.
+             keyvalue,  # Single value vector (joint with key) for GQA / MQA.
+             attn_mask=None,  # attention_mask,
+             dropout_p=dropout_p,
+             scale=self.softmax_scale,
+             is_causal=True,  # This is a decoder - apply causal masking.
+         )
+
+         # Attention outputs:
+         #     values  [B, H, T, Dkv]
+
+         # The first Dr dims of the value vectors carry RoPE information.
+         # We can either (1) add position dependence to the value-output process,
+         # or (2) strip off the RoPE information and only use the non-RoPE parts.
+
+         # Let's try option 1!
+
+         # Split the values into the RoPE and non-RoPE parts.
+         # Input:
+         #     values       [B, H, T, Dkv]
+         # Output:
+         #     values_rope  [B, H, T, Dr]
+         #     values_pass  [B, H, T, Dkv-Dr]
+         values_rope, values_pass = values[..., :self.rope_dims], values[..., self.rope_dims:]
+
+         # Undo the destination-position rotation on the pooled values.
+         # Inverse rotation: R_{-θ} x = (x * cos) - (rotate_half(x) * sin)
+         # Input:
+         #     values_rope   [B, H, T, Dr]
+         #     cos           [1, 1, T, Dr]
+         #     sin           [1, 1, T, Dr]
+         # Output:
+         #     values_unrot  [B, H, T, Dr]
+         values_unrot = (values_rope * cos) - (rotate_half(values_rope) * sin)
+
+         # Now the values carry relative-offset information in their RoPE dims,
+         # and the output heads can learn to use it.
+         values = torch.cat((values_unrot, values_pass), dim=-1)  # [B, H, T, Dkv]
+
+         # =========================
+         #     Output Projection
+         # =========================
+
+         # Project the values onto the decomposed output heads.
+         # Output down projection heads.
+         # Input:
+         #     values           [B, H, T, Dkv]
+         #     o_priv_a_weight  [H, Dkv, Do_c]
+         # Output:
+         #     outputs_c        [B, H, T, Do_c]
+         outputs_c = torch.einsum("bhtd,hdc->bhtc", values, self.o_priv_a_weight)
+
+         # For the up projection, we can concatenate the `outputs_c` vectors by
+         # head (the same way we would usually concatenate the value vectors).
+         # Input:
+         #     outputs_c  [B, H, T, Do_c]
+         # Output:
+         #     outputs_c  [B, T, H*Do_c]
+         outputs_c = outputs_c.permute(0, 2, 1, 3).contiguous().view(B, T, H * Do_c)
+
+         # Project up to the shared output space, summing across the output heads.
+         # Input:
+         #     outputs_c      [B, T, H*Do_c]
+         #     o_priv_b_proj  [H*Do_c, Do_s]
+         # Output:
+         #     output_s       [B, T, Do_s]
+         output_s = self.o_priv_b_proj(outputs_c)
+
+         # Apply normalization to the output latents.
+         output_s = self.o_shared_norm(output_s)
+
+         # Re-project the output latent representation back to model space.
+         # Input:
+         #     output_s       [B, T, Do_s]
+         #     o_shared_proj  [Do_s, D]
+         # Output:
+         #     attn_output    [B, T, D]
+         attn_output = self.o_shared_proj(output_s)
+
+         # TODO - Not currently supported:
+         # If this is a dense layer, project the values back into model space.
+         #     attn_output = self.o_proj(attn_output)
+
+         # -----------------------------------------
+
+         return attn_output
+
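+
+ # Usage sketch (comment only; illustrative, not part of the module):
+ #
+ #     attn = GroupedLatentAttention(config, layer_idx=2)   # a latent layer
+ #     rope = RotaryEmbedding(config)
+ #     cos, sin = rope.cos[:T, :], rope.sin[:T, :]          # [T, rope_dims]
+ #     out = attn(hidden_states,                            # [B, T, D]
+ #                position_embeddings=(cos, sin),
+ #                attention_mask=None)                      # -> [B, T, D]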
layers/mla.py ADDED
@@ -0,0 +1,616 @@
+ """# ▂▂▂▂▂▂▂▂▂▂▂▂
+
+ # `mla.py`
+
+ Based on: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+
+ ## RotaryEmbedding
+ """
+
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from ..models.shared_space_config import SharedSpaceDecoderConfig
+
+
+ def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
+     """
+     Create a normalization layer based on the config norm_type.
+
+     If `hidden_size` is `None`, this returns an identity layer.
+
+     Args:
+         hidden_size: The dimension to normalize over
+         config: Configuration containing norm_type and epsilon values
+
+     Returns:
+         Either a LayerNorm or RMSNorm layer
+     """
+     if hidden_size is None:
+         return nn.Identity()
+     elif config.norm_type == "layernorm":
+         return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+     elif config.norm_type == "rmsnorm":
+         return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
+     else:
+         # This should be caught by config validation, but being defensive
+         raise ValueError(f"Unknown norm_type: {config.norm_type}")
+
+
+ # TODO - Find a shared place to put this.
+ class DeepseekV3RMSNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-6):
+         """
+         DeepseekV3RMSNorm is equivalent to T5LayerNorm
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward(self, hidden_states):
+         input_dtype = hidden_states.dtype
+         hidden_states = hidden_states.to(torch.float32)
+         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+         return self.weight * hidden_states.to(input_dtype)
+
+
+ # Helper used twice during RoPE (once for the queries, once for the keys);
+ # the step-by-step explanation lives in the comments at the call site.
+ def rotate_half(x):
+     """Rotates half the hidden dims of the input."""
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return torch.cat((-x2, x1), dim=-1)
+
+ class RotaryEmbedding(nn.Module):
+     """Precompute RoPE embeddings and store them as buffers."""
+
+     def __init__(self, config: SharedSpaceDecoderConfig) -> None:
+         super().__init__()
+
+         dim = config.rope_dims
+         seq_len = config.max_position_embeddings
+
+         # ------------------------------
+         #  Compute inverse frequencies
+         # ------------------------------
+         # Shape: [dim // 2]
+         # inv_freq[i] = 1 / (theta^(2i / dim))
+         inv_freq = 1.0 / (
+             config.rope_theta
+             ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
+         )
+
+         # ------------------------------
+         #  Apply RoPE scaling if configured
+         # ------------------------------
+         if config.rope_scaling is not None:
+             scaling_type = config.rope_scaling.get("type", "linear")
+             scaling_factor = config.rope_scaling.get("factor", 1.0)
+
+             if scaling_type == "linear":
+                 # Linear scaling: divide frequencies by the scaling factor
+                 inv_freq = inv_freq / scaling_factor
+             elif scaling_type == "dynamic":
+                 # Dynamic scaling: adjust based on sequence length.
+                 # This is a simplified implementation.
+                 inv_freq = inv_freq / scaling_factor
+             else:
+                 print(f"Warning: Unknown RoPE scaling type '{scaling_type}', using linear scaling")
+                 inv_freq = inv_freq / scaling_factor
+
+         # ------------------------------
+         #  Compute position indices
+         # ------------------------------
+         # Shape: [seq_len]
+         t = torch.arange(seq_len, dtype=torch.float32)
+
+         # ------------------------------
+         #  Outer product: [seq_len, dim // 2]
+         #  Each row i contains: t[i] * inv_freq
+         # ------------------------------
+         freqs = torch.outer(t, inv_freq)
+
+         # ------------------------------
+         #  Duplicate the angles: [seq_len, dim]
+         #  Note: this is the block layout [θ_0..θ_{d/2-1}, θ_0..θ_{d/2-1}]
+         #  that pairs with `rotate_half` above, not the interleaved
+         #  [sin_0, cos_0, sin_1, cos_1, ...] layout.
+         # ------------------------------
+         emb = torch.cat((freqs, freqs), dim=-1)
+
+         # ------------------------------
+         #  Register cos/sin as buffers
+         #  - Stored in float32
+         #  - Will be moved to correct device/dtype via model.to(...)
+         #  - Not saved with state_dict (persistent=False)
+         # ------------------------------
+         self.register_buffer("cos", emb.cos(), persistent=False)
+         self.register_buffer("sin", emb.sin(), persistent=False)
+
+     def forward(self, position_ids: torch.LongTensor) -> tuple[torch.Tensor, torch.Tensor]:
+         """Kept for API compatibility; the model reads the `cos`/`sin` buffers directly."""
+         return self.cos, self.sin
+
+ """## MLA"""
+
+ class MultiheadLatentAttention(nn.Module):
+     """
+     A variant of MLA with:
+       - Simplified RoPE handling:
+           - A portion of the head dimensions are used for position information.
+       - The same number of query heads as key heads (no MQA).
+       - An optional output subspace.
+     """
+
+     def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int):
+         super().__init__()
+
+         self.config = config
+
+         # Used to determine if this layer is dense or uses latents.
+         self.layer_idx = layer_idx
+         self.attention_dropout_prob = config.attention_dropout_prob
+
+         self.num_heads = config.num_attention_heads
+
+         self.rope_theta = config.rope_theta
+         self.rope_dims = config.rope_dims
+         self.nope_dims = config.nope_dims
+
+         self.q_shared_dim = config.q_shared_dim
+         self.kv_shared_dim = config.kv_shared_dim
+         self.o_shared_dim = config.o_shared_dim
+
+         self.qk_private_dim = config.qk_private_dim
+         self.vo_private_dim = config.vo_private_dim
+
+         self.hidden_size = config.hidden_size
+
+         # =========================
+         #    Input Projections
+         # =========================
+
+         # If this is one of the dense layers,
+         if self.layer_idx < config.num_dense_layers:
+
+             # =========================
+             #     Dense Attention
+             # =========================
+
+             # No latent projections.
+             self.latent_spaces = False
+
+             # Define the standard QKV projection
+             self.qkv_proj = nn.Linear(
+                 config.hidden_size,
+                 self.num_heads * (self.qk_private_dim * 2 + self.vo_private_dim),
+                 bias=config.attention_bias,
+             )
+
+             # Dense output projection
+             self.o_proj = nn.Linear(
+                 self.num_heads * self.vo_private_dim,
+                 config.hidden_size,
+                 bias=config.attention_bias,
+             )
+
+         # If we're past the dense layers,
+         else:
+
+             # =========================
+             #     Latent Attention
+             # =========================
+
+             # Use latent projections.
+             self.latent_spaces = True
+
+             # Input latent projections
+
+             # If we're using a shared query subspace,
+             if config.q_shared_dim is not None:
+                 # Set a flag that we'll check in `forward`.
+                 self.query_shared = True
+
+                 self.q_shared_proj = nn.Linear(
+                     config.hidden_size,
+                     self.q_shared_dim,
+                     bias=config.attention_bias,
+                 )
+
+                 self.q_shared_norm = create_norm_layer(self.q_shared_dim, config)
+
+             else:
+                 # Set a flag that we'll check in `forward`.
+                 self.query_shared = False
+
+                 self.q_shared_dim = config.hidden_size
+
+                 #print("Updated self.q_shared_dim to", self.q_shared_dim)
+
+                 # Use identity.
+                 self.q_shared_proj = nn.Identity()
+                 self.q_shared_norm = nn.Identity()
+
+             # If we're using a shared key/value subspace,
+             if config.kv_shared_dim is not None:
+                 # Set a flag that we'll check in `forward`.
+                 self.keyvalue_shared = True
+
+                 self.kv_shared_proj = nn.Linear(
+                     config.hidden_size,
+                     self.kv_shared_dim,
+                     bias=config.attention_bias,
+                 )
+
+                 self.kv_shared_norm = create_norm_layer(self.kv_shared_dim, config)
+
+             else:
+                 # Set a flag that we'll check in `forward`.
+                 self.keyvalue_shared = False
+
+                 self.kv_shared_dim = config.hidden_size
+
+                 # Use identity.
+                 self.kv_shared_proj = nn.Identity()
+                 self.kv_shared_norm = nn.Identity()
+
+             #print("config.q_shared_dim", config.q_shared_dim)
+             #print("self.qk_private_dim", self.qk_private_dim)
+
+             # Query heads
+             self.q_private_proj = nn.Linear(
+                 self.q_shared_dim,
+                 self.num_heads * self.qk_private_dim,
+                 bias=False  # TODO
+             )
+
+             # Key and Value heads, concatenated
+             self.kv_private_proj = nn.Linear(
+                 self.kv_shared_dim,
+                 self.num_heads * (self.qk_private_dim + self.vo_private_dim),
+                 bias=False,
+             )
+
+             # Use an output subspace if o_shared_dim is specified.
+             self.output_subspace = config.o_shared_dim is not None
+
+             # If we're using an output subspace,
+             if self.output_subspace:
+
+                 # ==========================
+                 #      Output Subspace
+                 # ==========================
+
+                 self.o_shared_dim = config.o_shared_dim
+
+                 # Per-head output projections
+                 # (Similar to the original W^O, but projects the scored value
+                 # vectors into a latent space instead of back to model space.)
+                 self.o_private_proj = nn.Linear(
+                     self.num_heads * self.vo_private_dim,
+                     self.o_shared_dim,
+                     bias=False
+                 )
+
+                 # Norm layer between o_private_proj and o_shared_proj.
+                 # Note: In previous ViT experiments, this norm step hurt performance,
+                 # but was beneficial in the DeepSeekV3 experiments.
+                 # However, we're making it configurable so it can be tested in
+                 # different contexts.
+                 self.o_private_norm = create_norm_layer(self.o_shared_dim, config)
+
+                 # Shared output projection.
+                 # The head outputs from `o_private_proj` are first summed together
+                 # (across heads) in the latent space.
+                 # Then we project their combined outputs (a single vector per token)
+                 # back to model space via `o_shared_proj`.
+                 self.o_shared_proj = nn.Linear(
+                     self.o_shared_dim,
+                     self.hidden_size,
+                     bias=config.attention_bias
+                 )
+             else:
+                 # Dense output projection
+                 self.o_proj = nn.Linear(
+                     self.num_heads * self.vo_private_dim,
+                     config.hidden_size,
+                     bias=config.attention_bias,
+                 )
+
+         # Softmax scaling factor.
+         self.softmax_scale = self.qk_private_dim ** (-0.5)
+
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         position_embeddings: tuple[torch.Tensor, torch.Tensor],
+         attention_mask: Optional[torch.Tensor],
+         #past_key_value: Optional[Cache] = None,             # TODO - Can I remove this?
+         #cache_position: Optional[torch.LongTensor] = None,  # TODO - Can I remove this?
+         **kwargs,
+     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+         # === Tensor Dimension Symbols ===
+         # B:   batch_size     — number of samples in the batch
+         # T:   seq_len        — number of tokens per sample
+         # H:   n_heads        — number of attention heads
+         # D:   hidden_dim     — model embedding size
+         # Dq:  qk_private_dim — per-head dim for Q and K
+         # Dv:  vo_private_dim — per-head value/output projection dimension
+         # Dr:  rope_dims      — the first Dr dimensions receive RoPE
+         # Cq:  q_shared_dim   — query shared subspace size
+         # Ckv: kv_shared_dim  — key-value shared subspace size
+         # Co:  o_shared_dim   — output shared subspace size
+
+         # Input token embeddings
+         # hidden_states: [B, T, D]
+         B, T = hidden_states.shape[:2]
+         H = self.num_heads
+         Dq = self.qk_private_dim  # per-head dim for Q and K
+         Dv = self.vo_private_dim  # per-head dim for V/O
+
+         Dc_q, Dc_kv = self.q_shared_dim, self.kv_shared_dim
+
+         # ==============================
+         #    QKV Head Projections
+         # ==============================
+         # Project tokens into per-head query, key, and value vectors.
+
+         # If this layer uses latent projections,
+         if self.latent_spaces:
+
+             # ================================
+             #    Shared Space Projections
+             # ================================
+
+             # Project token embeddings into shared latents.
+             # Input:
+             #     hidden_states   [B, T, D]
+             #     q_shared_proj   [D, Cq]
+             #     kv_shared_proj  [D, Ckv]
+             # Output:
+             #     q_shared        [B, T, Cq]
+             #     kv_shared       [B, T, Ckv]
+
+             # If we're using a shared query subspace,
+             if self.q_shared_dim is not None:
+                 q_shared = self.q_shared_proj(hidden_states)
+
+                 # Normalize latent vectors, shapes unchanged.
+                 q_shared = self.q_shared_norm(q_shared)
+             # Otherwise,
+             else:
+                 # Use the hidden states.
+                 q_shared = hidden_states
+
+             # If we're using a shared key/value subspace,
+             if self.kv_shared_dim is not None:
+
+                 # Project token embeddings into the shared subspace.
+                 kv_shared = self.kv_shared_proj(hidden_states)
+
+                 # Normalize latent vectors, shapes unchanged.
+                 kv_shared = self.kv_shared_norm(kv_shared)
+             # Otherwise,
+             else:
+                 # Use the hidden states.
+                 kv_shared = hidden_states
+
+             # ======================================
+             #    Per-Head (Private) Projections
+             # ======================================
+
+             # Project query latents onto query heads.
+             # Input:
+             #     q_shared        [B, T, Cq]
+             #     q_private_proj  [Cq, H*Dq]
+             # Output:
+             #     queries         [B, T, H*Dq]
+             queries = self.q_private_proj(q_shared)
+
+             # Project key/value latents onto key and value heads.
+             # The key and value heads are all concatenated: the key heads occupy
+             # the first H*Dq columns of kv_private_proj and the value heads the
+             # remaining H*Dv. This yields the key and value vectors concatenated
+             # the same way.
+             #
+             # Input:
+             #     kv_shared        [B, T, Ckv]
+             #     kv_private_proj  [Ckv, H*(Dq+Dv)]
+             # Output:
+             #     keysvalues       [B, T, H*(Dq+Dv)]
+             keysvalues = self.kv_private_proj(kv_shared)
+
+             # Split into key and value tensors.
+             # (`split` rather than `chunk`, so this stays correct when Dq != Dv.)
+             # keys:   [B, T, H*Dq]
+             # values: [B, T, H*Dv]
+             keys, values = keysvalues.split([H * Dq, H * Dv], dim=-1)
+
+         # If this is a dense attention layer (no latent projections),
+         else:
+
+             # ====================
+             #     Standard MHA
+             # ====================
+
+             # Standard QKV projection.
+             # Input:
+             #     hidden_states      [B, T, D]
+             #     qkv_proj           [D, H*(2*Dq+Dv)]
+             # Output:
+             #     querieskeysvalues  [B, T, H*(2*Dq+Dv)]
+             querieskeysvalues = self.qkv_proj(hidden_states)
+
+             # Separate the query, key, and value vectors.
+             queries, keys, values = querieskeysvalues.split(
+                 [H * Dq, H * Dq, H * Dv], dim=-1
+             )
+
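+         # (Why `split` and not `chunk`: with, say, H=8, Dq=32, Dv=16,
+         #  `keysvalues` has 8*(32+16) = 384 columns; split gives keys 256 /
+         #  values 128, whereas chunk(2) would give 192/192 -- which is only
+         #  correct when Dq == Dv.)
+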
+         # Reshape so each head has its own series of row vectors.
+         #
+         # Inputs:
+         #     queries/keys  [B, T, H*Dq],  values  [B, T, H*Dv]
+         # Outputs:
+         #     queries/keys  [B, H, T, Dq],  values  [B, H, T, Dv]
+         queries = queries.view(B, T, H, Dq).transpose(1, 2)
+         keys = keys.view(B, T, H, Dq).transpose(1, 2)
+         values = values.view(B, T, H, Dv).transpose(1, 2)
+
+         # ==================
+         #        RoPE
+         # ==================
+         # Apply rotary position embeddings to the first `self.rope_dims` of
+         # each head.
+         # The slice operations are free, but the concatenation is not, because
+         # the outputs of the rotation operation are new data occupying
+         # different memory. Still considered the best option, though.
+
+         # 1. Unpack the precomputed cosine and sine embeddings.
+         #    Position embeddings is a tuple of
+         #        (cos [seq_len, rope_dims],
+         #         sin [seq_len, rope_dims])
+         cos, sin = position_embeddings
+
+         # 2. Split the query and key heads into the part to rotate and the part
+         #    to pass through (early columns get position info, later ones don't).
+         #
+         #    (Using queries as the example)
+         #    Inputs:
+         #        queries  [B, H, T, Dq]    Dq = rope_dims + not_rope_dims
+         #    Outputs:
+         #        q_rope   [B, H, T, Dr]
+         #        q_pass   [B, H, T, Dq-Dr]
+         q_rope, q_pass = queries[..., :self.rope_dims], queries[..., self.rope_dims:]
+         k_rope, k_pass = keys[..., :self.rope_dims], keys[..., self.rope_dims:]
+
+         # 3. Apply the rotary embedding to the designated slice.
+         #
+         #    To broadcast cos and sin across the batch and head dimensions, we
+         #    unsqueeze them.
+         #    Shape change: [T, Dr] -> [1, 1, T, Dr]
+         cos = cos.unsqueeze(0).unsqueeze(0)
+         sin = sin.unsqueeze(0).unsqueeze(0)
+
+         # Example shapes, with batch_size = 2, num_heads = 8, seq_len = 65,
+         # rope_dims = 16:
+         #
+         #     q_rope.shape[-1] // 2:                 8
+         #     x1 = x[..., :x.shape[-1] // 2].shape:  torch.Size([2, 8, 65, 8])
+         #
+         #     sin/cos.shape:                         torch.Size([1, 1, 65, 16])  # After double unsqueeze.
+         #     q_rope.shape:                          torch.Size([2, 8, 65, 16])
+         #
+         #     (q_rope * cos).shape:                  torch.Size([2, 8, 65, 16])
+         #
+         #     rotate_half(q_rope).shape:             torch.Size([2, 8, 65, 16])
+         #     (rotate_half(q_rope) * sin).shape:     torch.Size([2, 8, 65, 16])
+
+
+         # Let's walk through the queries as the example.
+         # What does rotate_half do? (dim -1 holds the row vectors, the queries)
+         #
+         # Step 1: Split the vector in half.
+         #     x1 = x[..., :x.shape[-1] // 2]   # Select the first half.
+         #     x2 = x[..., x.shape[-1] // 2:]   # Select the second half.
+         #
+         # Step 2: Negate the values in the second half, and reverse the order
+         #         of the halves:
+         #     return torch.cat((-x2, x1), dim=-1)
+         #
+         # ---- (q_rope * cos) ----
+         # Element-wise multiply the values in each `cos` vector with the
+         # corresponding (i.e., same sequence position) `q_rope` vector.
+         #
+         # Inputs:
+         #     q_rope  [B, H, T, Dr]
+         #     cos     [1, 1, T, Dr]
+         # Outputs:
+         #     x       [B, H, T, Dr]
+         #
+         # ---- (rotate_half(q_rope) * sin) ----
+         # The rotated copy pairs dimension i with dimension i + Dr/2; scaling
+         # it by sin and adding it to (q_rope * cos) applies the 2-D rotation
+         #     (x_i, x_{i+Dr/2}) -> (x_i cos θ - x_{i+Dr/2} sin θ,
+         #                           x_{i+Dr/2} cos θ + x_i sin θ)
+         # to each coordinate pair, where θ depends on position and dimension.
+         q_rotated = (q_rope * cos) + (rotate_half(q_rope) * sin)
+         k_rotated = (k_rope * cos) + (rotate_half(k_rope) * sin)
+
+         # 4. Concatenate the rotated and pass-through parts back together.
+         #    Input (each):  [B, H, T, Dr] and [B, H, T, Dq-Dr]
+         #    Output (each): [B, H, T, Dq]
+         queries = torch.cat((q_rotated, q_pass), dim=-1)
+         keys = torch.cat((k_rotated, k_pass), dim=-1)
+
+         # ===================
+         #      Attention
+         # ===================
+         # The tensors (queries, keys: [B, H, T, Dq]; values: [B, H, T, Dv])
+         # are ready for the attention score calculation.
+
+         # Only apply dropout during training.
+         # (`self.training` is a PyTorch flag.)
+         if self.training:
+             dropout_p = self.attention_dropout_prob
+         else:
+             dropout_p = 0.0
+
+         # Call SDPA / Flash Attention
+         # https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+         attn_output = F.scaled_dot_product_attention(
+             queries,
+             keys,
+             values,
+             attn_mask=None,  # attention_mask,
+             dropout_p=dropout_p,
+             scale=self.softmax_scale,
+             is_causal=True,  # This is a decoder - apply causal masking.
+         )
+
+         # Reshape output back to [B, T, H*Dv] from [B, H, T, Dv].
+         attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, H * Dv)
+
590
+ # =========================
591
+ # Output Projection
592
+ # =========================
593
+
594
+ # If we are using an output latent projection,
595
+ if self.latent_spaces and self.output_subspace:
596
+
597
+ # Project the attention output into the output latent space.
598
+ # This is analogous to the W^O matrix in standard attention but
599
+ # projects to an intermediate latent dimension.
600
+ attn_output = self.o_private_proj(attn_output)
601
+
602
+ # Apply normalization to the output latents
603
+ attn_output = self.o_private_norm(attn_output)
604
+
605
+ # Re-project the output latent representation back to model space.
606
+ attn_output = self.o_shared_proj(attn_output)
607
+
608
+ # If this is a dense layer,
609
+ else:
610
+ # Project the values back into model space.
611
+ attn_output = self.o_proj(attn_output)
612
+
613
+ # -----------------------------------------
614
+
615
+ return attn_output
616
+
layers/task_heads.py ADDED
@@ -0,0 +1,195 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from typing import Optional, Union
6
+
7
+ from transformers.modeling_outputs import CausalLMOutputWithPast
8
+
9
+ from ..models.shared_space_config import SharedSpaceDecoderConfig
10
+ from ..models.shared_space_decoder import (
11
+ SharedSpaceDecoderPreTrainedModel,
12
+ SharedSpaceDecoderModel,
13
+ DeepseekV3RMSNorm
14
+ )
15
+
16
+ def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
17
+ """
18
+ Create a normalization layer based on the config norm_type.
19
+
20
+ Args:
21
+ hidden_size: The dimension to normalize over
22
+ config: Configuration containing norm_type and epsilon values
23
+
24
+ Returns:
25
+ Either a LayerNorm or RMSNorm layer
26
+ """
27
+ if config.norm_type == "layernorm":
28
+ return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
29
+ elif config.norm_type == "rmsnorm":
30
+ return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
31
+ else:
32
+ # This should be caught by config validation, but being defensive
33
+ raise ValueError(f"Unknown norm_type: {config.norm_type}")
34
+
35
+
36
+ class SharedSpaceDecoderForCausalLM(SharedSpaceDecoderPreTrainedModel):
37
+ """
38
+ Subspace Decoder model with a causal language modeling head.
39
+
40
+ This model extends the SharedSpaceDecoderModel with:
41
+ - A language modeling head that projects hidden states to vocabulary logits
42
+ - Support for computing cross-entropy loss for language modeling
43
+ - Proper HuggingFace compatibility for causal language modeling tasks
44
+ - Decoder-specific initialization strategies
45
+
46
+ The model can be used for:
47
+ - Text generation
48
+ - Language modeling pretraining
49
+ - Fine-tuning on downstream tasks
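+
+ Example (a minimal sketch; assumes a fully-specified config):
+
+ config = SharedSpaceDecoderConfig(...)
+ model = SharedSpaceDecoderForCausalLM(config)
+ out = model(input_ids, attention_mask=attention_mask, labels=input_ids)
+ loss, logits = out.loss, out.logits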
50
+ """
51
+
52
+ def __init__(self, config: SharedSpaceDecoderConfig) -> None:
53
+ super().__init__(config)
54
+
55
+ # Initialize the base decoder model
56
+ self.model = SharedSpaceDecoderModel(config)
57
+
58
+ # Final layer norm before the language modeling head
59
+ self.norm = create_norm_layer(config.hidden_size, config)
60
+
61
+ # Language modeling head
62
+ # Projects from hidden_size to vocab_size to get logits for each token
63
+ self.lm_head = nn.Linear(
64
+ config.hidden_size,
65
+ config.vocab_size,
66
+ bias=False # Following common practice in modern LMs
67
+ )
68
+
69
+ # Initialize weights with decoder-specific strategy
70
+ # Note: tie_weights() will be called automatically by post_init() if config.tie_word_embeddings=True
71
+ self.post_init()
72
+
73
+ def _init_weights(self, module: nn.Module) -> None:
74
+ """
75
+ Decoder-specific weight initialization with special handling for language modeling head.
76
+
77
+ Key differences from encoder initialization:
78
+ - Language modeling head gets specialized initialization for stability
79
+ - Configurable normalization layers (LayerNorm or RMSNorm) are properly handled
80
+ - Weight tying considerations for embedding/lm_head relationship
81
+ """
82
+
83
+ # Use the base class initialization for most modules
84
+ super()._init_weights(module)
85
+
86
+ # Special handling for language modeling head
87
+ if module is self.lm_head:
88
+ # Use smaller initialization for the language modeling head
89
+ # This helps with training stability in autoregressive generation
90
+ # Common practice is to use std=initializer_range or smaller
91
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
92
+
93
+ # If weight tying is not used, we might want even smaller init
94
+ if self.model.vocab_proj is not None:
95
+ # For vocab subspace models where weights aren't tied,
96
+ # use a smaller scale to prevent initial logits from being too large
97
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range * 0.5)
98
+
99
+ def get_input_embeddings(self):
100
+ """Return the input embedding layer for compatibility with HuggingFace."""
101
+ return self.model.vocab_embed
102
+
103
+ def set_input_embeddings(self, value):
104
+ """Set the input embedding layer for compatibility with HuggingFace."""
105
+ self.model.vocab_embed = value
106
+
107
+ def get_output_embeddings(self):
108
+ """Return the output embedding layer (lm_head) for compatibility."""
109
+ return self.lm_head
110
+
111
+ def set_output_embeddings(self, new_embeddings):
112
+ """Set the output embedding layer for compatibility."""
113
+ self.lm_head = new_embeddings
114
+
115
+ def tie_weights(self):
116
+ """
117
+ Tie the input and output embedding weights.
118
+
119
+ This method sets the language modeling head's weight to be the same as
120
+ the input embedding weight. This reduces the number of parameters and
121
+ is a common practice in modern language models.
122
+
123
+ Note: For vocab subspace models, we need to handle the case where
124
+ input embeddings go through a projection layer.
125
+ """
126
+ # Only tie when embeddings live in model space (no vocab_proj)
127
+ if getattr(self.model, "vocab_proj", None) is None:
128
+ # Use HF utility for correct tying/cloning semantics
129
+ self._tie_or_clone_weights(self.lm_head, self.model.vocab_embed)
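+ # (_tie_or_clone_weights points lm_head.weight at vocab_embed.weight,
+ # or clones it when the config requires cloning, e.g., torchscript.)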
130
+ # else: leave untied for subspace case
131
+
132
+
133
+ def forward(
134
+ self,
135
+ input_ids: torch.LongTensor,
136
+ attention_mask: Optional[torch.Tensor] = None,
137
+ labels: Optional[torch.LongTensor] = None,
138
+ **kwargs,
139
+ ) -> Union[CausalLMOutputWithPast, tuple]:
140
+ """
141
+ Forward pass for causal language modeling.
142
+
143
+ Args:
144
+ input_ids: Token ids of shape [batch_size, seq_len]
145
+ attention_mask: Attention mask of shape [batch_size, seq_len]
146
+ (1 for real tokens, 0 for padding)
147
+ labels: Ground truth token ids for computing loss. Same shape as input_ids.
148
+ If provided, loss will be computed. Typically input_ids shifted by 1.
149
+
150
+ Returns:
151
+ CausalLMOutputWithPast containing:
152
+ - logits: Prediction logits of shape [batch_size, seq_len, vocab_size]
153
+ - loss: Cross-entropy loss if labels provided, else None
154
+ - hidden_states: Final layer hidden states [batch_size, seq_len, hidden_size]
155
+ """
156
+
157
+ # Run the base decoder model
158
+ # This applies all the transformer layers with causal attention
159
+ hidden_states = self.model(
160
+ input_ids=input_ids,
161
+ attention_mask=attention_mask,
162
+ **kwargs
163
+ )
164
+
165
+ # Apply final layer normalization
166
+ # This normalizes the final hidden states before the language modeling head
167
+ hidden_states = self.norm(hidden_states)
168
+
169
+ # Project to vocabulary logits
170
+ # Shape: [batch_size, seq_len, vocab_size]
171
+ logits = self.lm_head(hidden_states)
172
+
173
+ # Compute loss if labels are provided
174
+ # Previously, we had custom loss computation here, but now we use the
175
+ # standard HuggingFace loss function.
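+ # (self.loss_function resolves to HF's causal-LM loss, which shifts the
+ # logits/labels internally, so `labels` can simply be the unshifted ids.)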
176
+ loss = None
177
+ if labels is not None:
178
+ # Flatten the tokens
179
+ loss = self.loss_function(
180
+ logits,
181
+ labels,
182
+ vocab_size=self.config.vocab_size,
183
+ **kwargs,
184
+ )
185
+
186
+ # Return in HuggingFace format
187
+ return CausalLMOutputWithPast(
188
+ loss=loss,
189
+ logits=logits,
190
+ past_key_values=None, # Not implementing KV cache yet
191
192
+ hidden_states=hidden_states if kwargs.get("output_hidden_states", False) else None,
193
+ attentions=None,
194
+ )
195
+
models/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """
4
+ Shared Subspace Decoder Models
5
+
6
+ This module contains the implementation of the Shared Subspace Decoder architecture,
7
+ including Multi-Head Latent Attention (MLA) and decomposed MLP layers.
8
+ """
9
+
10
+ from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
11
+
12
+ from .shared_space_config import SharedSpaceDecoderConfig
13
+ from .shared_space_decoder import (
14
+ SharedSpaceDecoderPreTrainedModel,
15
+ SharedSpaceDecoderModel,
16
+ )
17
+
18
+ # Import from task_heads in layers directory
19
+ from ..layers.task_heads import SharedSpaceDecoderForCausalLM
20
+
21
+ # Register the configuration class with AutoConfig
22
+ AutoConfig.register("shared_space_decoder", SharedSpaceDecoderConfig)
23
+
24
+ # Register the model classes with AutoModel
25
+ AutoModel.register(SharedSpaceDecoderConfig, SharedSpaceDecoderModel)
26
+ AutoModelForCausalLM.register(SharedSpaceDecoderConfig, SharedSpaceDecoderForCausalLM)
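+
+ # With the registrations above, the standard Auto* entry points resolve to
+ # these classes (a sketch; registration runs when this package is imported):
+ #
+ #   config = SharedSpaceDecoderConfig(...)
+ #   model = AutoModelForCausalLM.from_config(config)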
27
+
28
+ __all__ = [
29
+ "SharedSpaceDecoderConfig",
30
+ "SharedSpaceDecoderPreTrainedModel",
31
+ "SharedSpaceDecoderModel",
32
+ "SharedSpaceDecoderForCausalLM",
33
+ ]
models/shared_space_config.py ADDED
@@ -0,0 +1,256 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ from torch import nn
5
+
6
+ from transformers.configuration_utils import PretrainedConfig
7
+ from transformers.modeling_utils import PreTrainedModel
8
+
9
+ class SharedSpaceDecoderConfig(PretrainedConfig):
10
+ r"""
11
+ Configuration class for SharedSpaceDecoderConfig.
12
+
13
+ Extends the HuggingFace `PretrainedConfig` to support architectural
14
+ variations including:
15
+ - Multi-Head Latent Attention (MLA)
16
+ - Decomposed MLPs (low-rank FFNs)
17
+ - Flexible attention backends (eager, flash, sdpa)
18
+ - Explicit shared subspaces for Q, K, V, and O projections
19
+
20
+ This config does not infer any defaults based on `hidden_size`. All
21
+ dimensions and ranks must be explicitly specified. If required values are
22
+ missing, a `ValueError` is raised during initialization.
23
+
24
+ ----------------------
25
+ Core Model Parameters:
26
+ ----------------------
27
+ - vocab_size (`int`) — Vocabulary size.
28
+ - hidden_size (`int`) — Model hidden dimension.
29
+ - num_hidden_layers (`int`) — Number of transformer blocks.
30
+ - intermediate_size (`int`) — Feed-forward hidden dimension.
31
+ - hidden_act (`str`) — Activation function. (Not an explicit `__init__`
+   argument; the FFN block uses SwiGLU.)
32
+ - hidden_dropout_prob (`float`) — Dropout after projections and FFNs.
33
+ - attention_dropout_prob (`float`) — Dropout applied to attention scores.
34
+ - max_position_embeddings (`int`) — Max sequence length.
35
+ - initializer_range (`float`) — Stddev of weight init.
36
+
37
+ - layer_norm_eps (`float`) — Epsilon for LayerNorm.
38
+ - rms_norm_eps (`float`) — Epsilon for RMSNorm.
39
+
40
+ - classifier_dropout (`float` or None) — Dropout for final classifier.
41
+
42
+ - vocab_subspace (`bool`) — Whether to learn token embeddings in a
+   low-rank latent space and project them up to model size.
+ - vocab_rank (`int` or None) — Rank of the vocabulary subspace
+   (required if `vocab_subspace=True`).
44
+
45
+ ----------------------------------
46
+ Multi-Head Latent Attention (MLA):
47
+ ----------------------------------
48
+ - num_attention_heads (`int`) — Number of attention heads.
49
+
50
+ - q_shared_dim (`int`) — Rank of the shared query subspace.
51
+ - kv_shared_dim (`int`) — Rank of the shared key/value subspace.
52
+
53
+ - output_subspace (`bool`) — Whether to use a shared latent subspace for output projections.
54
+ - o_shared_dim (`int`) — Rank of the shared output subspace (required if `output_subspace=True`).
55
+ - qk_private_dim (`int`) — Query/key private dimension per head.
56
+ - vo_private_dim (`int`) — Value/output private dimension per head.
57
+
58
+ - rope_dims (`int`) — Number of head dimensions carrying RoPE.
59
+ - nope_dims (`int`) — Non-positional encoding dimensions.
60
+ - rope_theta (`float`) — Base frequency used for RoPE.
61
+ - rope_scaling (`dict` or None) — HF-style scaling dict for RoPE.
62
+ - attention_bias (`bool`) — Whether to include bias terms in Q/K/V projections.
63
+ - num_dense_layers (`int`) — Number of leading layers that do not use
64
+ subspaces for attention or FFNs.
65
+ - attention_backend (`str`) — Must be one of `"eager"`, `"flash_attention_2"`, or `"sdpa"`.
66
+
67
+ ----------------------
68
+ Decomposed MLP (Low-Rank FFN):
69
+ ----------------------
70
+ - ffn_decompose (`bool`) — Whether to enable low-rank FFNs.
71
+ - ffn_rank (`int`) — Rank of the shared FFN latent space (required if `ffn_decompose=True`).
72
+
73
+ ----------------------
74
+ Validation Behavior:
75
+ ----------------------
76
+ Raises `ValueError` at init time if:
77
+ - FFN decomposition is enabled without specifying `ffn_rank`.
78
+ - An unknown `attention_backend` is provided.
79
+ """
80
+
81
+ model_type = "shared_space_decoder" # Must match the key registered with AutoConfig in models/__init__.py.
82
+
83
+ def __init__(
84
+ self,
85
+
86
+ # === Core Model ===
87
+ vocab_size: int = 30522,
88
+ hidden_size: int = 512,
89
+ num_hidden_layers: int = 12,
90
+
91
+ intermediate_size: int = 3072,
92
+
93
+ hidden_dropout_prob=0.1,
94
+ attention_dropout_prob=0.1,
95
+ max_position_embeddings: int = 2048,
96
+ initializer_range=0.02,
97
+ layer_norm_eps=1e-12,
98
+ rms_norm_eps=1e-6, # DeepSeek-V3's default; confirm against your config.
99
+ norm_type="layernorm", # Choice between "layernorm" and "rmsnorm"
100
+ classifier_dropout=None,
101
+
102
+ vocab_subspace=False,
103
+ vocab_rank=None,
104
+ tie_word_embeddings=True,
105
+
106
+ # === Multi-Head Latent Attention ===
107
+ num_attention_heads: int = 16,
108
+ rope_dims: int = 16,
109
+
110
+ q_shared_dim: int = None,
111
+ kv_shared_dim: int = None,
112
+
113
+ o_shared_dim=None, # If None, no output subspace is used
114
+
115
+ # Private head dimensions
116
+ qk_private_dim: int = None, # Query/key private dimension per head
117
+ vo_private_dim: int = None, # Value/output private dimension per head
118
+ nope_dims: int = None, # Non-positional encoding dimensions
119
+
120
+ attention_backend="eager",
121
+ rope_theta=10000.0,
122
+ rope_scaling=None,
123
+ attention_bias=False,
124
+
125
+ # === MLA Composition ===
126
+ num_dense_layers=12, # dense MHA layers before MLA starts
127
+
128
+ # === Decomposed MLP ===
129
+ ffn_decompose=False,
130
+ ffn_rank=None,
131
+ **kwargs
132
+ ) -> None:
133
+ super().__init__(**kwargs)
134
+
135
+
136
+
137
+ # === Core Model ===
138
+ self.vocab_size = vocab_size
139
+ self.hidden_size = hidden_size
140
+ self.num_hidden_layers = num_hidden_layers
141
+ self.intermediate_size = intermediate_size
142
+ self.hidden_dropout_prob = hidden_dropout_prob
143
+ self.attention_dropout_prob = attention_dropout_prob
144
+ self.max_position_embeddings = max_position_embeddings
145
+ self.initializer_range = initializer_range
146
+ self.layer_norm_eps = layer_norm_eps
147
+ self.rms_norm_eps = rms_norm_eps
148
+ self.norm_type = norm_type
149
+ self.classifier_dropout = classifier_dropout
150
+
151
+ self.vocab_subspace = vocab_subspace
152
+ self.vocab_rank = vocab_rank
153
+ self.tie_word_embeddings = tie_word_embeddings
154
+
155
+ # === MLA ===
156
+ self.num_attention_heads = num_attention_heads
157
+ self.rope_dims = rope_dims
158
+
159
+ self.q_shared_dim = q_shared_dim
160
+ self.kv_shared_dim = kv_shared_dim
161
+ self.o_shared_dim = o_shared_dim
162
+
163
+ # Private head dimensions
164
+ self.qk_private_dim = qk_private_dim
165
+ self.vo_private_dim = vo_private_dim
166
+ self.nope_dims = nope_dims
167
+ self.rope_theta = rope_theta
168
+ self.rope_scaling = rope_scaling
169
+ self.attention_bias = attention_bias
170
+ self.num_dense_layers = num_dense_layers
171
+
172
+ # === Decomposed FFN ===
173
+ self.ffn_decompose = ffn_decompose
174
+ self.ffn_rank = ffn_rank
175
+
176
+ # === Attention backend ===
177
+ self.attention_backend = attention_backend
178
+
179
+ # === Validation ===
180
+ # TODO - Somewhere during training these get instantiated with bad
181
+ # values...
182
+ #self._validate()
183
+
184
+ #print(f" > SubEnc *Config.init: {make_shorthand(self)}\n")
185
+
186
+
187
+ def _validate(self):
188
+ # === Model ===
189
+ if self.num_dense_layers > self.num_hidden_layers:
190
+ raise ValueError("`num_dense_layers` must be <= `num_hidden_layers`")
191
+ if self.vocab_subspace and self.vocab_rank is None:
192
+ raise ValueError("`vocab_rank` must be set when `vocab_subspace=True`")
193
+
194
+ # === MLA Validation ===
195
+ # At least one of q_shared_dim or kv_shared_dim must be set if we have subspace layers
196
+ if self.num_dense_layers < self.num_hidden_layers and self.q_shared_dim is None and self.kv_shared_dim is None:
197
+ raise ValueError("At least one of q_shared_dim or kv_shared_dim must be set when there are subspace layers")
198
+
199
+ # Validate that private dimensions are set
200
+ if self.qk_private_dim is None or self.vo_private_dim is None:
201
+ raise ValueError("Must set qk_private_dim and vo_private_dim")
202
+ if self.nope_dims is None:
203
+ raise ValueError("Must set nope_dims")
204
+
205
+ # === Decomposed FFN ===
206
+ if self.ffn_decompose and self.ffn_rank is None:
207
+ raise ValueError("`ffn_rank` must be set when `ffn_decompose=True`")
208
+ if self.ffn_decompose and self.num_dense_layers >= self.num_hidden_layers:
209
+ raise ValueError("`ffn_decompose` was set but `num_dense` is >= number of layers")
210
+
211
+ # === Attention Backend ===
212
+ valid_backends = ["eager", "flash_attention_2", "sdpa"]
213
+ if self.attention_backend not in valid_backends:
214
+ raise ValueError(f"Unknown attention backend: {self.attention_backend}, options are {valid_backends}")
215
+
216
+ # === Norm Type ===
217
+ valid_norm_types = ["layernorm", "rmsnorm"]
218
+ if self.norm_type not in valid_norm_types:
219
+ raise ValueError(f"Unknown norm type: {self.norm_type}, options are {valid_norm_types}")
220
+
221
+
222
+ import json
223
+
224
+ def get_config(filename):
225
+
226
+ # Load the config file.
227
+ with open(filename) as f:
228
+ full_cfg = json.load(f)
229
+
230
+ # Strict key check on the model configuration.
231
+
232
+ # Get the list of keys allowed / required by `*Config`
233
+ valid_keys = SharedSpaceDecoderConfig.__init__.__code__.co_varnames
234
+ # Remove `self` and `kwargs`
235
+ valid_keys = set(valid_keys) - {"self", "kwargs"}
236
+
237
+ # Compare the set of keys in the json file vs `*Config`
238
+ extra_keys = set(full_cfg["model"]) - valid_keys
239
+ missing_keys = valid_keys - set(full_cfg["model"])
240
+
241
+ # If there any in the `json` that aren't in `*Config`,
242
+ if extra_keys:
243
+ # List them for the user.
244
+ raise ValueError(f"Unknown keys in config: {sorted(extra_keys)}")
245
+
246
+ # If the json config is missing required keys,
247
+ if missing_keys:
248
+ # List them for the user.
249
+ raise ValueError(f"config json is missing: {sorted(missing_keys)}")
250
+
251
+ # Will raise TypeError, by design, if required args are missing.
+ # The ** operator unpacks the dictionary into keyword arguments, as
+ # though all of the settings were written out individually.
254
+ model_cfg = SharedSpaceDecoderConfig(**full_cfg["model"])
255
+
256
+ return full_cfg, model_cfg
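+
+ # Example usage (a sketch; assumes a JSON file with a top-level "model"
+ # dict whose keys exactly match the SharedSpaceDecoderConfig arguments):
+ #
+ #   full_cfg, model_cfg = get_config("config.json")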
models/shared_space_decoder.py ADDED
@@ -0,0 +1,376 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """# shared_subspace_encoder.py"""
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+ from torch import nn
9
+
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.modeling_utils import PreTrainedModel
12
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
13
+
14
+ from ..layers.mla import MultiheadLatentAttention, RotaryEmbedding
15
+ from ..layers.feedforward import SubspaceFeedForward
16
+ from ..models.shared_space_config import SharedSpaceDecoderConfig
17
+
18
+ """
19
+ RMSNorm
20
+ From: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
21
+ """
22
+
23
+ class DeepseekV3RMSNorm(nn.Module):
24
+ def __init__(self, hidden_size, eps=1e-6):
25
+ """
26
+ DeepseekV3RMSNorm is equivalent to T5LayerNorm
27
+ """
28
+ super().__init__()
29
+ self.weight = nn.Parameter(torch.ones(hidden_size))
30
+ self.variance_epsilon = eps
31
+
32
+ def forward(self, hidden_states):
33
+ input_dtype = hidden_states.dtype
34
+ hidden_states = hidden_states.to(torch.float32)
35
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
36
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
37
+ return self.weight * hidden_states.to(input_dtype)
38
+
39
+ def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
40
+ """
41
+ Create a normalization layer based on the config norm_type.
42
+
43
+ Args:
44
+ hidden_size: The dimension to normalize over
45
+ config: Configuration containing norm_type and epsilon values
46
+
47
+ Returns:
48
+ Either a LayerNorm or RMSNorm layer
49
+ """
50
+ if config.norm_type == "layernorm":
51
+ return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
52
+ elif config.norm_type == "rmsnorm":
53
+ return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
54
+ else:
55
+ # This should be caught by config validation, but being defensive
56
+ raise ValueError(f"Unknown norm_type: {config.norm_type}")
57
+
58
+ """#### *PreTrainedModel"""
59
+
60
+ class SharedSpaceDecoderPreTrainedModel(PreTrainedModel):
61
+ """
62
+ The **PreTrainedModel object:
+ - Is the common base class for the decoder models in this file.
+ - Wires `SharedSpaceDecoderConfig` into the HuggingFace machinery via
+ `config_class` and `base_model_prefix`.
+ - Provides the `_init_weights` hook that `post_init()` and
+ `from_pretrained()` apply recursively to every submodule.
68
+ """
69
+
70
+ config_class = SharedSpaceDecoderConfig
71
+ base_model_prefix = "model"
72
+
73
+ def _init_weights(self, module: nn.Module) -> None:
74
+ """Weight initialization hook used by :class:`PreTrainedModel`.
75
+
76
+ ``PreTrainedModel.post_init`` will recursively apply this function to
77
+ every submodule right after construction. HuggingFace models override
78
+ it so that creating a model from scratch yields the same initialization
79
+ as ``from_pretrained`` when no checkpoint is supplied.
80
+
81
+ This decoder-specific initialization strategy includes:
82
+ - Proper handling of configurable normalization layers (LayerNorm or RMSNorm)
83
+ - Special initialization for language modeling heads
84
+ - Considerations for causal attention and autoregressive modeling
85
+ - Support for both dense and decomposed vocabulary embeddings
86
+ """
87
+
88
+ if isinstance(module, nn.Linear):
89
+ # Standard linear layer initialization
90
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
91
+ if module.bias is not None:
92
+ module.bias.data.zero_()
93
+
94
+ elif isinstance(module, nn.Embedding):
95
+ # Initialize embeddings with normal distribution
96
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
97
+ if module.padding_idx is not None:
98
+ module.weight.data[module.padding_idx].zero_()
99
+
100
+ elif isinstance(module, DeepseekV3RMSNorm):
101
+ # RMSNorm initialization: weight to 1.0, no bias term
102
+ module.weight.data.fill_(1.0)
103
+
104
+ elif isinstance(module, nn.LayerNorm):
105
+ # LayerNorm initialization: bias to 0, weight to 1.0
106
+ module.bias.data.zero_()
107
+ module.weight.data.fill_(1.0)
108
+
109
+
110
+ class SharedSpaceDecoderLayer(nn.Module):
111
+ """
112
+ The **Layer object:
113
+ - Is instantiated by :class:`SharedSpaceDecoderModel` for each
114
+ Transformer block in the decoder.
115
+ - Initializes:
116
+ - ``self_attn`` – multi-head latent attention implementing either
117
+ dense or latent projections depending on the configuration.
118
+ - ``ffn`` – a :class:`SubspaceFeedForward` block.
119
+ - RMSNorm layers for pre-attention and pre-FFN normalization.
120
+ - Provides access to the attention and feed-forward submodules via the
121
+ attributes ``self_attn`` and ``ffn``.
122
+ - Executes a single decoder block in :meth:`forward`.
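+
+ In pseudo-form, :meth:`forward` computes two pre-norm residual blocks:
+
+ x = x + self_attn(attn_input_norm(x))
+ x = x + ffn(ffn_input_norm(x))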
123
+ """
124
+
125
+ def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int) -> None:
126
+
127
+ super().__init__()
128
+
129
+ # Norm applied prior to attention.
130
+ self.attn_input_norm = create_norm_layer(config.hidden_size, config)
131
+
132
+ # Attention block
133
+ self.self_attn = MultiheadLatentAttention(config, layer_idx)
134
+
135
+ # Norm applied prior to FFN
136
+ self.ffn_input_norm = create_norm_layer(config.hidden_size, config)
137
+
138
+ # Feed-forward network used after attention
139
+ self.ffn = SubspaceFeedForward(config, layer_idx)
140
+
141
+ def forward(
142
+ self,
143
+ hidden_states: torch.Tensor,
144
+ position_embeddings: tuple[torch.Tensor, torch.Tensor], # RoPE embeddings
145
+ attention_mask: Optional[torch.Tensor],
146
+ ) -> torch.Tensor:
147
+
148
+ # ========================
149
+ # Self Attention
150
+ # ========================
151
+ residual_strm = hidden_states
152
+
153
+ # Normalize the hidden states to create the input to attention.
154
+ attn_input = self.attn_input_norm(hidden_states)
155
+
156
+ # Evaluate
157
+ attn_output = self.self_attn(
158
+ attn_input,
159
+ position_embeddings,
160
+ attention_mask,
161
+ )
162
+
163
+ # Add the attention output (the residual) back to the non-normalized
164
+ # hidden_states.
165
+ hidden_states = residual_strm + attn_output
166
+
167
+ # ===========================
168
+ # Feed-Forward Network
169
+ # ===========================
170
+ residual_strm = hidden_states
171
+
172
+ # Normalize the updated hidden states prior to the FFN
173
+ ffn_input = self.ffn_input_norm(hidden_states)
174
+
175
+ # Evaluate
176
+ ffn_output = self.ffn(ffn_input)
177
+
178
+ # Add the output to the un-normalized hidden states.
179
+ hidden_states = residual_strm + ffn_output
180
+
181
+ return hidden_states
182
+
183
+
184
+ class SharedSpaceDecoderModel(SharedSpaceDecoderPreTrainedModel):
185
+ """
186
+ The **Model object:
187
+ - Initializes:
188
+ - The vocabulary embeddings (and optional decomposition)
189
+ - Position embeddings (calculated in RotaryEmbedding)
190
+ - All of the **Layer objects.
191
+ - Provides interface to vocab embeddings.
192
+ - Executes the whole decoder model in `forward` with causal attention.
193
+
194
+ This is the base decoder without the language modeling head.
195
+ Use SharedSpaceDecoderForCausalLM for language modeling tasks.
196
+ """
197
+
198
+ def __init__(self, config: SharedSpaceDecoderConfig) -> None:
199
+ super().__init__(config)
200
+
201
+ # ============================
202
+ # Vocabulary Embeddings
203
+ # ============================
204
+ # Decomposing the vocabulary (if enabled) defines a shared projection
205
+ # which constrains the model to store semantic information (and
206
+ # whatever other static token knowledge) into a limited set of
207
+ # feature directions.
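+ #
+ # Rough illustration (assuming vocab_rank=64 with the default
+ # vocab_size=30522 and hidden_size=512): a dense embedding table costs
+ # 30522 * 512 ≈ 15.6M parameters, while the decomposed version costs
+ # 30522 * 64 + 64 * 512 ≈ 2.0M.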
208
+
209
+ # If we're decomposing the token embeddings,
211
+ if config.vocab_subspace:
212
+
213
+ # Create the embedding table. Vocabulary embeddings are learned
214
+ # in a lower dimensional latent space.
215
+ self.vocab_embed = nn.Embedding(
216
+ config.vocab_size, # Number of tokens
217
+ config.vocab_rank # Subspace dimension
218
+ )
219
+
220
+ # Create a projection layer.
+ # Selected token latents will be projected up to model size.
222
+ # vocab_proj has shape [vocab_rank x model_size]
223
+ self.vocab_proj = nn.Linear(
224
+ config.vocab_rank, # Size of latents
225
+ config.hidden_size, # Model size
226
+ bias=False
227
+ )
228
+
229
+ # Otherwise, for a dense vocabulary,
230
+ else:
231
+ # Create the dense embedding table in model space.
232
+ self.vocab_embed = nn.Embedding(
233
+ config.vocab_size, # Number of tokens
234
+ config.hidden_size # Model size
235
+ )
236
+
237
+ self.vocab_proj = None
238
+
239
+ # =====================
240
+ # RoPE Embeddings
241
+ # =====================
242
+
243
+ # Pre-computes the table of RoPE embeddings, leaving them in
244
+ # GPU memory.
245
+ self.rope = RotaryEmbedding(config)
246
+
247
+ # ===================
248
+ # Create Layers
249
+ # ===================
250
+
251
+ layers = []
252
+
253
+ # For each layer,
254
+ for i in range(config.num_hidden_layers):
255
+ # Create a **Layer, providing the config and indicating its number.
256
+ layers.append(
257
+ SharedSpaceDecoderLayer(
258
+ config,
259
+ layer_idx = i
260
+ )
261
+ )
262
+
263
+ # Wrap in torch ModuleList
264
+ self.layers = nn.ModuleList(layers)
265
+
266
+ # Run HuggingFace post-init (applies _init_weights recursively and ties weights if configured).
267
+ self.post_init()
268
+
269
+ # Agents: Do not define boilerplate helpers, e.g., get/set_input_embeddings
270
+
271
+
272
+ def embed(self, input_ids: torch.LongTensor) -> torch.Tensor:
273
+ """
274
+ Return token embeddings for input ids.
275
+ This will perform the up projection to model space if the vocabulary is
276
+ decomposed.
277
+
278
+ input_ids have shape [batch_size, seq_len]
279
+ """
280
+
281
+ # If the vocabulary is decomposed,
282
+ if self.vocab_proj is not None:
283
+
284
+ # Retrieve the latents
285
+ # input_ids: [batch_size, seq_len]
286
+ # x: [batch_size, seq_len, latent_dim]
287
+ x = self.vocab_embed(input_ids)
288
+
289
+ # Project the latents back to model space and return.
290
+ return self.vocab_proj(x)
291
+
292
+ # If the vocabulary is dense,
293
+ else:
294
+ # Just return the embeddings.
295
+ return self.vocab_embed(input_ids)
296
+
297
+ def forward(
298
+ self,
299
+ input_ids: torch.LongTensor,
300
+ attention_mask: Optional[torch.Tensor] = None,
301
+ **kwargs,
302
+ ) -> torch.Tensor:
303
+ """
304
+ Run the full decoder stack with causal attention.
305
+
306
+ Inputs:
307
+ input_ids [batch_size, seq_len]
308
+ attention_mask [batch_size, seq_len] - 1 for real tokens, 0 for padding
309
+
310
+ Returns:
311
+ Final decoder layer output [batch_size, seq_len, model_size]
312
+ """
313
+
314
+ # Retrieve the token embeddings for this sequence.
315
+ # These are model_size, regardless of whether the vocab is decomposed.
316
+ hidden_states = self.embed(input_ids)
317
+
318
+ # Retrieve the rotary position embeddings for all of the positions in
319
+ # our current input sequence.
320
+
321
+ seq_len = hidden_states.size(1)
322
+
323
+ # Retrieves just the ones necessary for the sequence length of the
324
+ # input. These are vectors, two per token. Their length is the
325
+ # number of head dimensions we're applying RoPE to.
326
+ # Input
327
+ # cos: [max_seq_len, rope_dims]
328
+ # sin: [max_seq_len, rope_dims]
329
+ # Outputs:
330
+ # R_cos [seq_len, rope_dims]
331
+ # R_sin [seq_len, rope_dims]
332
+ R_cos = self.rope.cos[:seq_len]
333
+ R_sin = self.rope.sin[:seq_len]
334
+
335
+
336
+ # ===============================
337
+ # Attention Mask Conversion
338
+ # ===============================
339
+
340
+ """
341
+ use_sdpa_attention_masks = (
342
+ self.attn_implementation == "sdpa"
343
+ and self.position_embedding_type == "absolute"
344
+ and head_mask is None
345
+ and not output_attentions
346
+ )
347
+ """
348
+
349
+ # Expand the attention mask
350
+ #if use_sdpa_attention_masks and attention_mask.dim() == 2:
351
+ if True:
352
+ # Expand the attention mask for SDPA.
353
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
354
+ extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
355
+ attention_mask,
356
+ hidden_states.dtype,
357
+ tgt_len = seq_len
358
+ )
359
+ attention_mask = extended_attention_mask
360
+
361
+
362
+ # Run the model!
363
+
364
+ # For each decoder layer,
365
+ for layer_i, layer in enumerate(self.layers):
366
+
367
+ # Evaluate the layer
368
+ hidden_states = layer(
369
+ hidden_states, # Token embeddings
370
+ (R_cos, R_sin), # Rope embeddings, passed as a tuple.
371
+ attention_mask, # Attn mask
372
+ )
373
+
374
+ # Return the final output of the decoder stack.
375
+ return hidden_states
376
+