Fixed architecture name

models/shared_space_config.py  CHANGED  (+256 -256)

@@ -1,256 +1,256 @@
from typing import Optional

import torch
from torch import nn

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel

class SharedSpaceDecoderConfig(PretrainedConfig):
    r"""
    Configuration class for the shared-space decoder model.

    Extends the HuggingFace `PretrainedConfig` to support architectural
    variations including:
    - Multi-Head Latent Attention (MLA)
    - Decomposed MLPs (low-rank FFNs)
    - Flexible attention backends (eager, flash, sdpa)
    - Explicit shared subspaces for Q, K, V, and O projections

    This config does not infer any defaults based on `hidden_size`. All
    dimensions and ranks must be explicitly specified. If required values are
    missing, a `ValueError` is raised during initialization.

    ----------------------
    Core Model Parameters:
    ----------------------
    - vocab_size (`int`) – Vocabulary size.
    - hidden_size (`int`) – Model hidden dimension.
    - num_hidden_layers (`int`) – Number of transformer blocks.
    - intermediate_size (`int`) – Feed-forward hidden dimension.
    - hidden_act (`str`) – Activation function.
    - hidden_dropout_prob (`float`) – Dropout after projections and FFNs.
    - attention_dropout_prob (`float`) – Dropout applied to attention scores.
    - max_position_embeddings (`int`) – Max sequence length.
    - initializer_range (`float`) – Stddev of weight init.

    - layer_norm_eps (`float`) – Epsilon for LayerNorm.
    - rms_norm_eps (`float`) – Epsilon for RMSNorm.

    - classifier_dropout (`float` or None) – Dropout for final classifier.

    - vocab_subspace (`bool`) – Whether to use a shared low-rank subspace for the vocabulary embedding.
    - vocab_rank (`int` or None) – Rank of the vocabulary subspace (required if `vocab_subspace=True`).

    ----------------------------------
    Multi-Head Latent Attention (MLA):
    ----------------------------------
    - num_attention_heads (`int`) – Number of attention heads.

    - q_shared_dim (`int`) – Rank of the shared query subspace.
    - kv_shared_dim (`int`) – Rank of the shared key/value subspace.

    - output_subspace (`bool`) – Whether to use a shared latent subspace for output projections.
    - o_shared_dim (`int`) – Rank of the shared output subspace (required if `output_subspace=True`).
    - qk_private_dim (`int`) – Query/key private dimension per head.
    - vo_private_dim (`int`) – Value/output private dimension per head.

    - rope_dims (`int`) – Number of head dimensions carrying RoPE.
    - nope_dims (`int`) – Non-positional encoding dimensions.
    - rope_theta (`float`) – Base frequency used for RoPE.
    - rope_scaling (`dict` or None) – HF-style scaling dict for RoPE.
    - attention_bias (`bool`) – Whether to include bias terms in Q/K/V projections.
    - num_dense_layers (`int`) – Number of leading layers that do not use
      subspaces for attention or FFNs.
    - attention_backend (`str`) – Must be one of `"eager"`, `"flash_attention_2"`, or `"sdpa"`.

    ------------------------------
    Decomposed MLP (Low-Rank FFN):
    ------------------------------
    - ffn_decompose (`bool`) – Whether to enable low-rank FFNs.
    - ffn_rank (`int`) – Rank of the shared FFN latent space (required if `ffn_decompose=True`).

    --------------------
    Validation Behavior:
    --------------------
    Raises `ValueError` at init time if:
    - FFN decomposition is enabled without specifying `ffn_rank`.
    - An unknown `attention_backend` is provided.
    """

    model_type = "shared_space_decoder"

    def __init__(
        self,

        # === Core Model ===
        vocab_size: int = 30522,
        hidden_size: int = 512,
        num_hidden_layers: int = 12,

        intermediate_size: int = 3072,

        hidden_dropout_prob=0.1,
        attention_dropout_prob=0.1,
        max_position_embeddings: int = 2048,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        rms_norm_eps=1e-6,  # Their default, but confirm in config.
        norm_type="layernorm",  # Choice between "layernorm" and "rmsnorm"
        classifier_dropout=None,

        vocab_subspace=False,
        vocab_rank=None,
        tie_word_embeddings=True,

        # === Multi-Head Latent Attention ===
        num_attention_heads: int = 16,
        rope_dims: int = 16,

        q_shared_dim: Optional[int] = None,
        kv_shared_dim: Optional[int] = None,

        o_shared_dim: Optional[int] = None,  # If None, no output subspace is used

        # Private head dimensions
        qk_private_dim: Optional[int] = None,  # Query/key private dimension per head
        vo_private_dim: Optional[int] = None,  # Value/output private dimension per head
        nope_dims: Optional[int] = None,  # Non-positional encoding dimensions

        attention_backend="eager",
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,

        # === MLA Composition ===
        num_dense_layers=12,  # dense MHA layers before MLA starts

        # === Decomposed MLP ===
        ffn_decompose=False,
        ffn_rank=None,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)

        # === Core Model ===
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_dropout_prob = attention_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.rms_norm_eps = rms_norm_eps
        self.norm_type = norm_type
        self.classifier_dropout = classifier_dropout

        self.vocab_subspace = vocab_subspace
        self.vocab_rank = vocab_rank
        self.tie_word_embeddings = tie_word_embeddings

        # === MLA ===
        self.num_attention_heads = num_attention_heads
        self.rope_dims = rope_dims

        self.q_shared_dim = q_shared_dim
        self.kv_shared_dim = kv_shared_dim
        self.o_shared_dim = o_shared_dim

        # Private head dimensions
        self.qk_private_dim = qk_private_dim
        self.vo_private_dim = vo_private_dim
        self.nope_dims = nope_dims
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.num_dense_layers = num_dense_layers

        # === Decomposed FFN ===
        self.ffn_decompose = ffn_decompose
        self.ffn_rank = ffn_rank

        # === Attention backend ===
        self.attention_backend = attention_backend

        # === Validation ===
        # TODO - Somewhere during training these get instantiated with bad
        # values...
        #self._validate()

        #print(f"  > SubEnc *Config.init: {make_shorthand(self)}\n")

    def _validate(self):
        # === Model ===
        if self.num_dense_layers > self.num_hidden_layers:
            raise ValueError("`num_dense_layers` must be <= `num_hidden_layers`")
        if self.vocab_subspace and self.vocab_rank is None:
            raise ValueError("`vocab_rank` must be set when `vocab_subspace=True`")

        # === MLA Validation ===
        # At least one of q_shared_dim or kv_shared_dim must be set if there are subspace layers.
        if self.num_dense_layers < self.num_hidden_layers and self.q_shared_dim is None and self.kv_shared_dim is None:
            raise ValueError("At least one of q_shared_dim or kv_shared_dim must be set when there are subspace layers")

        # Validate that private dimensions are set.
        if self.qk_private_dim is None or self.vo_private_dim is None:
            raise ValueError("Must set qk_private_dim and vo_private_dim")
        if self.nope_dims is None:
            raise ValueError("Must set nope_dims")

        # === Decomposed FFN ===
        if self.ffn_decompose and self.ffn_rank is None:
            raise ValueError("`ffn_rank` must be set when `ffn_decompose=True`")
        if self.ffn_decompose and self.num_dense_layers >= self.num_hidden_layers:
            raise ValueError("`ffn_decompose` was set but `num_dense_layers` is >= number of layers")

        # === Attention Backend ===
        valid_backends = ["eager", "flash_attention_2", "sdpa"]
        if self.attention_backend not in valid_backends:
            raise ValueError(f"Unknown attention backend: {self.attention_backend}, options are {valid_backends}")

        # === Norm Type ===
        valid_norm_types = ["layernorm", "rmsnorm"]
        if self.norm_type not in valid_norm_types:
            raise ValueError(f"Unknown norm type: {self.norm_type}, options are {valid_norm_types}")

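For orientation, here is a minimal usage sketch (not part of the module): it instantiates the config with explicitly chosen subspace ranks and then runs the validation pass by hand, since `__init__` currently leaves `_validate()` commented out. All dimension values below are illustrative placeholders, not project defaults.

# Illustrative sketch only; parameter values are arbitrary examples.
example_cfg = SharedSpaceDecoderConfig(
    vocab_size=32000,
    hidden_size=512,
    num_hidden_layers=12,
    num_dense_layers=2,        # first two layers use dense MHA, the rest use shared subspaces
    num_attention_heads=16,
    q_shared_dim=256,          # rank of the shared query subspace
    kv_shared_dim=256,         # rank of the shared key/value subspace
    o_shared_dim=256,          # shared output subspace (None would disable it)
    qk_private_dim=32,         # per-head query/key dimension
    vo_private_dim=32,         # per-head value/output dimension
    rope_dims=16,              # head dimensions carrying RoPE
    nope_dims=16,              # head dimensions without positional encoding
    norm_type="rmsnorm",
    attention_backend="sdpa",
    ffn_decompose=True,
    ffn_rank=768,
)
example_cfg._validate()        # explicit check; raises ValueError on inconsistent settings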

import json

def get_config(filename):

    # Load the config file.
    with open(filename) as f:
        full_cfg = json.load(f)

    # Strict key check on the model configuration.

    # Get the list of keys allowed / required by `*Config`.
    valid_keys = SharedSpaceDecoderConfig.__init__.__code__.co_varnames
    # Remove `self` and `kwargs`.
    valid_keys = set(valid_keys) - {"self", "kwargs"}

    # Compare the set of keys in the json file vs `*Config`.
    extra_keys = set(full_cfg["model"]) - valid_keys
    missing_keys = valid_keys - set(full_cfg["model"])

    # If there are any keys in the json that aren't in `*Config`,
    if extra_keys:
        # list them for the user.
        raise ValueError(f"Unknown keys in config: {sorted(extra_keys)}")

    # If the json config is missing required keys,
    if missing_keys:
        # list them for the user.
        raise ValueError(f"config json is missing: {sorted(missing_keys)}")

    # Will raise TypeError, by design, if required args are missing.
    # The asterisks unpack the dictionary into keyword arguments, as though
    # all of the settings were written out individually.
    model_cfg = SharedSpaceDecoderConfig(**full_cfg["model"])

    return full_cfg, model_cfg
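And a sketch of how `get_config` might be driven, again illustrative rather than canonical: because the key check is strict in both directions, the "model" section of the JSON must contain exactly the constructor's arguments, so this sketch fills them from the signature defaults and then overrides the ranks that default to None. The file name, the override values, and the "training" section are hypothetical.

# Illustrative sketch; "example_config.json" and the "training" section are hypothetical.
import inspect
import json

# Collect every __init__ argument (minus `self` and `**kwargs`) with its default,
# so the strict missing-key check in `get_config` is satisfied.
model_section = {
    name: param.default
    for name, param in inspect.signature(SharedSpaceDecoderConfig.__init__).parameters.items()
    if name not in ("self", "kwargs")
}

# Override the ranks/dims that default to None (arbitrary example values).
model_section.update({
    "q_shared_dim": 256,
    "kv_shared_dim": 256,
    "qk_private_dim": 32,
    "vo_private_dim": 32,
    "nope_dims": 16,
    "num_dense_layers": 2,
})

with open("example_config.json", "w") as f:
    json.dump({"model": model_section, "training": {"learning_rate": 5e-4}}, f, indent=2)

full_cfg, model_cfg = get_config("example_config.json")
print(model_cfg.kv_shared_dim)  # 256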