Update architecture and tokenizer

Browse files

Files changed (4) hide show

README.md +1 -0
config.json +2 -1
configuration.py +6 -0
huggingface.py +26 -11

README.md CHANGED Viewed

@@ -95,6 +95,7 @@ contains no weights. All values are overridable via kwargs.
 | `tie_word_embeddings` | False |
 | `training_sequence_length` | 1024 |
 | `use_cache` | True |
 | `vocab_size` | 50277 |
 | `window_size` | 128 |

 | `tie_word_embeddings` | False |
 | `training_sequence_length` | 1024 |
 | `use_cache` | True |
+| `use_residual_gate` | True |
 | `vocab_size` | 50277 |
 | `window_size` | 128 |

config.json CHANGED Viewed

@@ -21,8 +21,9 @@
   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
-  "transformers_version": "5.12.0",
   "use_cache": true,
   "vocab_size": 50277,
   "window_size": 128
 }

   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
+  "transformers_version": "5.12.1",
   "use_cache": true,
+  "use_residual_gate": true,
   "vocab_size": 50277,
   "window_size": 128
 }

configuration.py CHANGED Viewed

@@ -79,6 +79,10 @@ class ShramConfig(PretrainedConfig):
         use_cache: Whether to return past_key_values for KV caching.
         output_hidden_states: Whether to return hidden states after each layer.
         tie_word_embeddings: Whether input embedding and LM head share weights.
     """
     model_type = "shram"
@@ -111,6 +115,7 @@ class ShramConfig(PretrainedConfig):
         use_cache: bool = True,
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -167,6 +172,7 @@ class ShramConfig(PretrainedConfig):
         self.beta = beta
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,

         use_cache: Whether to return past_key_values for KV caching.
         output_hidden_states: Whether to return hidden states after each layer.
         tie_word_embeddings: Whether input embedding and LM head share weights.
+        use_residual_gate: When True, each DecoderLayer scales its attention and MLP
+            residual contributions by separate learnable scalars (init: zero). When
+            False, both use a fixed ``1/√num_decoder_layers`` scale instead, which
+            preserves O(1) residual variance at depth with no learnable gate. Default True.
     """
     model_type = "shram"
         use_cache: bool = True,
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
+        use_residual_gate: bool = True,
         **kwargs
     ):
         if head_dim % 2 != 0:
         self.beta = beta
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
+        self.use_residual_gate = use_residual_gate
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,

huggingface.py CHANGED Viewed

@@ -165,6 +165,10 @@ class ShramConfig(PretrainedConfig):
         use_cache: Whether to return past_key_values for KV caching.
         output_hidden_states: Whether to return hidden states after each layer.
         tie_word_embeddings: Whether input embedding and LM head share weights.
     """
     model_type = "shram"
@@ -197,6 +201,7 @@ class ShramConfig(PretrainedConfig):
         use_cache: bool = True,
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -253,6 +258,7 @@ class ShramConfig(PretrainedConfig):
         self.beta = beta
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
@@ -1736,17 +1742,19 @@ gated residual connections around both sublayers:
     normed_attn = RMSNorm(x)
     attn_out, router_diagnostics = SHRAMHybridLayer(normed_attn, ...)
-    h = x + attn_residual_gate * attn_out
     normed_mlp = RMSNorm(h)
     mlp_out = SwiGLUMLP(normed_mlp)
-    out = h + mlp_residual_gate * mlp_out
-Two independent residual gate vectors (shape: embedding_width, init: near-zero) gate
-the attention and MLP sublayer contributions separately. At initialisation the layer is
-a pure identity. The gates are independent trainable parameters so gradients from the
-two sublayers never accumulate into a shared parameter, preventing norm explosion at
-depth.
 Pre-norm keeps the residual stream unnormalised. Gradients flow more cleanly
 through unnormalised residuals at depth, and each sublayer receives a stable,
@@ -1763,6 +1771,8 @@ subtraction, is faster than LayerNorm, and proved more stable at scale.
 # -----------
 # Inlined from: shram.py
 # -----------
@@ -3747,8 +3757,13 @@ class DecoderLayer(nn.Module):
         self.mlp_norm = nn.RMSNorm(config.embedding_width, eps=config.rms_norm_eps)
         self.attention = SHRAMHybridLayer(config)
         self.mlp = SwiGLUMLP(config)
-        self.attn_residual_gate = nn.Parameter(1e-6*torch.randn([config.embedding_width]))
-        self.mlp_residual_gate = nn.Parameter(1e-6*torch.randn([config.embedding_width]))
     def num_mosrah_parameters(self) -> int:
         """Return the total number of trainable MoSRAH parameters in this decoder layer."""
         # The count is delegated to the attention sublayer held in self.attention.
         return self.attention.num_mosrah_parameters()
@@ -3782,8 +3797,8 @@ class DecoderLayer(nn.Module):
             active_mask=active_mask,
             cache=cache,
         )
-        hidden_states = x + self.attn_residual_gate*attn_out
-        output = hidden_states + self.mlp_residual_gate*self.mlp(self.mlp_norm(hidden_states))
         return output, router_diagnostics

         use_cache: Whether to return past_key_values for KV caching.
         output_hidden_states: Whether to return hidden states after each layer.
         tie_word_embeddings: Whether input embedding and LM head share weights.
+        use_residual_gate: When True, each DecoderLayer scales its attention and MLP
+            residual contributions by separate learnable scalars (init: zero). When
+            False, both use a fixed ``1/√num_decoder_layers`` scale instead, which
+            preserves O(1) residual variance at depth with no learnable gate. Default True.
     """
     model_type = "shram"
         use_cache: bool = True,
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
+        use_residual_gate: bool = True,
         **kwargs
     ):
         if head_dim % 2 != 0:
         self.beta = beta
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
+        self.use_residual_gate = use_residual_gate
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
     normed_attn = RMSNorm(x)
     attn_out, router_diagnostics = SHRAMHybridLayer(normed_attn, ...)
+    h = x + attn_residual_scale * attn_out
     normed_mlp = RMSNorm(h)
     mlp_out = SwiGLUMLP(normed_mlp)
+    out = h + mlp_residual_scale * mlp_out
+``attn_residual_scale`` and ``mlp_residual_scale`` are always present. Their nature
+depends on ``config.use_residual_gate``:
+- ``True`` (default): learnable scalar ``nn.Parameter`` initialised to zero. The layer
+  is a pure identity at initialisation and the scales open during training.
+- ``False``: fixed buffer ``1/√num_decoder_layers``. No learnable parameter; residual
+  variance sums to O(1) across depth by construction.
 Pre-norm keeps the residual stream unnormalised. Gradients flow more cleanly
 through unnormalised residuals at depth, and each sublayer receives a stable,
 # -----------
 # Inlined from: shram.py
 # -----------
         self.mlp_norm = nn.RMSNorm(config.embedding_width, eps=config.rms_norm_eps)
         self.attention = SHRAMHybridLayer(config)
         self.mlp = SwiGLUMLP(config)
+        scale = 1.0 / math.sqrt(config.num_decoder_layers)
+        if config.use_residual_gate:
+            self.attn_residual_scale = nn.Parameter(torch.zeros(1))
+            self.mlp_residual_scale = nn.Parameter(torch.zeros(1))
+        else:
+            self.register_buffer("attn_residual_scale", torch.full((1,), scale))
+            self.register_buffer("mlp_residual_scale", torch.full((1,), scale))
     def num_mosrah_parameters(self) -> int:
         """Return the total number of trainable MoSRAH parameters in this decoder layer."""
         # The count is delegated to the attention sublayer held in self.attention.
         return self.attention.num_mosrah_parameters()
             active_mask=active_mask,
             cache=cache,
         )
+        hidden_states = x + self.attn_residual_scale * attn_out
+        output = hidden_states + self.mlp_residual_scale * self.mlp(self.mlp_norm(hidden_states))
         return output, router_diagnostics