coolpoodle
/

Qwen3-0.6B-Looped

@@ -6,6 +6,7 @@ from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention, apply_rotar
 class Qwen3LoopConfig:
     def __init__(self, base_config, loop_window_size=64):
         self.base_config = base_config
         self.loop_window_size = loop_window_size
@@ -13,6 +14,7 @@ class Qwen3LoopConfig:
     def __getattr__(self, name):
         return getattr(self.base_config, name)
 class LoopGate(nn.Module):
     def __init__(self, num_heads, head_dim):
@@ -20,8 +22,10 @@ class LoopGate(nn.Module):
         # Initialize weights to near-zero random noise to break symmetry
         self.weight = nn.Parameter(torch.randn(num_heads, head_dim) * 0.01)
-        # Initialize bias to +5.0, this is important for anyone tryna implement this cross-architecture, dont forget this.
         # Sigmoid(5.0) ≈ 0.993
         self.bias = nn.Parameter(torch.full((num_heads,), 5.0))
     def forward(self, query_states):
@@ -31,7 +35,8 @@ class LoopGate(nn.Module):
-# Loop Attention
 class Qwen3LoopAttention(nn.Module):
     def __init__(self, original_attn: Qwen3Attention, loop_window_size: int = 64):
         super().__init__()
@@ -73,6 +78,7 @@ class Qwen3LoopAttention(nn.Module):
                 cache_position=None, **kwargs):
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
@@ -97,7 +103,6 @@ class Qwen3LoopAttention(nn.Module):
         key_states_rpt = repeat_kv(key_states, self.num_key_value_groups)
         value_states_rpt = repeat_kv(value_states, self.num_key_value_groups)
         if self._loop_mode == 1:
             # Loop 1: Capture Global Context
             self._global_k = key_states_rpt.detach()
@@ -112,13 +117,12 @@ class Qwen3LoopAttention(nn.Module):
             # Loop 2: Mixed Attention
             g = self.gate(query_states)
-            # Global (from cache)
             attn_global = F.scaled_dot_product_attention(
                 query_states, self._global_k, self._global_v,
                 attn_mask=attention_mask, is_causal=self.is_causal and attention_mask is None
             )
-            # Local (Windowed)
             ids_q = torch.arange(q_len, device=query_states.device).unsqueeze(1)
             ids_k = torch.arange(key_states.shape[2], device=query_states.device).unsqueeze(0)
             mask_window = (ids_k <= ids_q) & (ids_k > (ids_q - self.loop_window_size))
@@ -137,7 +141,7 @@ class Qwen3LoopAttention(nn.Module):
                 attn_mask=local_mask, is_causal=False
             )
-            # Mixing: If Bias=5.0, g ~ 1.0, so result is mostly global
             attn_output = g * attn_global + (1.0 - g) * attn_local
         else:
@@ -183,7 +187,9 @@ class Qwen3LoopForCausalLM(nn.Module):
                 use_cache=None, output_attentions=None, output_hidden_states=None,
                 return_dict=None, cache_position=None, **kwargs):
         if use_cache or (use_cache is None and self.config.use_cache and not self.training):
             for layer in self.model.layers:
                 layer.self_attn._loop_mode = 0
             return self._forward_standard(
@@ -201,6 +207,7 @@ class Qwen3LoopForCausalLM(nn.Module):
                 **kwargs
             )
         for layer in self.model.layers:
             layer.self_attn._loop_mode = 1
         with torch.no_grad():
@@ -214,6 +221,7 @@ class Qwen3LoopForCausalLM(nn.Module):
                 **kwargs
             )
         for layer in self.model.layers:
             layer.self_attn._loop_mode = 2
         outputs = self._forward_standard(
@@ -230,6 +238,7 @@ class Qwen3LoopForCausalLM(nn.Module):
             **kwargs
         )
         for layer in self.model.layers:
             layer.self_attn._loop_mode = 0
             layer.self_attn._global_k = None
@@ -287,6 +296,7 @@ class Qwen3LoopForCausalLM(nn.Module):
     def generate(self, input_ids=None, **kwargs):
         """Generate text - always uses standard attention."""
         for layer in self.model.layers:
             layer.self_attn._loop_mode = 0
             layer.self_attn._global_k = None
@@ -338,7 +348,8 @@ class Qwen3LoopForCausalLM(nn.Module):
     def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                        attention_mask=None, inputs_embeds=None,
                                        cache_position=None, **kwargs):
-        """Prepare inputs for generation step."""
         if past_key_values is not None:
             if inputs_embeds is not None:
                 input_ids = input_ids[:, -cache_position.shape[0]:]
@@ -372,9 +383,42 @@ class Qwen3LoopForCausalLM(nn.Module):
         total = sum(p.numel() for p in self.parameters())
         print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.4f}%)")
     def get_gate_parameters(self):
-        """Return list of gate parameters for optimizer."""
         params = []
         for layer in self.model.layers:
             params.extend(layer.self_attn.gate.parameters())
         return params

 class Qwen3LoopConfig:
     def __init__(self, base_config, loop_window_size=64):
         self.base_config = base_config
         self.loop_window_size = loop_window_size
     def __getattr__(self, name):
         return getattr(self.base_config, name)
+# Learned Gate (With Fix for Init Shock)
 class LoopGate(nn.Module):
     def __init__(self, num_heads, head_dim):
         # Initialize weights to near-zero random noise to break symmetry
         self.weight = nn.Parameter(torch.randn(num_heads, head_dim) * 0.01)
+        # Initialize bias to +5.0
         # Sigmoid(5.0) ≈ 0.993
+        # This means the model starts with 99.3% Global Attention (Standard Qwen)
+        # and only 0.7% Local Attention. This prevents "garbage" output at step 0.
         self.bias = nn.Parameter(torch.full((num_heads,), 5.0))
     def forward(self, query_states):
+# Loop Attention Layer
 class Qwen3LoopAttention(nn.Module):
     def __init__(self, original_attn: Qwen3Attention, loop_window_size: int = 64):
         super().__init__()
                 cache_position=None, **kwargs):
         bsz, q_len, _ = hidden_states.size()
+        # Standard Projections
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
         key_states_rpt = repeat_kv(key_states, self.num_key_value_groups)
         value_states_rpt = repeat_kv(value_states, self.num_key_value_groups)
         if self._loop_mode == 1:
             # Loop 1: Capture Global Context
             self._global_k = key_states_rpt.detach()
             # Loop 2: Mixed Attention
             g = self.gate(query_states)
             attn_global = F.scaled_dot_product_attention(
                 query_states, self._global_k, self._global_v,
                 attn_mask=attention_mask, is_causal=self.is_causal and attention_mask is None
             )
             ids_q = torch.arange(q_len, device=query_states.device).unsqueeze(1)
             ids_k = torch.arange(key_states.shape[2], device=query_states.device).unsqueeze(0)
             mask_window = (ids_k <= ids_q) & (ids_k > (ids_q - self.loop_window_size))
                 attn_mask=local_mask, is_causal=False
             )
+            # Mixing: If Bias=5.0, g ~ 1.0, so result is mostly Global (Standard)
             attn_output = g * attn_global + (1.0 - g) * attn_local
         else:
                 use_cache=None, output_attentions=None, output_hidden_states=None,
                 return_dict=None, cache_position=None, **kwargs):
+        # If caching is requested (use_cache=True, or use_cache unset while config.use_cache is on and we are not training), we disable the loop logic.
         if use_cache or (use_cache is None and self.config.use_cache and not self.training):
+            # Standard forward - bypass loop logic
             for layer in self.model.layers:
                 layer.self_attn._loop_mode = 0
             return self._forward_standard(
                 **kwargs
             )
+        # Loop 1: Capture Global
         for layer in self.model.layers:
             layer.self_attn._loop_mode = 1
         with torch.no_grad():
                 **kwargs
             )
+        # Loop 2: Mix
         for layer in self.model.layers:
             layer.self_attn._loop_mode = 2
         outputs = self._forward_standard(
             **kwargs
         )
+        # Cleanup
         for layer in self.model.layers:
             layer.self_attn._loop_mode = 0
             layer.self_attn._global_k = None
     def generate(self, input_ids=None, **kwargs):
         """Generate text - always uses standard attention."""
+        # Ensure we use standard mode for generation
         for layer in self.model.layers:
             layer.self_attn._loop_mode = 0
             layer.self_attn._global_k = None
     def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                        attention_mask=None, inputs_embeds=None,
                                        cache_position=None, **kwargs):
+        # If we have past key values, keep only the trailing input_ids (the last cache_position.shape[0] positions)
         if past_key_values is not None:
             if inputs_embeds is not None:
                 input_ids = input_ids[:, -cache_position.shape[0]:]
         total = sum(p.numel() for p in self.parameters())
         print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.4f}%)")
+    def enable_gate_and_layernorm_training(self):
+        self.requires_grad_(False)
+        # Unfreeze gates
+        for layer in self.model.layers:
+            layer.self_attn.gate.requires_grad_(True)
+            # Unfreeze layer norms
+            layer.input_layernorm.requires_grad_(True)
+            layer.post_attention_layernorm.requires_grad_(True)
+            # Unfreeze Q/K norms in attention
+            layer.self_attn.q_norm.requires_grad_(True)
+            layer.self_attn.k_norm.requires_grad_(True)
+        # Unfreeze final layer norm
+        self.model.norm.requires_grad_(True)
+        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        total = sum(p.numel() for p in self.parameters())
+        print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.4f}%)")
     def get_gate_parameters(self):
         params = []
         for layer in self.model.layers:
             params.extend(layer.self_attn.gate.parameters())
         return params
+    def get_trainable_parameters(self):
+        return [p for p in self.parameters() if p.requires_grad]
+    def save_pretrained(self, save_directory):
+        """Save the model weights and configuration."""
+        import os
+        os.makedirs(save_directory, exist_ok=True)
+        # Save config / added .bin compatibility
+        self.config.save_pretrained(save_directory)
+        torch.save(self.state_dict(), os.path.join(save_directory, "qwen3looped.bin"))
+        print(f"Model saved to {save_directory}")