Update model weights after training (epoch 5, loss 6.9589)

Browse files

Files changed (10) hide show

audio_decoder.safetensors +1 -1
config.json +2 -2
configuration_xoron.py +6 -0
cross_attention.safetensors +1 -1
llm.safetensors +2 -2
model.safetensors.index.json +7 -1
modeling_xoron.py +456 -136
streaming_state.json +21 -17
trainer_state.json +10 -10
training_state.pt +2 -2

audio_decoder.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0644bb8cb74a2a1d0e055138e41ec52d65d83dca9bc9466cbdd8f388f1aa96b2
 size 1458410612

 version https://git-lfs.github.com/spec/v1
+oid sha256:c225077ec0e29909d0f390011f666158ae658fa3385cf8032280f5203da09cae
 size 1458410612

config.json CHANGED Viewed

@@ -49,10 +49,10 @@
   "image_size_step": 32,
   "video_min_size": 128,
   "video_max_size": 320,
-  "video_base_size": 192,
   "video_size_step": 32,
   "video_min_frames": 8,
-  "video_max_frames": 24,
   "video_base_frames": 16,
   "video_frame_step": 4,
   "multi_scale_strategy": "adaptive",

   "image_size_step": 32,
   "video_min_size": 128,
   "video_max_size": 320,
+  "video_base_size": 320,
   "video_size_step": 32,
   "video_min_frames": 8,
+  "video_max_frames": 8,
   "video_base_frames": 16,
   "video_frame_step": 4,
   "multi_scale_strategy": "adaptive",

configuration_xoron.py CHANGED Viewed

@@ -213,11 +213,17 @@ class XoronConfig(PreTrainedConfig):
         # Output path (used during training)
         output_dir: str = "./xoron-model",
         **kwargs,
     ):
         # Call parent init
         super().__init__(**kwargs)
         # Model identification
         self.model_name = model_name

         # Output path (used during training)
         output_dir: str = "./xoron-model",
+        # Training Configuration
+        modality_dropout_prob: float = 0.0,
         **kwargs,
     ):
         # Call parent init
         super().__init__(**kwargs)
+        # Training Configuration
+        self.modality_dropout_prob = modality_dropout_prob
         # Model identification
         self.model_name = model_name

cross_attention.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:10a70bf7bf4edce737146b199b106166957aa843440edfc45831f1d6033b7e11
 size 174191400

 version https://git-lfs.github.com/spec/v1
+oid sha256:c5dc29d69984df0e49cf508c56c03b7a18a7a49baf89a414fa3128513d753e7e
 size 174191400

llm.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b78daf2a6be38a3c0753175dd705363f8a348dc24b7d7a6fb9539715c530f22e
-size 1506831304

 version https://git-lfs.github.com/spec/v1
+oid sha256:5de86313a868d4108f814a3debd9d1ed31dc72281458ef9c7824b9a4398ce28f
+size 1506832040

model.safetensors.index.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 7309365038,
     "format": "components"
   },
   "weight_map": {
@@ -36,6 +36,7 @@
     "llm.model.layers.1.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.1.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.1.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
@@ -150,6 +151,7 @@
     "llm.model.layers.3.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.3.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.3.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
@@ -264,6 +266,7 @@
     "llm.model.layers.5.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.5.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.5.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
@@ -378,6 +381,7 @@
     "llm.model.layers.7.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.7.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.7.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
@@ -492,6 +496,7 @@
     "llm.model.layers.9.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.9.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.9.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
@@ -606,6 +611,7 @@
     "llm.model.layers.11.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.11.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.11.post_attention_layernorm.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",

 {
   "metadata": {
+    "total_size": 7309365134,
     "format": "components"
   },
   "weight_map": {
     "llm.model.layers.1.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.1.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.1.post_attention_layernorm.weight": "llm.safetensors",
+    "llm.model.layers.1.mlp.router.expert_bias": "llm.safetensors",
     "llm.model.layers.1.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.1.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.3.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.3.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.3.post_attention_layernorm.weight": "llm.safetensors",
+    "llm.model.layers.3.mlp.router.expert_bias": "llm.safetensors",
     "llm.model.layers.3.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.3.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.5.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.5.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.5.post_attention_layernorm.weight": "llm.safetensors",
+    "llm.model.layers.5.mlp.router.expert_bias": "llm.safetensors",
     "llm.model.layers.5.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.5.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.7.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.7.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.7.post_attention_layernorm.weight": "llm.safetensors",
+    "llm.model.layers.7.mlp.router.expert_bias": "llm.safetensors",
     "llm.model.layers.7.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.7.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.9.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.9.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.9.post_attention_layernorm.weight": "llm.safetensors",
+    "llm.model.layers.9.mlp.router.expert_bias": "llm.safetensors",
     "llm.model.layers.9.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.9.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",
     "llm.model.layers.11.self_attn.o_proj.linear.weight": "llm.safetensors",
     "llm.model.layers.11.input_layernorm.weight": "llm.safetensors",
     "llm.model.layers.11.post_attention_layernorm.weight": "llm.safetensors",
+    "llm.model.layers.11.mlp.router.expert_bias": "llm.safetensors",
     "llm.model.layers.11.mlp.router.input_norm.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.router.gate.weight": "llm.safetensors",
     "llm.model.layers.11.mlp.experts.0.gate_proj.lora_A": "llm.safetensors",

modeling_xoron.py CHANGED Viewed

@@ -436,18 +436,25 @@ def compute_qk_scale(head_dim: int) -> float:
     return head_dim ** -0.25
-@dataclass
 class AttentionKVCache:
-    """
-    KV Cache for efficient autoregressive attention.
-    Features:
-    - Memory-efficient storage
-    - Support for cross-attention caching
     """
-    key_cache: torch.Tensor = None
-    value_cache: torch.Tensor = None
-    seen_tokens: int = 0
     def update(
         self,
@@ -455,36 +462,45 @@ class AttentionKVCache:
         value_states: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
-        Update cache with new key/value states.
         Args:
             key_states: New key states [batch, num_heads, seq_len, head_dim]
             value_states: New value states [batch, num_heads, seq_len, head_dim]
         Returns:
-            Updated key and value states including cache
         """
-        if self.key_cache is None:
-            self.key_cache = key_states
-            self.value_cache = value_states
-        else:
-            self.key_cache = torch.cat([self.key_cache, key_states], dim=2)
-            self.value_cache = torch.cat([self.value_cache, value_states], dim=2)
-        self.seen_tokens += key_states.shape[2]
-        return self.key_cache, self.value_cache
     def get_seq_length(self) -> int:
         """Get current sequence length in cache."""
-        if self.key_cache is None:
-            return 0
-        return self.key_cache.shape[2]
     def reset(self):
-        """Reset the cache."""
-        self.key_cache = None
-        self.value_cache = None
         self.seen_tokens = 0
@@ -1447,12 +1463,12 @@ class PerceiverAttention(nn.Module):
             sin = sin.unsqueeze(0).unsqueeze(0)
             k = apply_rope(k, cos, sin)
-        # Attention
-        attn = torch.matmul(q, k.transpose(-1, -2)) * self.scale
-        attn = torch.clamp(attn, min=-11.0, max=11.0)
-        attn = attn.softmax(dim=-1)
-        out = torch.matmul(attn, v)
         out = out.transpose(1, 2).reshape(b, n, self.inner_dim)
         return self.to_out(out)
@@ -1848,6 +1864,171 @@ class MultimodalProjector(nn.Module):
 EPS = 1e-5
 class MoERouter(nn.Module):
     """
     SOTA Router for Mixture of Experts v2.0 - FP16 native.
@@ -2036,6 +2217,10 @@ class MoELayer(nn.Module):
         top_k_probs, top_k_indices, router_logits = self.router(hidden_states)
         final_output = torch.zeros_like(hidden_flat)
         for expert_idx in range(self.num_experts):
@@ -4726,21 +4911,23 @@ class RotaryMultiHeadLatentAttention(nn.Module):
         present_key_value = (key, value) if use_cache else None
-        # Expand KV for grouped query attention
-        if self.num_key_value_groups > 1:
-            key = key.repeat_interleave(self.num_key_value_groups, dim=1)
-            value = value.repeat_interleave(self.num_key_value_groups, dim=1)
-        # Attention computation
-        attn_weights = torch.matmul(query, key.transpose(-1, -2)) * self.scale
-        if attention_mask is not None:
-            attn_weights = attn_weights + attention_mask
-        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-        attn_weights = self.dropout(attn_weights)
-        output = torch.matmul(attn_weights, value)
         output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
         output = self.o_proj(output)
@@ -6719,6 +6906,7 @@ class DualStreamSelfAttention(nn.Module):
     """
     Symmetric Dual-Stream Self-Attention (SD3/Flux-style).
     Two parallel streams with cross-stream information exchange.
     """
     def __init__(self, hidden_size: int, num_heads: int = 8, max_height: int = 64, max_width: int = 64):
@@ -6727,6 +6915,8 @@ class DualStreamSelfAttention(nn.Module):
         self.num_heads = num_heads
         self.head_dim = hidden_size // num_heads
         self.scale = self.head_dim ** -0.5
         self.to_qkv_a = nn.Linear(hidden_size, hidden_size * 3, bias=False)
         self.to_qkv_b = nn.Linear(hidden_size, hidden_size * 3, bias=False)
@@ -6752,8 +6942,6 @@ class DualStreamSelfAttention(nn.Module):
         q_b, k_b, v_b = qkv_b.unbind(dim=2)
         cos, sin = self.rope_2d(x_a, height, width)
-        # cos/sin shape: [seq_len, head_dim] -> [1, 1, seq_len, head_dim]
-        # to broadcast with q/k shape: [B, num_heads, seq_len, head_dim]
         cos = cos.unsqueeze(0).unsqueeze(1)
         sin = sin.unsqueeze(0).unsqueeze(1)
@@ -6772,13 +6960,15 @@ class DualStreamSelfAttention(nn.Module):
         k_combined = torch.cat([k_a, k_b], dim=2)
         v_combined = torch.cat([v_a, v_b], dim=2)
-        attn_a = torch.matmul(q_a, k_combined.transpose(-1, -2)) * self.scale
-        attn_a = F.softmax(attn_a, dim=-1, dtype=torch.float32).to(x_a.dtype)
-        out_a = torch.matmul(attn_a, v_combined)
-        attn_b = torch.matmul(q_b, k_combined.transpose(-1, -2)) * self.scale
-        attn_b = F.softmax(attn_b, dim=-1, dtype=torch.float32).to(x_b.dtype)
-        out_b = torch.matmul(attn_b, v_combined)
         out_a = out_a.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_size)
         out_b = out_b.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_size)
@@ -6815,10 +7005,12 @@ class CrossAttention(nn.Module):
         k = self.to_k(context).reshape(batch_size, ctx_len, self.heads, self.head_dim).transpose(1, 2)
         v = self.to_v(context).reshape(batch_size, ctx_len, self.heads, self.head_dim).transpose(1, 2)
-        attn = torch.matmul(q, k.transpose(-1, -2)) * self.scale
-        attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(x.dtype)
-        out = torch.matmul(attn, v)
         out = out.transpose(1, 2).reshape(batch_size, seq_len, -1)
         out = self.to_out(out)
@@ -6984,12 +7176,18 @@ class MoEDiT(nn.Module):
         self.final_norm = nn.LayerNorm(hidden_size)
         self.unpatch_embed = UnpatchEmbed(patch_size, out_channels, hidden_size)
         self._init_weights()
     def _init_weights(self):
         nn.init.zeros_(self.unpatch_embed.proj.weight)
         nn.init.zeros_(self.unpatch_embed.proj.bias)
     def forward(self, x: torch.Tensor, timesteps: torch.Tensor, context: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
         batch_size, channels, height, width = x.shape
         patch_height = height // self.patch_size
@@ -7010,7 +7208,13 @@ class MoEDiT(nn.Module):
         x_b = x_patches.clone()
         for block in self.blocks:
-            x_a, x_b = block(x_a, x_b, context_proj, t_emb, patch_height, patch_width)
         x_combined = (x_a + x_b) / 2
         x_combined = self.final_norm(x_combined)
@@ -7518,11 +7722,12 @@ class SpatialAttention(nn.Module):
         q = apply_rope(q, cos, sin)
         k = apply_rope(k, cos, sin)
-        # Attention only within each frame: [B*T, heads, H*W, H*W]
-        attn = torch.matmul(q, k.transpose(-1, -2)) * self.scale
-        attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(x.dtype)
-        out = torch.matmul(attn, v)
         out = out.transpose(1, 2).reshape(batch_size * frames, spatial_len, self.hidden_size)
         out = self.to_out(out)
@@ -7573,16 +7778,12 @@ class TemporalAttention(nn.Module):
         q = apply_rope(q, cos, sin)
         k = apply_rope(k, cos, sin)
-        # Attention across time for each position: [B*H*W, heads, T, T]
-        attn = torch.matmul(q, k.transpose(-1, -2)) * self.scale
-        if causal:
-            causal_mask = torch.triu(torch.ones(frames, frames, device=x.device, dtype=torch.bool), diagonal=1)
-            attn = attn.masked_fill(causal_mask, float('-inf'))
-        attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(x.dtype)
-        out = torch.matmul(attn, v)
         out = out.transpose(1, 2).reshape(batch_size * spatial_len, frames, self.hidden_size)
         # Reshape back to [B, T*H*W, hidden]
@@ -7647,10 +7848,12 @@ class CrossAttention3D(nn.Module):
         k = self.to_k(context).reshape(batch_size, ctx_len, self.heads, self.head_dim).transpose(1, 2)
         v = self.to_v(context).reshape(batch_size, ctx_len, self.heads, self.head_dim).transpose(1, 2)
-        attn = torch.matmul(q, k.transpose(-1, -2)) * self.scale
-        attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(x.dtype)
-        out = torch.matmul(attn, v)
         out = out.transpose(1, 2).reshape(batch_size, seq_len, -1)
         out = self.to_out(out)
@@ -7768,6 +7971,12 @@ class VideoUNet3D(nn.Module):
         nn.init.zeros_(self.output_proj[-1].weight)
         nn.init.zeros_(self.output_proj[-1].bias)
     def forward(self, x: torch.Tensor, timesteps: torch.Tensor, context: torch.Tensor, first_frame_latent: Optional[torch.Tensor] = None) -> torch.Tensor:
         batch_size, channels, frames, height, width = x.shape
@@ -7786,7 +7995,13 @@ class VideoUNet3D(nn.Module):
         temporal_context = t_emb.unsqueeze(1).expand(-1, frames * height * width, -1)
         for block in self.transformer_blocks:
-            h = block(h, context, height, width, frames, temporal_context)
         h = h.reshape(batch_size, frames, height, width, self.hidden_size).permute(0, 4, 1, 2, 3)
@@ -8260,12 +8475,31 @@ def apply_rotary_pos_emb(
     return q_embed, k_embed
-@dataclass
 class KVCache:
-    """KV Cache for efficient autoregressive generation."""
-    key_cache: torch.Tensor
-    value_cache: torch.Tensor
-    seen_tokens: int = 0
     def update(
         self,
@@ -8273,20 +8507,43 @@ class KVCache:
         value_states: torch.Tensor,
         chunk_size: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if self.key_cache is None:
-            self.key_cache = key_states
-            self.value_cache = value_states
-        else:
-            self.key_cache = torch.cat([self.key_cache, key_states], dim=2)
-            self.value_cache = torch.cat([self.value_cache, value_states], dim=2)
-        self.seen_tokens = self.key_cache.shape[2]
-        if chunk_size is not None and self.key_cache.shape[2] > chunk_size * 2:
-            self.key_cache = self.key_cache[:, :, -chunk_size * 2:]
-            self.value_cache = self.value_cache[:, :, -chunk_size * 2:]
-        return self.key_cache, self.value_cache
 def ring_attention(
@@ -8298,7 +8555,7 @@ def ring_attention(
 ) -> torch.Tensor:
     """
     Ring Attention for distributed long-context processing.
-    Processes sequence in chunks with proper attention accumulation.
     Args:
         query: [batch, heads, seq_len, head_dim]
@@ -8312,24 +8569,45 @@ def ring_attention(
     """
     batch_size, num_heads, seq_len, head_dim = query.shape
     kv_len = key.shape[2]
-    scale = head_dim ** -0.5
     if seq_len <= chunk_size and kv_len <= chunk_size:
-        attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scale
-        if causal:
-            causal_mask = torch.triu(torch.ones(seq_len, kv_len, device=query.device, dtype=torch.bool), diagonal=1)
-            if kv_len > seq_len:
-                causal_mask = causal_mask[:, -seq_len:]
-            attn_weights = attn_weights.masked_fill(causal_mask, float('-inf'))
-        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-        return torch.matmul(attn_weights, value)
     output = torch.zeros_like(query)
     max_logits = torch.full((batch_size, num_heads, seq_len, 1), float('-inf'), device=query.device, dtype=query.dtype)
     sum_exp = torch.zeros((batch_size, num_heads, seq_len, 1), device=query.device, dtype=query.dtype)
     num_kv_chunks = (kv_len + chunk_size - 1) // chunk_size
     for kv_idx in range(num_kv_chunks):
@@ -8342,13 +8620,11 @@ def ring_attention(
         attn_chunk = torch.matmul(query, key_chunk.transpose(-1, -2)) * scale
         if causal:
-            chunk_len = kv_end - kv_start
-            for q_idx in range(seq_len):
-                q_pos = q_idx + (kv_len - seq_len) if kv_len > seq_len else q_idx
-                for k_idx in range(chunk_len):
-                    k_pos = kv_start + k_idx
-                    if k_pos > q_pos:
-                        attn_chunk[:, :, q_idx, k_idx] = float('-inf')
         chunk_max = attn_chunk.max(dim=-1, keepdim=True)[0]
         new_max = torch.maximum(max_logits, chunk_max)
@@ -8465,31 +8741,35 @@ class MultiHeadLatentAttention(nn.Module):
                 self.ring_chunk_size if self.use_ring_attention else None
             )
-        if self.num_key_value_groups > 1:
-            key_states = key_states.repeat_interleave(self.num_key_value_groups, dim=1)
-            value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
         if self.use_ring_attention:
             attn_output = ring_attention(
-                query_states, key_states, value_states,
                 chunk_size=self.ring_chunk_size,
                 causal=True,
             )
         else:
-            attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.scale
             kv_len = key_states.shape[2]
-            causal_mask = torch.triu(
-                torch.ones(seq_len, kv_len, device=hidden_states.device, dtype=torch.bool),
-                diagonal=kv_len - seq_len + 1
             )
-            attn_weights = attn_weights.masked_fill(causal_mask, float('-inf'))
-            if attention_mask is not None:
-                attn_weights = attn_weights + attention_mask
-            attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(hidden_states.dtype)
-            attn_output = torch.matmul(attn_weights, value_states)
         attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
         attn_output = self.o_proj(attn_output)
@@ -8518,6 +8798,11 @@ class AuxLosslessMoERouter(nn.Module):
         self.input_norm = LlamaRMSNorm(hidden_size)
         self.gate = nn.Linear(hidden_size, num_experts, bias=False)
         nn.init.normal_(self.gate.weight, mean=0.0, std=0.01)
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         batch_size, seq_len, hidden_dim = hidden_states.shape
@@ -8526,7 +8811,10 @@ class AuxLosslessMoERouter(nn.Module):
         hidden_norm = self.input_norm(hidden_flat)
         router_logits = self.gate(hidden_norm)
-        router_probs = F.softmax(router_logits, dim=-1, dtype=hidden_states.dtype)
         top_k_probs, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
@@ -8614,8 +8902,13 @@ class AuxLosslessMoELayer(nn.Module):
         batch_size, seq_len, hidden_size = hidden_states.shape
         original_dtype = hidden_states.dtype
         hidden_flat = hidden_states.view(-1, hidden_size)
-        top_k_probs, top_k_indices, _ = self.router(hidden_states)
         final_output = torch.zeros_like(hidden_flat)
@@ -8635,10 +8928,37 @@ class AuxLosslessMoELayer(nn.Module):
         final_output = final_output.view(batch_size, seq_len, hidden_size)
-        aux_loss = torch.tensor(0.0, device=hidden_states.device, dtype=hidden_states.dtype)
         return final_output, aux_loss
 MoELayer = AuxLosslessMoELayer

     return head_dim ** -0.25
 class AttentionKVCache:
+    """Pre-allocated KV Cache — static buffer with index-based filling.
+    Eliminates VRAM fragmentation from torch.cat during autoregressive generation.
+    Buffer is allocated once at first use and reused via slice assignment.
     """
+    __slots__ = ('key_cache', 'value_cache', 'seen_tokens', '_max_len')
+    def __init__(self, max_seq_len: int = 131072):
+        self.key_cache: torch.Tensor = None
+        self.value_cache: torch.Tensor = None
+        self.seen_tokens: int = 0
+        self._max_len = max_seq_len
+    def _allocate(self, batch: int, heads: int, head_dim: int, device: torch.device, dtype: torch.dtype):
+        """Allocate static buffer on first use."""
+        self.key_cache = torch.zeros(batch, heads, self._max_len, head_dim, device=device, dtype=dtype)
+        self.value_cache = torch.zeros(batch, heads, self._max_len, head_dim, device=device, dtype=dtype)
     def update(
         self,
         value_states: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
+        Update cache with new key/value states using index-based filling.
         Args:
             key_states: New key states [batch, num_heads, seq_len, head_dim]
             value_states: New value states [batch, num_heads, seq_len, head_dim]
         Returns:
+            Updated key and value states including cache (views, no copy)
         """
+        batch, heads, new_len, head_dim = key_states.shape
+        if self.key_cache is None:
+            self._allocate(batch, heads, head_dim, key_states.device, key_states.dtype)
+            self.seen_tokens = 0
+        # Grow buffer if needed (rare fallback)
+        if self.seen_tokens + new_len > self.key_cache.shape[2]:
+            new_max = max(self.key_cache.shape[2] * 2, self.seen_tokens + new_len)
+            new_key = torch.zeros(batch, heads, new_max, head_dim, device=key_states.device, dtype=key_states.dtype)
+            new_val = torch.zeros(batch, heads, new_max, head_dim, device=key_states.device, dtype=key_states.dtype)
+            new_key[:, :, :self.seen_tokens] = self.key_cache[:, :, :self.seen_tokens]
+            new_val[:, :, :self.seen_tokens] = self.value_cache[:, :, :self.seen_tokens]
+            self.key_cache = new_key
+            self.value_cache = new_val
+        # Index-based fill — no allocation, no fragmentation
+        self.key_cache[:, :, self.seen_tokens:self.seen_tokens + new_len] = key_states
+        self.value_cache[:, :, self.seen_tokens:self.seen_tokens + new_len] = value_states
+        self.seen_tokens += new_len
+        # Return valid slice (view, no copy)
+        return self.key_cache[:, :, :self.seen_tokens], self.value_cache[:, :, :self.seen_tokens]
     def get_seq_length(self) -> int:
         """Get current sequence length in cache."""
+        return self.seen_tokens
     def reset(self):
+        """Reset cache position without deallocating the buffer."""
         self.seen_tokens = 0
             sin = sin.unsqueeze(0).unsqueeze(0)
             k = apply_rope(k, cos, sin)
+        # Flash Attention 2.0 via SDPA — FP16-safe with Q/K pre-scaling
+        qk_scale = d ** -0.25
+        out = F.scaled_dot_product_attention(
+            q * qk_scale, k * qk_scale, v,
+            is_causal=False, scale=1.0,
+        )
         out = out.transpose(1, 2).reshape(b, n, self.inner_dim)
         return self.to_out(out)
 EPS = 1e-5
class ExpertUtilizationTracker:
    """
    Accumulate per-expert selection counts for one MoE layer and report them.

    The tracker keeps a CPU-side count of how often each expert index was
    selected. Every `report_interval` calls to `step()` it prints a report
    and clears the accumulated counts. The report shows:
    - Share of routing assignments per expert (with a text histogram)
    - Cold experts: those whose share of assignments is below
      `cold_threshold_pct` percent
    - Count of experts offloaded to CPU, if an offload manager was linked
    - A load balance score (1.0 = every expert gets an equal share)

    Usage:
        tracker = ExpertUtilizationTracker(num_experts=8, layer_name="layer.3.moe")
        # In forward: tracker.record(top_k_indices)
        # Every training step: tracker.step() (auto-prints when interval hit)

    Args:
        num_experts: Number of experts in the tracked layer.
        layer_name: Label used in the printed report and in `get_stats()`.
        report_interval: Print and reset every this many `step()` calls.
            A value <= 0 disables periodic reporting.
        cold_threshold_pct: Percentage of assignments below which an expert
            is listed as cold.
    """

    def __init__(
        self,
        num_experts: int,
        layer_name: str = "moe",
        report_interval: int = 100,
        cold_threshold_pct: float = 1.0,
    ):
        self.num_experts = num_experts
        self.layer_name = layer_name
        self.report_interval = report_interval
        self.cold_threshold_pct = cold_threshold_pct
        # Counts live on CPU so reporting never touches the accelerator.
        self._counts = torch.zeros(num_experts, dtype=torch.long)
        self._total_tokens = 0
        self._step = 0
        self._offload_manager = None  # Link to ExpertOffloadManager if available

    def link_offload_manager(self, manager):
        """Link an offload manager whose `get_status()` is shown in reports."""
        self._offload_manager = manager

    def record(self, expert_indices: torch.Tensor):
        """
        Record expert selections from a forward pass.

        Args:
            expert_indices: [num_tokens, top_k] tensor of selected expert
                indices. The size of its first dimension is added to the
                token counter; indices outside [0, num_experts) are ignored.
        """
        flat = expert_indices.detach().cpu().reshape(-1).long()
        # bincount rejects negatives and would grow past num_experts, so keep
        # only in-range ids; out-of-range ids were never counted before either.
        in_range = flat[(flat >= 0) & (flat < self.num_experts)]
        # One histogram pass instead of a comparison + .item() per expert.
        self._counts += torch.bincount(in_range, minlength=self.num_experts)
        self._total_tokens += expert_indices.shape[0]

    def step(self):
        """Advance step counter. Prints report and resets when interval is hit."""
        self._step += 1
        # Guard the modulo: a non-positive interval means "never report".
        if self.report_interval > 0 and self._step % self.report_interval == 0:
            self._print_report()
            self._reset()

    def _reset(self):
        """Reset accumulators for next interval (the step counter is kept)."""
        self._counts.zero_()
        self._total_tokens = 0

    def _summarize(self, total_assignments: int) -> tuple:
        """Return (percentages, cold expert ids, balance score) for the counts.

        `total_assignments` must be non-zero; callers handle the empty case.
        """
        pcts = (self._counts.float() / total_assignments * 100).tolist()
        cold = [i for i, p in enumerate(pcts) if p < self.cold_threshold_pct]
        ideal_pct = 100.0 / self.num_experts
        # Half the summed absolute deviation from the uniform share, as a
        # fraction of 100%; subtracting from 1 makes 1.0 mean perfectly even.
        balance = 1.0 - (sum(abs(p - ideal_pct) for p in pcts) / (2 * 100))
        return pcts, cold, balance

    def _print_report(self):
        """Print expert utilization histogram (no-op if nothing was recorded)."""
        if self._total_tokens == 0:
            return
        total_assignments = int(self._counts.sum().item())
        if total_assignments == 0:
            return

        pcts, cold_experts, balance = self._summarize(total_assignments)
        counts = self._counts.tolist()

        # Bars are scaled so the busiest expert fills the full width.
        max_pct = max(pcts) if pcts else 0
        bar_max = 30  # max bar width
        heavy_rule = "=" * 60
        light_rule = "─" * 60

        lines = [f"\n{heavy_rule}"]
        lines.append(f"  Expert Utilization — {self.layer_name}  (step {self._step})")
        lines.append(f"  {self._total_tokens:,} tokens, {total_assignments:,} assignments")
        lines.append(light_rule)
        for i, pct in enumerate(pcts):
            bar_len = int(pct / max_pct * bar_max) if max_pct > 0 else 0
            bar = "█" * bar_len
            cold_tag = " ❄️" if pct < self.cold_threshold_pct else ""
            lines.append(f"  Expert {i:2d}  │{bar:<{bar_max}}│ {pct:5.1f}% ({counts[i]:>6d}){cold_tag}")
        lines.append(light_rule)

        if cold_experts:
            lines.append(f"  ❄️  Cold experts (<{self.cold_threshold_pct}%): {cold_experts}")
        else:
            lines.append("  ✅ All experts active (no cold experts)")

        # Report offloaded experts if manager linked
        if self._offload_manager is not None:
            status = self._offload_manager.get_status()
            lines.append(f"  💾 Offloaded to CPU: {status['cpu']}/{status['total']}")

        lines.append(f"  ⚖️  Load balance score: {balance:.3f} (1.0 = perfect)")
        lines.append(heavy_rule)
        print("\n".join(lines))

    def get_stats(self) -> dict:
        """
        Return current stats as a dict (for programmatic access).

        With nothing recorded, every percentage is 0.0 (so every expert is
        listed as cold when the threshold is positive) and the balance score
        is 0.0.
        """
        total = int(self._counts.sum().item())
        if total == 0:
            pcts = [0.0] * self.num_experts
            cold = [i for i, p in enumerate(pcts) if p < self.cold_threshold_pct]
            balance = 0.0
        else:
            pcts, cold, balance = self._summarize(total)
        return {
            "step": self._step,
            "layer_name": self.layer_name,
            "total_tokens": self._total_tokens,
            "expert_counts": self._counts.tolist(),
            "expert_pcts": pcts,
            "cold_experts": cold,
            "balance_score": balance,
        }
+def attach_utilization_trackers(
+    model: torch.nn.Module,
+    report_interval: int = 100,
+) -> list:
+    """
+    Attach an ExpertUtilizationTracker to every MoE layer found in ``model``.
+    A submodule is treated as an MoE layer when it has both an ``experts`` and a
+    ``router`` attribute. Each tracker is stored on its layer as
+    ``_utilization_tracker`` and, when the layer has an
+    ``_expert_offload_manager``, linked to that manager first.
+    Returns the created trackers (in traversal order) for manual step() calls
+    in the training loop.
+    """
+    attached = []
+    for layer_path, layer in model.named_modules():
+        is_moe_layer = hasattr(layer, 'experts') and hasattr(layer, 'router')
+        if not is_moe_layer:
+            continue
+        layer_tracker = ExpertUtilizationTracker(
+            num_experts=len(layer.experts),
+            layer_name=layer_path,
+            report_interval=report_interval,
+        )
+        # Link offload manager if present
+        if hasattr(layer, '_expert_offload_manager'):
+            layer_tracker.link_offload_manager(layer._expert_offload_manager)
+        layer._utilization_tracker = layer_tracker
+        attached.append(layer_tracker)
+    attached_count = len(attached)
+    if attached_count > 0:
+        print(f"  📊 Attached {attached_count} expert utilization trackers (report every {report_interval} steps)")
+    return attached
 class MoERouter(nn.Module):
     """
     SOTA Router for Mixture of Experts v2.0 - FP16 native.
         top_k_probs, top_k_indices, router_logits = self.router(hidden_states)
+        # Record expert utilization if tracker is attached
+        if hasattr(self, '_utilization_tracker'):
+            self._utilization_tracker.record(top_k_indices)
         final_output = torch.zeros_like(hidden_flat)
         for expert_idx in range(self.num_experts):
         present_key_value = (key, value) if use_cache else None
+        # True GQA via SDPA — no repeat_interleave, O(N) memory, FP16-safe
+        qk_scale = self.head_dim ** -0.25
+        kv_len = key.shape[2]
+        use_causal = (attention_mask is None and seq_len > 1 and seq_len == kv_len)
+        dropout_p = self.dropout.p if self.training else 0.0
+        output = F.scaled_dot_product_attention(
+            query * qk_scale,
+            key * qk_scale,
+            value,
+            attn_mask=attention_mask,
+            is_causal=use_causal,
+            dropout_p=dropout_p,
+            scale=1.0,
+            enable_gqa=(self.num_key_value_groups > 1),
+        )
         output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
         output = self.o_proj(output)
     """
     Symmetric Dual-Stream Self-Attention (SD3/Flux-style).
     Two parallel streams with cross-stream information exchange.
+    Uses Flash Attention 2.0 via SDPA for O(N) memory.
     """
     def __init__(self, hidden_size: int, num_heads: int = 8, max_height: int = 64, max_width: int = 64):
         self.num_heads = num_heads
         self.head_dim = hidden_size // num_heads
         self.scale = self.head_dim ** -0.5
+        # Pre-compute Q/K scaling for FP16 stability
+        self._qk_scale = self.head_dim ** -0.25
         self.to_qkv_a = nn.Linear(hidden_size, hidden_size * 3, bias=False)
         self.to_qkv_b = nn.Linear(hidden_size, hidden_size * 3, bias=False)
         q_b, k_b, v_b = qkv_b.unbind(dim=2)
         cos, sin = self.rope_2d(x_a, height, width)
         cos = cos.unsqueeze(0).unsqueeze(1)
         sin = sin.unsqueeze(0).unsqueeze(1)
         k_combined = torch.cat([k_a, k_b], dim=2)
         v_combined = torch.cat([v_a, v_b], dim=2)
+        # Flash Attention 2.0 via SDPA — O(N) memory, FP16-safe with pre-scaling
+        out_a = F.scaled_dot_product_attention(
+            q_a * self._qk_scale, k_combined * self._qk_scale, v_combined,
+            is_causal=False, scale=1.0,
+        )
+        out_b = F.scaled_dot_product_attention(
+            q_b * self._qk_scale, k_combined * self._qk_scale, v_combined,
+            is_causal=False, scale=1.0,
+        )
         out_a = out_a.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_size)
         out_b = out_b.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_size)
         k = self.to_k(context).reshape(batch_size, ctx_len, self.heads, self.head_dim).transpose(1, 2)
         v = self.to_v(context).reshape(batch_size, ctx_len, self.heads, self.head_dim).transpose(1, 2)
+        # Flash Attention 2.0 via SDPA — O(N) memory, non-causal cross-attention
+        qk_scale = self.head_dim ** -0.25
+        out = F.scaled_dot_product_attention(
+            q * qk_scale, k * qk_scale, v,
+            is_causal=False, scale=1.0,
+        )
         out = out.transpose(1, 2).reshape(batch_size, seq_len, -1)
         out = self.to_out(out)
         self.final_norm = nn.LayerNorm(hidden_size)
         self.unpatch_embed = UnpatchEmbed(patch_size, out_channels, hidden_size)
+        self.gradient_checkpointing = False
         self._init_weights()
     def _init_weights(self):
         nn.init.zeros_(self.unpatch_embed.proj.weight)
         nn.init.zeros_(self.unpatch_embed.proj.bias)
+    def enable_gradient_checkpointing(self):
+        """Enable gradient checkpointing for memory efficiency.
+        Only sets ``self.gradient_checkpointing`` to True; the flag is read
+        elsewhere to decide whether blocks run under checkpointing.
+        """
+        self.gradient_checkpointing = True
     def forward(self, x: torch.Tensor, timesteps: torch.Tensor, context: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
         batch_size, channels, height, width = x.shape
         patch_height = height // self.patch_size
         x_b = x_patches.clone()
         for block in self.blocks:
+            if self.gradient_checkpointing and self.training:
+                x_a, x_b = torch.utils.checkpoint.checkpoint(
+                    block, x_a, x_b, context_proj, t_emb, patch_height, patch_width,
+                    use_reentrant=False
+                )
+            else:
+                x_a, x_b = block(x_a, x_b, context_proj, t_emb, patch_height, patch_width)
         x_combined = (x_a + x_b) / 2
         x_combined = self.final_norm(x_combined)
         q = apply_rope(q, cos, sin)
         k = apply_rope(k, cos, sin)
+        # Flash Attention 2.0 via SDPA — O(N) memory, non-causal spatial attention
+        qk_scale = self.head_dim ** -0.25
+        out = F.scaled_dot_product_attention(
+            q * qk_scale, k * qk_scale, v,
+            is_causal=False, scale=1.0,
+        )
         out = out.transpose(1, 2).reshape(batch_size * frames, spatial_len, self.hidden_size)
         out = self.to_out(out)
         q = apply_rope(q, cos, sin)
         k = apply_rope(k, cos, sin)
+        # Flash Attention 2.0 via SDPA — causal temporal attention
+        qk_scale = self.head_dim ** -0.25
+        out = F.scaled_dot_product_attention(
+            q * qk_scale, k * qk_scale, v,
+            is_causal=causal, scale=1.0,
+        )
         out = out.transpose(1, 2).reshape(batch_size * spatial_len, frames, self.hidden_size)
         # Reshape back to [B, T*H*W, hidden]
         k = self.to_k(context).reshape(batch_size, ctx_len, self.heads, self.head_dim).transpose(1, 2)
         v = self.to_v(context).reshape(batch_size, ctx_len, self.heads, self.head_dim).transpose(1, 2)
+        # Flash Attention 2.0 via SDPA — non-causal cross-attention
+        qk_scale = self.head_dim ** -0.25
+        out = F.scaled_dot_product_attention(
+            q * qk_scale, k * qk_scale, v,
+            is_causal=False, scale=1.0,
+        )
         out = out.transpose(1, 2).reshape(batch_size, seq_len, -1)
         out = self.to_out(out)
         nn.init.zeros_(self.output_proj[-1].weight)
         nn.init.zeros_(self.output_proj[-1].bias)
+        self.gradient_checkpointing = False
+    def enable_gradient_checkpointing(self):
+        """Enable gradient checkpointing for memory efficiency.
+        Only sets ``self.gradient_checkpointing`` to True; the flag is read
+        elsewhere to decide whether blocks run under checkpointing.
+        """
+        self.gradient_checkpointing = True
     def forward(self, x: torch.Tensor, timesteps: torch.Tensor, context: torch.Tensor, first_frame_latent: Optional[torch.Tensor] = None) -> torch.Tensor:
         batch_size, channels, frames, height, width = x.shape
         temporal_context = t_emb.unsqueeze(1).expand(-1, frames * height * width, -1)
         for block in self.transformer_blocks:
+            if self.gradient_checkpointing and self.training:
+                h = torch.utils.checkpoint.checkpoint(
+                    block, h, context, height, width, frames, temporal_context,
+                    use_reentrant=False
+                )
+            else:
+                h = block(h, context, height, width, frames, temporal_context)
         h = h.reshape(batch_size, frames, height, width, self.hidden_size).permute(0, 4, 1, 2, 3)
     return q_embed, k_embed
 class KVCache:
+    """Pre-allocated KV Cache — static buffer with index-based filling.
+    Eliminates VRAM fragmentation from torch.cat during autoregressive generation.
+    Buffer is allocated at first use (unless tensors are passed to the constructor)
+    and reused via slice assignment; update() swaps in a larger buffer only when
+    the incoming states would not fit.
+    """
+    # Fixed attribute set for every instance.
+    __slots__ = ('key_cache', 'value_cache', 'seen_tokens', '_max_len')
+    def __init__(
+        self,
+        key_cache: torch.Tensor = None,
+        value_cache: torch.Tensor = None,
+        seen_tokens: int = 0,
+        max_seq_len: int = 131072,
+    ):
+        """Store optional existing buffers and the bookkeeping values.
+        Args:
+            key_cache: Existing key buffer, or None to let update() allocate one.
+            value_cache: Existing value buffer, or None to let update() allocate one.
+            seen_tokens: Offset along dim 2 at which update() writes next.
+            max_seq_len: Length of dim 2 used when the buffers are allocated.
+        """
+        self.key_cache = key_cache
+        self.value_cache = value_cache
+        self.seen_tokens = seen_tokens
+        self._max_len = max_seq_len
+    def _allocate(self, batch: int, heads: int, head_dim: int, device: torch.device, dtype: torch.dtype):
+        """Allocate static buffer on first use.
+        Creates zero-filled key and value buffers of shape
+        (batch, heads, self._max_len, head_dim) on the given device/dtype.
+        """
+        # NOTE(review): the full _max_len positions are allocated eagerly (default
+        # max_seq_len is 131072) — confirm this fits the memory budget per cache.
+        self.key_cache = torch.zeros(batch, heads, self._max_len, head_dim, device=device, dtype=dtype)
+        self.value_cache = torch.zeros(batch, heads, self._max_len, head_dim, device=device, dtype=dtype)
     def update(
         self,
         value_states: torch.Tensor,
         chunk_size: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Write new key/value states into the buffers and return the filled part.
+        The body reads ``key_states`` and unpacks its shape as
+        (batch, heads, new_len, head_dim).
+        NOTE(review): the ``key_states`` parameter line is not visible in this
+        hunk — confirm the full signature.
+        When ``chunk_size`` is given and the cache would hold more than
+        ``2 * chunk_size`` positions after this call, only the most recent
+        ``chunk_size`` cached positions are kept before the new states are written.
+        Returns:
+            (keys, values) sliced to the first ``seen_tokens`` positions. These
+            are slices of the internal buffers, not copies.
+        """
+        batch, heads, new_len, head_dim = key_states.shape
+        if self.key_cache is None:
+            # Allocate static buffer on first call
+            self._allocate(batch, heads, head_dim, key_states.device, key_states.dtype)
+            self.seen_tokens = 0
+        # Check if we need to apply chunk windowing
+        if chunk_size is not None and self.seen_tokens + new_len > chunk_size * 2:
+            # Shift: keep only the last chunk_size tokens, then append
+            keep = chunk_size
+            if self.seen_tokens > keep:
+                # .clone() first: source and destination ranges can overlap in the same buffer
+                self.key_cache[:, :, :keep] = self.key_cache[:, :, self.seen_tokens - keep:self.seen_tokens].clone()
+                self.value_cache[:, :, :keep] = self.value_cache[:, :, self.seen_tokens - keep:self.seen_tokens].clone()
+                self.seen_tokens = keep
+        # Grow buffer if needed (rare fallback — avoids crash on very long sequences)
+        if self.seen_tokens + new_len > self.key_cache.shape[2]:
+            # At least double the length, or more if the new states need it
+            new_max = max(self.key_cache.shape[2] * 2, self.seen_tokens + new_len)
+            # NOTE(review): the new buffers take batch/heads/device/dtype from the incoming
+            # key_states, not from the existing buffer — assumes they match; confirm.
+            new_key = torch.zeros(batch, heads, new_max, head_dim, device=key_states.device, dtype=key_states.dtype)
+            new_val = torch.zeros(batch, heads, new_max, head_dim, device=key_states.device, dtype=key_states.dtype)
+            new_key[:, :, :self.seen_tokens] = self.key_cache[:, :, :self.seen_tokens]
+            new_val[:, :, :self.seen_tokens] = self.value_cache[:, :, :self.seen_tokens]
+            self.key_cache = new_key
+            self.value_cache = new_val
+        # Index-based fill — no allocation, no fragmentation
+        self.key_cache[:, :, self.seen_tokens:self.seen_tokens + new_len] = key_states
+        self.value_cache[:, :, self.seen_tokens:self.seen_tokens + new_len] = value_states
+        self.seen_tokens += new_len
+        # Return only the valid slice (view, no copy)
+        return self.key_cache[:, :, :self.seen_tokens], self.value_cache[:, :, :self.seen_tokens]
+    def reset(self):
+        """Reset cache position without deallocating the buffer.
+        Only ``seen_tokens`` is set back to 0; buffer contents are left in place
+        and get overwritten by later update() calls.
+        """
+        self.seen_tokens = 0
 def ring_attention(
 ) -> torch.Tensor:
     """
     Ring Attention for distributed long-context processing.
+    Processes sequence in chunks with online softmax accumulation.
     Args:
         query: [batch, heads, seq_len, head_dim]
     """
     batch_size, num_heads, seq_len, head_dim = query.shape
     kv_len = key.shape[2]
+    # Short path: use SDPA directly for small sequences
     if seq_len <= chunk_size and kv_len <= chunk_size:
+        qk_scale = head_dim ** -0.25
+        use_causal = causal and seq_len == kv_len and seq_len > 1
+        if use_causal:
+            return F.scaled_dot_product_attention(
+                query * qk_scale, key * qk_scale, value,
+                is_causal=True, scale=1.0,
+            )
+        elif causal and kv_len > seq_len:
+            # KV cache case: build explicit causal mask
+            causal_mask = torch.zeros(seq_len, kv_len, device=query.device, dtype=query.dtype)
+            q_pos = torch.arange(seq_len, device=query.device) + (kv_len - seq_len)
+            k_pos = torch.arange(kv_len, device=query.device)
+            causal_mask = torch.where(k_pos.unsqueeze(0) > q_pos.unsqueeze(1), float('-inf'), 0.0)
+            return F.scaled_dot_product_attention(
+                query * qk_scale, key * qk_scale, value,
+                attn_mask=causal_mask, scale=1.0,
+            )
+        else:
+            return F.scaled_dot_product_attention(
+                query * qk_scale, key * qk_scale, value,
+                is_causal=False, scale=1.0,
+            )
+    # Long path: chunked attention with online softmax (FlashAttention-style)
+    scale = head_dim ** -0.5
     output = torch.zeros_like(query)
     max_logits = torch.full((batch_size, num_heads, seq_len, 1), float('-inf'), device=query.device, dtype=query.dtype)
     sum_exp = torch.zeros((batch_size, num_heads, seq_len, 1), device=query.device, dtype=query.dtype)
+    # Pre-compute query positions for vectorized causal masking
+    if causal:
+        q_positions = torch.arange(seq_len, device=query.device)
+        if kv_len > seq_len:
+            q_positions = q_positions + (kv_len - seq_len)
     num_kv_chunks = (kv_len + chunk_size - 1) // chunk_size
     for kv_idx in range(num_kv_chunks):
         attn_chunk = torch.matmul(query, key_chunk.transpose(-1, -2)) * scale
         if causal:
+            # Vectorized causal mask — replaces O(n²) nested Python loop
+            k_positions = torch.arange(kv_start, kv_end, device=query.device)
+            # mask[i, j] = True where k_pos[j] > q_pos[i] (future tokens)
+            causal_mask = k_positions.unsqueeze(0) > q_positions.unsqueeze(1)  # [seq_len, chunk_len]
+            attn_chunk = attn_chunk.masked_fill(causal_mask.unsqueeze(0).unsqueeze(0), float('-inf'))
         chunk_max = attn_chunk.max(dim=-1, keepdim=True)[0]
         new_max = torch.maximum(max_logits, chunk_max)
                 self.ring_chunk_size if self.use_ring_attention else None
             )
         if self.use_ring_attention:
+            # Ring attention needs matched head counts — expand KV heads
+            if self.num_key_value_groups > 1:
+                key_expanded = key_states.repeat_interleave(self.num_key_value_groups, dim=1)
+                value_expanded = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
+            else:
+                key_expanded = key_states
+                value_expanded = value_states
             attn_output = ring_attention(
+                query_states, key_expanded, value_expanded,
                 chunk_size=self.ring_chunk_size,
                 causal=True,
             )
         else:
+            # True GQA via SDPA — no repeat_interleave, O(N) memory
+            # SDPA natively handles N query heads with M KV heads via enable_gqa
+            qk_scale = self.head_dim ** -0.25
             kv_len = key_states.shape[2]
+            use_causal = (attention_mask is None and seq_len > 1 and seq_len == kv_len)
+            attn_output = F.scaled_dot_product_attention(
+                query_states * qk_scale,
+                key_states * qk_scale,
+                value_states,
+                attn_mask=attention_mask,
+                is_causal=use_causal,
+                scale=1.0,  # Already scaled Q and K
+                enable_gqa=(self.num_key_value_groups > 1),
             )
         attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
         attn_output = self.o_proj(attn_output)
         self.input_norm = LlamaRMSNorm(hidden_size)
         self.gate = nn.Linear(hidden_size, num_experts, bias=False)
         nn.init.normal_(self.gate.weight, mean=0.0, std=0.01)
+        # DeepSeek-style expert bias for aux-lossless load balancing
+        # This learnable bias steers token routing to underutilized experts
+        # without requiring an auxiliary loss term
+        self.expert_bias = nn.Parameter(torch.zeros(num_experts))
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         batch_size, seq_len, hidden_dim = hidden_states.shape
         hidden_norm = self.input_norm(hidden_flat)
         router_logits = self.gate(hidden_norm)
+        # Add expert bias for load balancing (aux-lossless mechanism)
+        biased_logits = router_logits + self.expert_bias
+        router_probs = F.softmax(biased_logits, dim=-1, dtype=hidden_states.dtype)
         top_k_probs, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
         batch_size, seq_len, hidden_size = hidden_states.shape
         original_dtype = hidden_states.dtype
         hidden_flat = hidden_states.view(-1, hidden_size)
+        num_tokens = hidden_flat.shape[0]
+        top_k_probs, top_k_indices, router_logits = self.router(hidden_states)
+        # Record expert utilization if tracker is attached
+        if hasattr(self, '_utilization_tracker'):
+            self._utilization_tracker.record(top_k_indices)
         final_output = torch.zeros_like(hidden_flat)
         final_output = final_output.view(batch_size, seq_len, hidden_size)
+        # Aux-lossless: z-loss for router logit stability, plus the small cold-expert
+        # utilization term added by _compute_aux_loss.
+        # The expert_bias in the router handles load balancing architecturally
+        aux_loss = self._compute_aux_loss(router_logits, top_k_indices, num_tokens)
         return final_output, aux_loss
+    def _compute_aux_loss(
+        self,
+        router_logits: torch.Tensor,
+        top_k_indices: torch.Tensor,
+        num_tokens: int,
+    ) -> torch.Tensor:
+        """
+        Aux-lossless auxiliary loss: a z-loss plus a cold-expert utilization term.
+        The z-loss (mean squared logsumexp of the router logits, weighted 1e-4)
+        keeps router logits from growing unboundedly (FP16 stability).
+        The utilization term is 0.01 times the fraction of experts that received
+        no assignment in ``top_k_indices``. It is computed from a hard
+        ``count > 0`` test, so it is piecewise constant: it changes the reported
+        loss value but contributes no gradient. Only the z-loss influences the
+        parameters through this function.
+        Args:
+            router_logits: Router logits; the last dimension is reduced by logsumexp.
+            top_k_indices: Integer expert indices selected by the router.
+            num_tokens: Currently unused; kept so existing callers keep working.
+        Returns:
+            Scalar tensor ``z_loss + utilization_loss``.
+        """
+        # z-loss: prevents router logit explosion in FP16
+        z_loss = torch.logsumexp(router_logits, dim=-1).square().mean() * 0.0001
+        # Per-expert assignment counts. bincount gives the same counts as summing a
+        # one-hot encoding without materializing a [tokens, top_k, experts] float tensor.
+        tokens_per_expert = torch.bincount(top_k_indices.reshape(-1), minlength=self.num_experts)
+        fraction_used = (tokens_per_expert > 0).float().mean()
+        utilization_loss = (1.0 - fraction_used) * 0.01  # Very soft penalty
+        return z_loss + utilization_loss
 MoELayer = AuxLosslessMoELayer

streaming_state.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
-  "epoch": 26,
-  "unique_samples": 586,
-  "total_yields": 1172,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
@@ -18,16 +18,16 @@
     "CodeParrot-Clean": 200,
     "ShareGPT-Clean": 200,
     "Synth-Issues": 200,
-    "Dolly-15k": 200,
-    "Conversation-Summarization": 200,
     "Synth-ShellTimeout": 200,
     "Synth-Docker": 200,
-    "Synth-Documents": 200,
     "HumanEval-JavaScript": 164,
-    "OpenOrca": 200,
     "Synth-MultiStepExecution": 200,
     "Synth-Citation": 200,
-    "NoRobots": 200,
     "Synth-LanguageSetup": 200,
     "Function-Calling-ChatML": 200,
     "Synth-CoT": 200,
@@ -75,7 +75,7 @@
     "Synth-Debugging": 200,
     "Tool-Calls-SingleTurn": 200,
     "Tool-Calls-Multiturn": 200,
-    "OpenAssistant": 200,
     "T2V-Sora-Preferences-2": 200,
     "T2V-Human-Preferences": 200,
     "Sora-Alignment-Likert": 198,
@@ -84,7 +84,9 @@
     "WebVid-10M": 200,
     "Sora-Physics-Likert": 198,
     "TIP-I2V": 200,
-    "Pexels-I2V-350k": 200
   },
   "modality_positions": {
     "text": {
@@ -92,11 +94,11 @@
       "Midjourney-Prompts": 200,
       "CodeParrot-Clean": 200,
       "ShareGPT-Clean": 200,
-      "Dolly-15k": 200,
-      "Conversation-Summarization": 200,
       "HumanEval-JavaScript": 164,
-      "OpenOrca": 200,
-      "NoRobots": 200,
       "Function-Calling-ChatML": 200,
       "Python-Code-18k": 200,
       "Code-Feedback": 200,
@@ -119,7 +121,9 @@
       "HumanEval-Rust": 164,
       "Tool-Calls-SingleTurn": 200,
       "Tool-Calls-Multiturn": 200,
-      "OpenAssistant": 200
     },
     "image": {
       "WebSight": 386,
@@ -144,9 +148,9 @@
     "audio": {}
   },
   "modality_counts": {
-    "text": 0,
     "image": 0,
-    "video": 586,
     "audio": 0
   },
   "last_modality": null

 {
+  "epoch": 35,
+  "unique_samples": 400,
+  "total_yields": 800,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
     "CodeParrot-Clean": 200,
     "ShareGPT-Clean": 200,
     "Synth-Issues": 200,
+    "Dolly-15k": 450,
+    "Conversation-Summarization": 450,
     "Synth-ShellTimeout": 200,
     "Synth-Docker": 200,
+    "Synth-Documents": 450,
     "HumanEval-JavaScript": 164,
+    "OpenOrca": 450,
     "Synth-MultiStepExecution": 200,
     "Synth-Citation": 200,
+    "NoRobots": 450,
     "Synth-LanguageSetup": 200,
     "Function-Calling-ChatML": 200,
     "Synth-CoT": 200,
     "Synth-Debugging": 200,
     "Tool-Calls-SingleTurn": 200,
     "Tool-Calls-Multiturn": 200,
+    "OpenAssistant": 450,
     "T2V-Sora-Preferences-2": 200,
     "T2V-Human-Preferences": 200,
     "Sora-Alignment-Likert": 198,
     "WebVid-10M": 200,
     "Sora-Physics-Likert": 198,
     "TIP-I2V": 200,
+    "Pexels-I2V-350k": 200,
+    "SmolTalk-OpenHermes": 250,
+    "SmolTalk-All": 250
   },
   "modality_positions": {
     "text": {
       "Midjourney-Prompts": 200,
       "CodeParrot-Clean": 200,
       "ShareGPT-Clean": 200,
+      "Dolly-15k": 450,
+      "Conversation-Summarization": 450,
       "HumanEval-JavaScript": 164,
+      "OpenOrca": 450,
+      "NoRobots": 450,
       "Function-Calling-ChatML": 200,
       "Python-Code-18k": 200,
       "Code-Feedback": 200,
       "HumanEval-Rust": 164,
       "Tool-Calls-SingleTurn": 200,
       "Tool-Calls-Multiturn": 200,
+      "OpenAssistant": 450,
+      "SmolTalk-OpenHermes": 250,
+      "SmolTalk-All": 250
     },
     "image": {
       "WebSight": 386,
     "audio": {}
   },
   "modality_counts": {
+    "text": 400,
     "image": 0,
+    "video": 0,
     "audio": 0
   },
   "last_modality": null

trainer_state.json CHANGED Viewed

@@ -1,32 +1,32 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric": 3.869171884744816,
-  "epoch": 4,
-  "epochs_completed": 4,
-  "global_step": 298,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
-  "max_steps": 298,
-  "num_train_epochs": 4,
   "total_flos": 0,
   "train_batch_size": 1,
   "effective_batch_size": 16,
   "learning_rate": 0.0001,
   "max_grad_norm": 1.0,
   "trainable_components": [
-    "vision",
-    "video",
     "llm",
     "cross_attention",
-    "video_generation",
     "modality_markers"
   ],
   "frozen_components": [
     "audio",
     "speech",
-    "image_generation"
   ],
   "trial_name": null,
   "trial_params": null

 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
+  "best_metric": 6.958861378133297,
+  "epoch": 5,
+  "epochs_completed": 5,
+  "global_step": 250,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
+  "max_steps": 250,
+  "num_train_epochs": 5,
   "total_flos": 0,
   "train_batch_size": 1,
   "effective_batch_size": 16,
   "learning_rate": 0.0001,
   "max_grad_norm": 1.0,
   "trainable_components": [
     "llm",
     "cross_attention",
     "modality_markers"
   ],
   "frozen_components": [
+    "vision",
+    "video",
     "audio",
     "speech",
+    "image_generation",
+    "video_generation"
   ],
   "trial_name": null,
   "trial_params": null

training_state.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9b37a03cba59de5ddbc9ab88c301e76b8a0fa5bc81d6d471cbefe513d0699cf
-size 724684421

 version https://git-lfs.github.com/spec/v1
+oid sha256:a751ecf22021470154d58846b700d04286522c14cda7393ece31f907eff5a2c7
+size 1514911851