mazesmazes committed · verified
Commit 4bc741d · Parent(s): 9c03397

Training in progress - step 500

Files changed (1): projectors.py (+75 -51)
projectors.py CHANGED
@@ -76,19 +76,31 @@ import torch.nn.functional as F
 # MoE Projector (MOSA-style)
 # =============================================================================
 
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
+class RMSNorm(nn.Module):
+    """Standard RMSNorm for 2025 architectures."""
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
 
-class SimpleAdapter(nn.Module):
-    """Simple 2-layer adapter with ReLU (as per MOSA paper)."""
+    def forward(self, x):
+        var = torch.mean(x ** 2, dim=-1, keepdim=True)
+        x_normed = x * torch.rsqrt(var + self.eps)
+        return self.weight * x_normed
 
+class SimpleAdapter(nn.Module):
+    """
+    Updated Adapter:
+    1. Uses SiLU (better for LLM alignment).
+    2. Relies on the projector's RMSNorm layers for MoE stability.
+    """
     def __init__(self, in_dim, hidden_dim, out_dim):
         super().__init__()
         self.fc1 = nn.Linear(in_dim, hidden_dim)
-        self.act = nn.ReLU()
+        self.act = nn.SiLU()  # Changed from ReLU to SiLU
        self.fc2 = nn.Linear(hidden_dim, out_dim)
-
+        # Optional: Add Dropout if training on small datasets
+
     def forward(self, x):
         return self.fc2(self.act(self.fc1(x)))
 
@@ -100,7 +112,10 @@ class MOSAProjector(nn.Module):
         self.num_experts = getattr(config, "num_experts", None) or 8
         adapter_hidden = getattr(config, "adapter_hidden_dim", None) or 4096
 
-        # 1. Convolutional Subsampling (Stride 4 total)
+        # --- 1. Pre-Norms (CRITICAL for stability) ---
+        self.in_norm = RMSNorm(self.encoder_dim)
+
+        # --- 2. Convolutional Subsampling (Stride 4) ---
         self.conv = nn.Sequential(
             nn.Conv1d(self.encoder_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
             nn.SiLU(),
@@ -108,87 +123,93 @@ class MOSAProjector(nn.Module):
             nn.SiLU(),
         )
 
-        # 2. Router (MOSA-Large: 1280 -> 2560 -> 5120 -> 2560 -> 1280 -> num_experts)
-        # Deep router with ReLU for better expert sparsity (as per paper)
-        # Router operates on pooled features (same receptive field as conv)
+        # --- 3. Deep Router (Standardized to SiLU) ---
+        # Keeps the deep architecture; SiLU between the heavy layers
+        # helps prevent "dead neurons" in the router.
         self.router = nn.Sequential(
             nn.Linear(self.encoder_dim, 2560),
-            nn.ReLU(),
+            nn.SiLU(),
             nn.Linear(2560, 5120),
-            nn.ReLU(),
+            nn.SiLU(),
             nn.Linear(5120, 2560),
-            nn.ReLU(),
+            nn.SiLU(),
             nn.Linear(2560, 1280),
-            nn.ReLU(),
+            nn.SiLU(),
             nn.Linear(1280, self.num_experts),
         )
 
-        # 3. Experts
+        # --- 4. Experts ---
         self.experts = nn.ModuleList([
             SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim)
             for _ in range(self.num_experts)
         ])
+
+        # --- 5. Output Norm ---
+        # Projector outputs often drift in magnitude; this clamps them before the LLM.
+        self.out_norm = RMSNorm(self.llm_dim)
 
         self._init_weights()
 
     def _init_weights(self):
-        """Initialize weights for stable training."""
-        for m in self.modules():
-            if isinstance(m, (nn.Linear, nn.Conv1d)):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu')
-                if m.bias is not None:
-                    nn.init.zeros_(m.bias)
-
-        # Scale down expert output projections for stable residual-like behavior
+        # --- 1. Router Initialization ---
+        # The router is 5 layers deep. We need Kaiming Init to ensure
+        # gradients can penetrate to the first layer.
+        for module in self.router.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+
+        # Force the LAST router layer to be small (but not zero)
+        nn.init.normal_(self.router[-1].weight, std=0.01)
+
+        # --- 2. Expert Initialization ---
         for expert in self.experts:
-            with torch.no_grad():
-                expert.fc2.weight.data.mul_(0.1)
-
-        # Initialize final router layer with small weights for uniform initial routing
-        # This prevents one expert from dominating at the start of training
-        with torch.no_grad():
-            final_router_layer = self.router[-1]  # Last linear layer
-            nn.init.normal_(final_router_layer.weight, mean=0.0, std=0.01)
-            if final_router_layer.bias is not None:
-                nn.init.zeros_(final_router_layer.bias)
+            nn.init.kaiming_uniform_(expert.fc1.weight, a=math.sqrt(5))
+            nn.init.xavier_uniform_(expert.fc2.weight)
+            if expert.fc2.bias is not None:
+                nn.init.zeros_(expert.fc2.bias)
 
     def forward(self, x):
         # x: (B, S, 1280)
         batch_size, seq_len, _ = x.shape
+
+        # Apply Input Norm
+        x = self.in_norm(x)
 
         # --- 1. Conv Branch ---
-        # Downsample: S -> S//4, expand: 1280 -> llm_dim
-        x_trans = x.permute(0, 2, 1)  # (B, 1280, S)
+        x_trans = x.permute(0, 2, 1)  # (B, D, S)
         h_conv = self.conv(x_trans).permute(0, 2, 1)  # (B, S//4, llm_dim)
 
         # --- 2. Router Branch ---
-        # Pool input BEFORE routing so router sees same receptive field as conv
-        # This is more principled than post-hoc averaging of per-frame decisions
         pad_amt = (4 - (seq_len % 4)) % 4
         if pad_amt > 0:
-            x_padded = F.pad(x, (0, 0, 0, pad_amt))  # Pad sequence dim
+            x_padded = F.pad(x, (0, 0, 0, pad_amt))
         else:
            x_padded = x
 
-        # Average pool to match conv stride (B, S, 1280) -> (B, S//4, 1280)
-        x_pooled = x_padded.view(batch_size, -1, 4, self.encoder_dim).mean(dim=2)
+        # Mean pool to align receptive fields
+        x_pooled = x_padded.view(batch_size, -1, 4, self.encoder_dim).mean(dim=2)  # (B, S//4, D)
 
-        # Router makes 1 informed decision per pooled token
+        # Router Logits
         router_logits = self.router(x_pooled)  # (B, S//4, num_experts)
+
+        # Softmax for Dense MoE (Soft Mixing)
         routing_weights = F.softmax(router_logits, dim=-1)
 
-        # --- 3. Expert Mixture ---
-        # expert_outputs shape: (num_experts, B, S//4, llm_dim)
-        expert_outputs = torch.stack([expert(h_conv) for expert in self.experts])
+        # --- 3. Expert Mixture (Dense Execution) ---
+        # Warning: High VRAM usage. Runs all experts.
+        # h_conv: (B, S//4, llm_dim)
+
+        # Stack approach is clean but memory hungry.
+        # Checkpointing could be added here if OOM occurs.
+        expert_outputs = torch.stack([expert(h_conv) for expert in self.experts])  # (E, B, S//4, D)
 
-        # Weighted sum of experts: (B, S//4, llm_dim)
+        # Weighted Sum
+        # (Experts, Batch, Seq, Dim) * (Batch, Seq, Experts) -> (Batch, Seq, Dim)
         final_out = torch.einsum('ebsd, bse -> bsd', expert_outputs, routing_weights)
 
-        return final_out
-
-    def get_aux_loss(self) -> torch.Tensor:
-        """MOSA uses only cross-entropy loss, so aux loss is 0."""
-        return torch.tensor(0.0, device=self.conv[0].weight.device)
+        return self.out_norm(final_out)
 
     def get_output_length(self, input_length: int) -> int:
         """Calculate output sequence length given input length."""
@@ -196,6 +217,9 @@ class MOSAProjector(nn.Module):
         padded = input_length + (4 - input_length % 4) % 4
         return padded // 4
 
+    def get_aux_loss(self) -> torch.Tensor:
+        """MOSA uses only cross-entropy loss, so aux loss is 0."""
+        return torch.tensor(0.0, device=self.conv[0].weight.device)
 
 # =============================================================================
 # SwiGLU Projector
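
One note on the new init code: `_init_weights` now calls `math.sqrt(5)`, and none of the hunks above shows an `import math` for projectors.py, so that import must already exist elsewhere in the file. Separately, the router branch's pad-then-pool arithmetic lines up with `get_output_length`; a minimal sketch (dummy tensors, illustrative shapes, not part of the commit):

import torch
import torch.nn.functional as F

# Illustrative shapes: B=2 utterances, S=10 encoder frames, D=1280
x = torch.randn(2, 10, 1280)

pad_amt = (4 - (10 % 4)) % 4               # 2 frames of right-padding
x_padded = F.pad(x, (0, 0, 0, pad_amt))    # pads the sequence dim -> (2, 12, 1280)
x_pooled = x_padded.view(2, -1, 4, 1280).mean(dim=2)

print(x_pooled.shape)                      # torch.Size([2, 3, 1280])
print((10 + (4 - 10 % 4) % 4) // 4)        # 3, matching get_output_length(10)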
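The `torch.einsum('ebsd, bse -> bsd', ...)` mixture is simply a routing-weighted sum over experts. A self-contained check with random tensors (shapes are illustrative, not from the commit):

import torch

E, B, S, D = 8, 2, 3, 16                   # experts, batch, pooled seq, dim
expert_outputs = torch.randn(E, B, S, D)
routing_weights = torch.softmax(torch.randn(B, S, E), dim=-1)

mix = torch.einsum('ebsd, bse -> bsd', expert_outputs, routing_weights)
ref = sum(routing_weights[..., e].unsqueeze(-1) * expert_outputs[e] for e in range(E))

print(torch.allclose(mix, ref, atol=1e-6))  # True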
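The dense-execution comment leaves checkpointing as a fallback "if OOM occurs". One possible shape for that, as a hedged sketch: `mix_experts_checkpointed` is a hypothetical helper, not part of the commit, using `torch.utils.checkpoint.checkpoint` to recompute each expert's activations during backward instead of storing them:

import torch
from torch.utils.checkpoint import checkpoint

def mix_experts_checkpointed(experts, h_conv, routing_weights):
    # Hypothetical OOM fallback: trade extra compute for lower peak memory
    # by recomputing each expert's activations in the backward pass.
    expert_outputs = torch.stack([
        checkpoint(expert, h_conv, use_reentrant=False)
        for expert in experts
    ])  # (E, B, S//4, D)
    return torch.einsum('ebsd, bse -> bsd', expert_outputs, routing_weights)

Under torch.no_grad() checkpointing saves nothing, so the plain torch.stack path in the commit is equivalent at inference time.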