Training in progress - step 500
- model.safetensors +2 -2
- projectors.py +48 -68
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c6cc1a109c001eab177849bb49fba0b584ab2bd29c03f209b74093d2cb9c1e9e
+size 509146304
projectors.py CHANGED
@@ -76,33 +76,21 @@ import torch.nn.functional as F
 # MoE Projector (MOSA-style)
 # =============================================================================
 
-class RMSNorm(nn.Module):
-    """Standard RMSNorm for 2025 architectures."""
-    def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
 
-    def forward(self, x):
-        var = x.pow(2).mean(-1, keepdim=True)
-        x_normed = x * torch.rsqrt(var + self.eps)
-        return self.weight * x_normed
+class SwiGLUExpert(nn.Module):
+    """SwiGLU expert MLP."""
 
-
-    """
-    Updated Adapter:
-    1. Uses SiLU (better for LLM alignment).
-    2. Includes internal Norm (crucial for MoE stability).
-    """
-    def __init__(self, in_dim, hidden_dim, out_dim):
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
         super().__init__()
-
-        self.
-        self.
-
-
-
-
+        # Bias=False is strictly preferred for MoE experts to reduce memory/compute
+        self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
+        self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
+        self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)
+        self.act = nn.SiLU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
+
 
 class MOSAProjector(nn.Module):
     def __init__(self, config):
@@ -112,8 +100,16 @@ class MOSAProjector(nn.Module):
         self.num_experts = getattr(config, "num_experts", None) or 8
         adapter_hidden = getattr(config, "adapter_hidden_dim", None) or 4096
 
+        # Auxiliary loss coefficients (same defaults as SharedMoE)
+        self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.02)
+        self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)
+
+        # Store router state for aux loss computation
+        self.last_router_logits = None
+        self.last_routing_weights = None
+
         # --- 1. Pre-Norms (CRITICAL for stability) ---
-        self.in_norm =
+        self.in_norm = LlamaRMSNorm(self.encoder_dim, eps=1e-8)
 
         # --- 2. Convolutional Subsampling (Stride 4) ---
         self.conv = nn.Sequential(
@@ -138,15 +134,15 @@ class MOSAProjector(nn.Module):
             nn.Linear(1280, self.num_experts),
         )
 
-        # --- 4. Experts ---
+        # --- 4. Experts (SwiGLU for LLM compatibility) ---
         self.experts = nn.ModuleList([
-
+            SwiGLUExpert(self.llm_dim, adapter_hidden, self.llm_dim)
             for _ in range(self.num_experts)
         ])
 
         # --- 5. Output Norm ---
         # Projects often drift in magnitude; this clamps them before the LLM.
-        self.out_norm =
+        self.out_norm = LlamaRMSNorm(self.llm_dim, eps=1e-8)
 
         self._init_weights()
 
@@ -163,12 +159,11 @@ class MOSAProjector(nn.Module):
         # Force the LAST router layer to be small (but not zero)
         nn.init.normal_(self.router[-1].weight, std=0.01)
 
-        # --- 2. Expert Initialization ---
+        # --- 2. Expert Initialization (SwiGLU) ---
         for expert in self.experts:
-            nn.init.
-            nn.init.
-
-            nn.init.zeros_(expert.fc2.bias)
+            nn.init.orthogonal_(expert.gate_proj.weight)
+            nn.init.orthogonal_(expert.up_proj.weight)
+            nn.init.orthogonal_(expert.down_proj.weight, gain=0.5)
 
     def forward(self, x):
         # x: (B, S, 1280)
@@ -193,10 +188,14 @@ class MOSAProjector(nn.Module):
 
         # Router Logits
         router_logits = self.router(x_pooled)  # (B, S//4, num_experts)
-
+
         # Softmax for Dense MoE (Soft Mixing)
        routing_weights = F.softmax(router_logits, dim=-1)
 
+        # Store for aux loss computation
+        self.last_router_logits = router_logits
+        self.last_routing_weights = routing_weights
+
         # --- 3. Expert Mixture (Dense Execution) ---
         # Warning: High VRAM usage. Runs all experts.
         # h_conv: (B, S//4, llm_dim)
@@ -218,8 +217,18 @@ class MOSAProjector(nn.Module):
         return padded // 4
 
     def get_aux_loss(self) -> torch.Tensor:
-        """
-
+        """Compute auxiliary losses: load balancing + z-loss."""
+        if self.last_router_logits is None:
+            return torch.tensor(0.0, device=self.conv[0].weight.device)
+
+        # Flatten for loss computation: (B, S, E) -> (B*S, E)
+        logits_flat = self.last_router_logits.view(-1, self.num_experts)
+        probs_flat = self.last_routing_weights.view(-1, self.num_experts)
+
+        balance = load_balancing_loss(probs_flat, self.num_experts, top_k=self.num_experts)
+        z = z_loss(logits_flat)
+
+        return self.aux_loss_coef * balance + self.z_loss_coef * z
 
 # =============================================================================
 # SwiGLU Projector
@@ -340,13 +349,13 @@ class ResidualAudioProjector(nn.Module):
         dropout_rate = getattr(config, "projector_dropout", 0.0)
 
         self.input_proj = nn.Linear(in_dim, out_dim)
-        self.ln_input = LlamaRMSNorm(out_dim, eps=1e-
+        self.ln_input = LlamaRMSNorm(out_dim, eps=1e-8)
 
         self.layers = nn.ModuleList(
             [ResidualMLP(out_dim, hidden_dim, dropout=dropout_rate) for _ in range(self.num_layers)]
         )
         self.layer_norms = nn.ModuleList(
-            [LlamaRMSNorm(out_dim, eps=1e-
+            [LlamaRMSNorm(out_dim, eps=1e-8) for _ in range(self.num_layers)]
         )
 
         self.output_dropout = nn.Dropout(dropout_rate)
@@ -408,35 +417,6 @@ class ResidualAudioProjector(nn.Module):
 # =============================================================================
 
 
-class RMSNorm(nn.Module):
-    """RMS Normalization (SOTA normalization for transformers)."""
-
-    def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def forward(self, x):
-        var = x.pow(2).mean(-1, keepdim=True)
-        x_normed = x * torch.rsqrt(var + self.eps)
-        return self.weight * x_normed
-
-
-class SwiGLUExpert(nn.Module):
-    """SwiGLU expert MLP."""
-
-    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
-        super().__init__()
-        # Bias=False is strictly preferred for MoE experts to reduce memory/compute
-        self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
-        self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
-        self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)
-        self.act = nn.SiLU()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
-
-
 class SharedMoEBlock(nn.Module):
     """MoE block with Shared + Sigmoid-Routed Experts."""
 
@@ -454,7 +434,7 @@ class SharedMoEBlock(nn.Module):
         self.output_dim = output_dim
 
         # RMSNorm before routing
-        self.norm =
+        self.norm = LlamaRMSNorm(input_dim, eps=1e-8)
 
         self.router = nn.Linear(input_dim, num_experts, bias=False)
         nn.init.normal_(self.router.weight, mean=0.0, std=0.02)
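Note on the normalization change: both local RMSNorm classes are deleted, and every norm in the touched modules now uses LlamaRMSNorm with eps=1e-8 (presumably the implementation bundled with Hugging Face transformers; the import is outside this diff). The removed class and its replacement compute the same normalization (LlamaRMSNorm additionally upcasts to float32 internally); a minimal sketch of that computation, written from the deleted code:

import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    # Scale each vector by the reciprocal RMS over the last dimension, then
    # apply a learned per-channel gain; this mirrors the deleted RMSNorm.forward.
    var = x.pow(2).mean(-1, keepdim=True)
    return weight * (x * torch.rsqrt(var + eps))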
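The forward hunk computes and caches the routing weights, but the dense mixing step itself ("Expert Mixture (Dense Execution)") lies outside the changed lines. A minimal sketch of what dense soft mixing with this tensor layout typically looks like; the function name and signature are illustrative, not the file's actual code:

import torch
import torch.nn as nn

def dense_moe_mix(h: torch.Tensor, experts: nn.ModuleList, routing_weights: torch.Tensor) -> torch.Tensor:
    # h: (B, T, D); routing_weights: (B, T, E). Every expert processes every
    # token (hence the "High VRAM usage" warning) and the outputs are blended
    # with the softmax routing weights.
    expert_outs = torch.stack([expert(h) for expert in experts], dim=-2)  # (B, T, E, D)
    return (routing_weights.unsqueeze(-1) * expert_outs).sum(dim=-2)      # (B, T, D)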
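get_aux_loss calls load_balancing_loss and z_loss, which are defined elsewhere in projectors.py and are not part of this diff. A sketch of what such helpers conventionally compute (Switch-Transformer-style load balancing plus an ST-MoE-style router z-loss), written against the call signatures above; the file's actual implementations may differ:

import torch
import torch.nn.functional as F

def load_balancing_loss(probs: torch.Tensor, num_experts: int, top_k: int) -> torch.Tensor:
    # probs: (tokens, num_experts) softmax routing probabilities.
    # Dot product of the fraction of tokens dispatched to each expert (hard
    # top-k counts) and the mean routing probability per expert; it is
    # minimized when expert usage is uniform.
    top_idx = probs.topk(top_k, dim=-1).indices                     # (tokens, top_k)
    dispatch = F.one_hot(top_idx, num_experts).sum(dim=1).float()   # (tokens, num_experts)
    tokens_per_expert = dispatch.mean(dim=0)
    prob_per_expert = probs.mean(dim=0)
    return num_experts * torch.sum(tokens_per_expert * prob_per_expert)

def z_loss(logits: torch.Tensor) -> torch.Tensor:
    # Penalizes large router logits so the softmax stays well conditioned:
    # mean squared logsumexp over the expert dimension.
    return torch.logsumexp(logits, dim=-1).pow(2).mean()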
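Since the projector now caches its router statistics during forward, the auxiliary term can be added to the training objective after each step. A hypothetical usage sketch; projector, audio_features, and compute_main_loss are illustrative stand-ins rather than names from this repository:

# Forward pass caches router logits/weights inside the projector, so
# get_aux_loss() can be called afterwards (it returns 0.0 before any forward).
hidden = projector(audio_features)        # (B, S//4, llm_dim)
loss = compute_main_loss(hidden)          # main objective, e.g. LM cross-entropy downstream
loss = loss + projector.get_aux_loss()    # load-balancing + z-loss, pre-scaled by their coefficients
loss.backward()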