mazesmazes committed
Commit 311a9ba · verified · 1 parent: 1535c89

Training in progress - step 500

Files changed (1)
projectors.py +23 -24
projectors.py CHANGED

@@ -77,19 +77,17 @@ import torch.nn.functional as F
 # =============================================================================
 
 
-class SwiGLUExpert(nn.Module):
-    """SwiGLU expert MLP."""
+class SimpleAdapter(nn.Module):
+    """Simple 2-layer ReLU adapter (from MOSA paper)."""
 
     def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
         super().__init__()
-        # Bias=False is strictly preferred for MoE experts to reduce memory/compute
-        self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
-        self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
-        self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)
-        self.act = nn.SiLU()
+        self.fc1 = nn.Linear(input_dim, hidden_dim)
+        self.act = nn.ReLU()
+        self.fc2 = nn.Linear(hidden_dim, output_dim)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
+        return self.fc2(self.act(self.fc1(x)))
 
 
 class MOSAProjector(nn.Module):
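
The new expert can be sanity-checked in isolation. A minimal usage sketch of the `SimpleAdapter` defined in the hunk above; the dimensions below are placeholders chosen for illustration (the real `llm_dim` comes from the model config, and 4096 matches the `adapter_hidden_dim` fallback visible in the next hunk):

```python
import torch
import torch.nn as nn

class SimpleAdapter(nn.Module):
    """Simple 2-layer ReLU adapter, as defined in the hunk above."""
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.act = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(self.act(self.fc1(x)))

# Placeholder dims: 2048 stands in for llm_dim (read from config in the
# real code); 4096 matches the adapter_hidden_dim default.
adapter = SimpleAdapter(2048, 4096, 2048)
x = torch.randn(2, 16, 2048)   # (batch, seq, llm_dim)
print(adapter(x).shape)        # torch.Size([2, 16, 2048])
```
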
@@ -100,9 +98,9 @@ class MOSAProjector(nn.Module):
         self.num_experts = getattr(config, "num_experts", None) or 8
         adapter_hidden = getattr(config, "adapter_hidden_dim", None) or 4096
 
-        # Auxiliary loss coefficients (same defaults as SharedMoE)
-        self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.001)
-        self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)
+        # Auxiliary loss coefficients (MOSA paper uses only cross-entropy, no aux losses)
+        self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.0)
+        self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.0)
 
         # Store router state for aux loss computation
         self.last_router_logits = None
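
The coefficients now default to 0.0 (previously 0.001), but `last_router_logits` is still cached, so the auxiliary-loss hooks stay in place. For context, a sketch of how such terms are conventionally computed from router logits, in the Switch Transformer style; the helper below is hypothetical, not code from this repo:

```python
import torch
import torch.nn.functional as F

def router_aux_losses(router_logits: torch.Tensor, num_experts: int):
    """Hypothetical helper: Switch-Transformer-style auxiliary losses.

    router_logits: (num_tokens, num_experts) raw router outputs; the
    results would be scaled by aux_loss_coef / z_loss_coef (both 0.0
    here) before being added to the training loss.
    """
    probs = F.softmax(router_logits, dim=-1)                  # (T, E)
    # Load balancing: fraction of tokens per expert (top-1 assignment)
    # times mean router probability per expert, summed over experts.
    assignment = F.one_hot(probs.argmax(dim=-1), num_experts).float()
    tokens_per_expert = assignment.mean(dim=0)                # f_e
    prob_per_expert = probs.mean(dim=0)                       # P_e
    load_balancing = num_experts * (tokens_per_expert * prob_per_expert).sum()
    # z-loss: penalize large logit magnitudes for numerical stability.
    z_loss = torch.logsumexp(router_logits, dim=-1).pow(2).mean()
    return load_balancing, z_loss
```
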
@@ -119,24 +117,22 @@ class MOSAProjector(nn.Module):
             nn.SiLU(),
         )
 
-        # --- 3. Deep Router (Standardized to SiLU) ---
-        # Kept your deep architecture, but added Norms between heavy layers
-        # to prevent "dead neurons" in the router.
+        # --- 3. Deep Router (ReLU per MOSA paper) ---
         self.router = nn.Sequential(
             nn.Linear(self.encoder_dim, 2560),
-            nn.SiLU(),
+            nn.ReLU(),
             nn.Linear(2560, 5120),
-            nn.SiLU(),
+            nn.ReLU(),
             nn.Linear(5120, 2560),
-            nn.SiLU(),
+            nn.ReLU(),
             nn.Linear(2560, 1280),
-            nn.SiLU(),
+            nn.ReLU(),
             nn.Linear(1280, self.num_experts),
         )
 
-        # --- 4. Experts (SwiGLU for LLM compatibility) ---
+        # --- 4. Experts (Simple 2-layer ReLU adapters per MOSA paper) ---
         self.experts = nn.ModuleList([
-            SwiGLUExpert(self.llm_dim, adapter_hidden, self.llm_dim)
+            SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim)
             for _ in range(self.num_experts)
         ])
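
This hunk defines the router stack and the expert list, but the dispatch that connects them lives in `forward`, outside the diff. As a rough guide, a sketch of one conventional top-k dispatch; note the router here scores encoder features (1280-d input) while the experts transform LLM-dim features, so the sketch keeps the two inputs separate. All names and the `top_k` value are assumptions, not code from this repo:

```python
import torch
import torch.nn.functional as F

def moe_dispatch(enc_feats, llm_feats, router, experts, top_k=2):
    """Hypothetical dispatch; MOSAProjector.forward itself is not shown here.

    enc_feats: (B, S, encoder_dim) features the router scores.
    llm_feats: (B, S, llm_dim) features the experts transform.
    """
    logits = router(enc_feats)                    # (B, S, num_experts)
    topw, topi = logits.topk(top_k, dim=-1)       # keep the k best experts
    topw = F.softmax(topw, dim=-1)                # renormalize their gates
    out = torch.zeros_like(llm_feats)             # experts map llm_dim -> llm_dim
    for e, expert in enumerate(experts):
        # Gate for expert e wherever it appears in the top-k, else 0.
        gate = (topw * (topi == e)).sum(dim=-1, keepdim=True)
        out = out + gate * expert(llm_feats)
    return out, logits                            # logits feed the aux losses
```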
 
@@ -159,11 +155,14 @@ class MOSAProjector(nn.Module):
         # Force the LAST router layer to be small (but not zero)
         nn.init.normal_(self.router[-1].weight, std=0.01)
 
-        # --- 2. Expert Initialization (SwiGLU) ---
+        # --- 2. Expert Initialization (Simple ReLU adapter) ---
         for expert in self.experts:
-            nn.init.orthogonal_(expert.gate_proj.weight)
-            nn.init.orthogonal_(expert.up_proj.weight)
-            nn.init.orthogonal_(expert.down_proj.weight, gain=0.5)
+            nn.init.kaiming_normal_(expert.fc1.weight, mode='fan_in', nonlinearity='relu')
+            nn.init.kaiming_normal_(expert.fc2.weight, mode='fan_in', nonlinearity='relu')
+            if expert.fc1.bias is not None:
+                nn.init.zeros_(expert.fc1.bias)
+            if expert.fc2.bias is not None:
+                nn.init.zeros_(expert.fc2.bias)
 
     def forward(self, x):
         # x: (B, S, 1280)
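
The move from orthogonal to Kaiming initialization matches the new ReLU experts: fan-in Kaiming scaling is designed to keep the second moment of activations roughly constant through a Linear + ReLU pair. A quick empirical check (a sketch, not part of the commit):

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
layer = nn.Linear(4096, 4096)
nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
nn.init.zeros_(layer.bias)

x = torch.randn(8192, 4096)       # unit-scale input
y = torch.relu(layer(x))
# The sqrt(2/fan_in) gain gives pre-activations variance ~2; ReLU keeps
# half the mass, so the output second moment E[y^2] stays near 1.
print(x.pow(2).mean().item())     # ~1.0
print(y.pow(2).mean().item())     # ~1.0
```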
 