Training in progress - step 500
projectors.py  CHANGED  +1 -23
@@ -140,29 +140,7 @@ class MOSAProjector(nn.Module):
         # Projects often drift in magnitude; this clamps them before the LLM.
         self.out_norm = LlamaRMSNorm(self.llm_dim, eps=1e-8)
 
-
-
-    def _init_weights(self):
-        # --- 1. Router Initialization ---
-        # The router is 5 layers deep. We need Kaiming Init to ensure
-        # gradients can penetrate to the first layer.
-        for module in self.router.modules():
-            if isinstance(module, nn.Linear):
-                nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
-                if module.bias is not None:
-                    nn.init.zeros_(module.bias)
-
-        # Force the LAST router layer to be small (but not zero)
-        nn.init.normal_(self.router[-1].weight, std=0.01)
-
-        # --- 2. Expert Initialization (Simple ReLU adapter) ---
-        for expert in self.experts:
-            nn.init.kaiming_normal_(expert.fc1.weight, mode='fan_in', nonlinearity='relu')
-            nn.init.kaiming_normal_(expert.fc2.weight, mode='fan_in', nonlinearity='relu')
-            if expert.fc1.bias is not None:
-                nn.init.zeros_(expert.fc1.bias)
-            if expert.fc2.bias is not None:
-                nn.init.zeros_(expert.fc2.bias)
+        # Using PyTorch default initialization (like MOSA paper)
 
     def forward(self, x):
         # x: (B, S, 1280)
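For context on what the new comment relies on: with _init_weights removed, every nn.Linear in the router and the experts keeps PyTorch's built-in reset_parameters(). In recent PyTorch versions that default is Kaiming-uniform on the weight with a = sqrt(5), which works out to roughly U(-1/sqrt(fan_in), +1/sqrt(fan_in)), plus a bias drawn from the same range. A minimal sketch for comparison with the deleted scheme; the helper name default_linear_init and the 1280 -> 4096 sizes are illustrative, not taken from this repo:

    import math
    import torch.nn as nn

    def default_linear_init(linear: nn.Linear) -> None:
        # Approximation of nn.Linear.reset_parameters() in recent PyTorch:
        # Kaiming-uniform weights with a=sqrt(5), i.e. roughly
        # U(-1/sqrt(fan_in), +1/sqrt(fan_in)), and a bias from the same range.
        nn.init.kaiming_uniform_(linear.weight, a=math.sqrt(5))
        if linear.bias is not None:
            fan_in = linear.in_features
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0.0
            nn.init.uniform_(linear.bias, -bound, bound)

    # Example: a projector-sized layer (1280 -> llm_dim, here assumed 4096)
    # starts with weights in roughly +/- 0.028 under this default.
    layer = nn.Linear(1280, 4096)
    default_linear_init(layer)

Compared with the deleted code, the visible differences are that the router's final layer now starts at the same ~1/sqrt(fan_in) scale instead of std=0.01, and the expert weights are uniform rather than Kaiming-normal; the out_norm RMSNorm kept above still clamps the projector output scale before it reaches the LLM either way.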