mazesmazes
/

tiny-audio-transformer

@@ -605,9 +605,9 @@ class TransformerAudioProjector(nn.Module):
     def __init__(self, config):
         super().__init__()
-        # Default stride 4: Whisper (2x) * Projector (4x) = 8x total → ~12.5 Hz
-        # Similar to FunASR's 6x total (~16.67 Hz)
-        self.k = getattr(config, "projector_pool_stride", 4)
         encoder_dim = config.encoder_dim
         llm_dim = config.llm_dim
@@ -615,8 +615,9 @@ class TransformerAudioProjector(nn.Module):
         # Input: Stacked frames (e.g. 1280 * 2 = 2560)
         in_dim = encoder_dim * self.k
-        # FFN hidden dim for initial projection (FunASR default: 2048)
-        ffn_dim = getattr(config, "projector_hidden_dim", None) or 2048
         # FunASR-style projection: linear1 -> relu -> linear2
         self.linear1 = nn.Linear(in_dim, ffn_dim)
@@ -629,27 +630,18 @@ class TransformerAudioProjector(nn.Module):
             encoder_layer = nn.TransformerEncoderLayer(
                 d_model=llm_dim,
                 nhead=getattr(config, "projector_num_heads", 8),
-                dim_feedforward=llm_dim // 4,  # FunASR uses quarter size
                 dropout=0.0,
                 activation="relu",
                 batch_first=True,
                 norm_first=True,
             )
-            self.blocks = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
         else:
             self.blocks = None
-        # Final Norm for stability when projecting to frozen LLM
-        self.norm = LlamaRMSNorm(llm_dim, eps=1e-8)
-        self.apply(self._init_weights)
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            nn.init.trunc_normal_(m.weight, std=0.02)
-            if m.bias is not None:
-                nn.init.zeros_(m.bias)
     def forward(self, x):
         # x: [Batch, Seq, Dim]
         batch, seq, dim = x.shape
@@ -672,7 +664,7 @@ class TransformerAudioProjector(nn.Module):
         if self.blocks is not None:
             x = self.blocks(x)
-        return self.norm(x)
     def get_output_length(self, input_length: int) -> int:
         """Return the output sequence length for ``input_length`` input frames.

         For a positive integer stride ``self.k`` this equals the ceiling of
         ``input_length / self.k``, computed with integer arithmetic only.
         """
         stride = self.k
         # Floor-divide the last valid index, then add one to count frames.
         full_steps = (input_length - 1) // stride
         return full_steps + 1

     def __init__(self, config):
         super().__init__()
+        # Default stride 6: Whisper (2x) * Projector (6x) = 12x total → ~8 Hz
+        # Matches FunASR's total stride (6x encoder * 2x projector = 12x)
+        self.k = getattr(config, "projector_pool_stride", 6)
         encoder_dim = config.encoder_dim
         llm_dim = config.llm_dim
         # Input: Stacked frames (e.g. 1280 * 6 = 7680 with the default stride of 6)
         in_dim = encoder_dim * self.k
+        # FFN hidden dim for initial projection (balanced compression)
+        # 7680 → 4096 → 2048 distributes compression evenly (~2x each layer)
+        ffn_dim = getattr(config, "projector_hidden_dim", None) or 4096
         # FunASR-style projection: linear1 -> relu -> linear2
         self.linear1 = nn.Linear(in_dim, ffn_dim)
             encoder_layer = nn.TransformerEncoderLayer(
                 d_model=llm_dim,
                 nhead=getattr(config, "projector_num_heads", 8),
+                dim_feedforward=1024,  # Match FunASR (audio complexity is LLM-independent)
                 dropout=0.0,
                 activation="relu",
                 batch_first=True,
                 norm_first=True,
             )
+            self.blocks = nn.TransformerEncoder(
+                encoder_layer, num_layers=num_layers, enable_nested_tensor=False
+            )
         else:
             self.blocks = None
     def forward(self, x):
         # x: [Batch, Seq, Dim]
         batch, seq, dim = x.shape
         if self.blocks is not None:
             x = self.blocks(x)
+        return x
     def get_output_length(self, input_length: int) -> int:
         """Return the output sequence length for ``input_length`` input frames.

         For a positive integer stride ``self.k`` this equals the ceiling of
         ``input_length / self.k``, computed with integer arithmetic only.
         """
         stride = self.k
         # Floor-divide the last valid index, then add one to count frames.
         full_steps = (input_length - 1) // stride
         return full_steps + 1