Training in progress - step 500
Browse files- model.safetensors +2 -2
- projectors.py +101 -133
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:720402f0e8107015c77907789cc8b20307741b0008412ad5e2cffc08462ae5c9
|
| 3 |
+
size 265642600
|
projectors.py
CHANGED
|
@@ -237,8 +237,8 @@ class SwiGLU(nn.Module):
|
|
| 237 |
|
| 238 |
class SwiGLUAudioProjector(nn.Module):
|
| 239 |
"""
|
| 240 |
-
|
| 241 |
-
|
| 242 |
"""
|
| 243 |
|
| 244 |
def __init__(self, config):
|
|
@@ -247,154 +247,44 @@ class SwiGLUAudioProjector(nn.Module):
|
|
| 247 |
encoder_dim = config.encoder_dim
|
| 248 |
llm_dim = config.llm_dim
|
| 249 |
|
| 250 |
-
#
|
| 251 |
-
|
| 252 |
-
hidden_dim = int(encoder_dim * 2)
|
| 253 |
|
| 254 |
-
#
|
| 255 |
-
|
| 256 |
-
swiglu_inner = int(hidden_dim * 8 / 3)
|
| 257 |
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
out_channels=hidden_dim,
|
| 261 |
-
kernel_size=self.k,
|
| 262 |
-
stride=self.k,
|
| 263 |
-
padding=0,
|
| 264 |
-
)
|
| 265 |
|
|
|
|
| 266 |
self.norm = LlamaRMSNorm(hidden_dim, eps=1e-8)
|
| 267 |
|
|
|
|
|
|
|
| 268 |
self.proj = SwiGLU(hidden_dim, swiglu_inner, llm_dim)
|
| 269 |
|
| 270 |
-
self.apply(self._init_weights)
|
| 271 |
-
|
| 272 |
-
def _init_weights(self, m):
|
| 273 |
-
if isinstance(m, (nn.Linear, nn.Conv1d)):
|
| 274 |
-
nn.init.trunc_normal_(m.weight, std=0.02)
|
| 275 |
-
if m.bias is not None:
|
| 276 |
-
nn.init.constant_(m.bias, 0)
|
| 277 |
-
|
| 278 |
def forward(self, x):
|
| 279 |
# x: [Batch, Seq, Dim]
|
| 280 |
batch, seq, dim = x.shape
|
| 281 |
|
| 282 |
-
#
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
|
|
|
| 286 |
|
| 287 |
-
# [B, S, D] -> [B,
|
| 288 |
-
x = x.
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
x = self.
|
| 292 |
-
|
| 293 |
-
# [B, D, S] -> [B, S, D]
|
| 294 |
-
x = x.transpose(1, 2)
|
| 295 |
|
| 296 |
-
# Norm &
|
| 297 |
x = self.norm(x)
|
| 298 |
return self.proj(x)
|
| 299 |
|
| 300 |
def get_output_length(self, input_length: int) -> int:
|
| 301 |
-
return (input_length
|
| 302 |
-
|
| 303 |
-
# =============================================================================
|
| 304 |
-
# Residual Projector
|
| 305 |
-
# =============================================================================
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
class ResidualMLP(nn.Module):
    """MLP block with residual connection: Output = x + MLP(x)."""

    def __init__(self, dim, hidden_dim):
        super().__init__()
        # NOTE: attribute names fc1/fc2 are part of the public surface --
        # ResidualAudioProjector._init_weights initializes them by name.
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, dim)
        self.act = nn.GELU()

    def forward(self, x):
        # Skip connection around the two-layer GELU MLP.
        return x + self.fc2(self.act(self.fc1(x)))
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
class ResidualAudioProjector(nn.Module):
    """Residual MLP projector for audio-to-LLM feature translation.

    Downsamples the encoder sequence by frame-stacking ``k`` consecutive
    frames, projects to the LLM hidden size, then applies ``num_layers``
    residual MLP blocks, each followed by RMSNorm.
    """

    def __init__(self, config):
        # config attributes read here: encoder_dim, llm_dim, and optionally
        # projector_pool_stride / projector_hidden_dim / projector_num_layers.
        super().__init__()

        # Temporal downsampling factor (frames stacked per output step).
        self.k = getattr(config, "projector_pool_stride", 4)
        in_dim = config.encoder_dim * self.k
        out_dim = config.llm_dim
        # Fall back to 4x the LLM dim when no explicit hidden dim is given.
        hidden_dim = getattr(config, "projector_hidden_dim", None) or out_dim * 4
        self.num_layers = getattr(config, "projector_num_layers", 2)

        # Stacked frames -> LLM dim, normalized before the residual stack.
        self.input_proj = nn.Linear(in_dim, out_dim)
        self.ln_input = LlamaRMSNorm(out_dim, eps=1e-8)

        self.layers = nn.ModuleList(
            [ResidualMLP(out_dim, hidden_dim) for _ in range(self.num_layers)]
        )
        self.layer_norms = nn.ModuleList(
            [LlamaRMSNorm(out_dim, eps=1e-8) for _ in range(self.num_layers)]
        )

        self._init_weights(config)

    def _init_weights(self, config):
        """Initialize weights; residual fc2 gets a 10x smaller std so each
        block starts close to identity."""
        std = getattr(config, "projector_init_std", 0.02)

        with torch.no_grad():
            nn.init.normal_(self.input_proj.weight, mean=0.0, std=std)
            if self.input_proj.bias is not None:
                nn.init.zeros_(self.input_proj.bias)

            # RMSNorm scales start at 1 (identity scaling).
            self.ln_input.weight.data.fill_(1.0)
            for ln in self.layer_norms:
                ln.weight.data.fill_(1.0)

            for layer in self.layers:
                nn.init.normal_(layer.fc1.weight, mean=0.0, std=std)
                # Smaller std on the output projection of the residual branch.
                nn.init.normal_(layer.fc2.weight, mean=0.0, std=std * 0.1)
                if layer.fc1.bias is not None:
                    nn.init.zeros_(layer.fc1.bias)
                if layer.fc2.bias is not None:
                    nn.init.zeros_(layer.fc2.bias)

    def get_output_length(self, input_length: int) -> int:
        """Calculate output sequence length given input length."""
        # Temporal pooling with stride k
        # Round up to a multiple of k (forward() pads the same way).
        remainder = input_length % self.k
        if remainder:
            input_length += self.k - remainder
        return input_length // self.k

    def forward(self, x):
        # x: [batch, seq, encoder_dim] -- assumed 3-D; .size() unpack enforces it.
        batch_size, seq_len, dim = x.size()

        # Cast input to the projector's parameter dtype if needed.
        target_dtype = self.input_proj.weight.dtype
        if x.dtype != target_dtype:
            x = x.to(target_dtype)

        # Right-pad the time axis so seq_len is a multiple of k.
        remainder = seq_len % self.k
        if remainder:
            pad_len = self.k - remainder
            x = F.pad(x, (0, 0, 0, pad_len))

        # Frame stacking: [B, S, D] -> [B, S/k, D*k], then project + norm.
        x = x.contiguous().view(batch_size, -1, dim * self.k)
        x = self.input_proj(x)
        x = self.ln_input(x)

        # Residual MLP stack, RMSNorm after every block.
        for layer, ln in zip(self.layers, self.layer_norms):
            x = layer(x)
            x = ln(x)

        return x
|
| 398 |
|
| 399 |
|
| 400 |
# =============================================================================
|
|
@@ -688,6 +578,84 @@ class QFormerAudioProjector(nn.Module):
|
|
| 688 |
return self.linear(query_proj)
|
| 689 |
|
| 690 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
# =============================================================================
|
| 692 |
# Projector Registry
|
| 693 |
# =============================================================================
|
|
@@ -696,7 +664,7 @@ PROJECTOR_CLASSES = {
|
|
| 696 |
"mlp": MLPAudioProjector,
|
| 697 |
"mosa": MOSAProjector,
|
| 698 |
"swiglu": SwiGLUAudioProjector,
|
| 699 |
-
"residual": ResidualAudioProjector,
|
| 700 |
"shared_moe": SharedMoEAudioProjector,
|
| 701 |
"qformer": QFormerAudioProjector,
|
|
|
|
| 702 |
}
|
|
|
|
| 237 |
|
| 238 |
class SwiGLUAudioProjector(nn.Module):
|
| 239 |
"""
|
| 240 |
+
SwiGLU projector with frame stacking (FunASR-style).
|
| 241 |
+
Uses frame stacking for downsampling, linear projection, then SwiGLU.
|
| 242 |
"""
|
| 243 |
|
| 244 |
def __init__(self, config):
|
|
|
|
| 247 |
encoder_dim = config.encoder_dim
|
| 248 |
llm_dim = config.llm_dim
|
| 249 |
|
| 250 |
+
# Frame stacking input dimension
|
| 251 |
+
in_dim = encoder_dim * self.k # 1280 * 4 = 5120
|
|
|
|
| 252 |
|
| 253 |
+
# Hidden dim after initial projection (balanced compression like transformer)
|
| 254 |
+
hidden_dim = getattr(config, "projector_hidden_dim", None) or 4096
|
|
|
|
| 255 |
|
| 256 |
+
# Initial linear projection (frame stacking → hidden)
|
| 257 |
+
self.linear = nn.Linear(in_dim, hidden_dim)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
+
# Norm before SwiGLU
|
| 260 |
self.norm = LlamaRMSNorm(hidden_dim, eps=1e-8)
|
| 261 |
|
| 262 |
+
# SwiGLU with 8/3 expansion ratio
|
| 263 |
+
swiglu_inner = int(hidden_dim * 8 / 3)
|
| 264 |
self.proj = SwiGLU(hidden_dim, swiglu_inner, llm_dim)
|
| 265 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
def forward(self, x):
|
| 267 |
# x: [Batch, Seq, Dim]
|
| 268 |
batch, seq, dim = x.shape
|
| 269 |
|
| 270 |
+
# Padding to multiple of k
|
| 271 |
+
chunk_num = (seq - 1) // self.k + 1
|
| 272 |
+
pad_num = chunk_num * self.k - seq
|
| 273 |
+
if pad_num > 0:
|
| 274 |
+
x = F.pad(x, (0, 0, 0, pad_num))
|
| 275 |
|
| 276 |
+
# Frame stacking: [B, S, D] -> [B, S/k, D*k]
|
| 277 |
+
x = x.contiguous().view(batch, chunk_num, dim * self.k)
|
| 278 |
|
| 279 |
+
# Linear projection
|
| 280 |
+
x = self.linear(x)
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
+
# Norm & SwiGLU
|
| 283 |
x = self.norm(x)
|
| 284 |
return self.proj(x)
|
| 285 |
|
| 286 |
def get_output_length(self, input_length: int) -> int:
|
| 287 |
+
return (input_length - 1) // self.k + 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
|
| 290 |
# =============================================================================
|
|
|
|
| 578 |
return self.linear(query_proj)
|
| 579 |
|
| 580 |
|
| 581 |
+
# =============================================================================
|
| 582 |
+
# Transformer Projector
|
| 583 |
+
# =============================================================================
|
| 584 |
+
|
| 585 |
+
|
| 586 |
+
class TransformerAudioProjector(nn.Module):
    """
    Transformer Projector (FunASR Style).
    Projects to LLM dim first, then applies transformer blocks for context mixing.

    Pipeline: frame-stack k frames -> linear1 -> ReLU -> linear2 (LLM dim)
    -> optional TransformerEncoder blocks.
    """

    def __init__(self, config):
        # config attributes read here: encoder_dim, llm_dim, and optionally
        # projector_pool_stride / projector_hidden_dim / projector_num_layers /
        # projector_num_heads / projector_block_ffn_dim.
        super().__init__()
        # Default stride 6: Whisper (2x) * Projector (6x) = 12x total → ~8 Hz
        # Matches FunASR's total stride (6x encoder * 2x projector = 12x)
        self.k = getattr(config, "projector_pool_stride", 6)

        encoder_dim = config.encoder_dim
        llm_dim = config.llm_dim

        # Input: stacked frames (e.g. 1280 * 6 = 7680)
        in_dim = encoder_dim * self.k

        # FFN hidden dim for initial projection (balanced compression)
        # e.g. 7680 → 4096 → 2048 distributes compression evenly (~2x each layer)
        ffn_dim = getattr(config, "projector_hidden_dim", None) or 4096

        # FunASR-style projection: linear1 -> relu -> linear2
        self.linear1 = nn.Linear(in_dim, ffn_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(ffn_dim, llm_dim)

        # Transformer blocks operating at llm_dim
        num_layers = getattr(config, "projector_num_layers", 2)
        if num_layers > 0:
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=llm_dim,
                nhead=getattr(config, "projector_num_heads", 8),
                # Was hard-coded to 1024 (FunASR default: audio complexity is
                # LLM-independent); now configurable, same default.
                dim_feedforward=getattr(config, "projector_block_ffn_dim", 1024),
                dropout=0.0,
                activation="relu",
                batch_first=True,
                norm_first=True,
            )
            self.blocks = nn.TransformerEncoder(
                encoder_layer, num_layers=num_layers, enable_nested_tensor=False
            )
        else:
            # num_layers == 0 degenerates to a pure MLP projector.
            self.blocks = None

    def forward(self, x):
        # x: [Batch, Seq, Dim]
        batch, seq, dim = x.shape

        # Right-pad the time axis to a multiple of k (ceil division).
        chunk_num = (seq - 1) // self.k + 1
        pad_num = chunk_num * self.k - seq
        if pad_num > 0:
            x = F.pad(x, (0, 0, 0, pad_num))

        # Frame stacking: [B, S, D] -> [B, S/k, D*k]
        x = x.contiguous().view(batch, chunk_num, dim * self.k)

        # FunASR-style projection to LLM dim
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)

        # Transformer context mixing
        if self.blocks is not None:
            x = self.blocks(x)

        return x

    def get_output_length(self, input_length: int) -> int:
        """Output sequence length for a given input length (ceil(len / k))."""
        return (input_length - 1) // self.k + 1
|
| 657 |
+
|
| 658 |
+
|
| 659 |
# =============================================================================
|
| 660 |
# Projector Registry
|
| 661 |
# =============================================================================
|
|
|
|
| 664 |
"mlp": MLPAudioProjector,
|
| 665 |
"mosa": MOSAProjector,
|
| 666 |
"swiglu": SwiGLUAudioProjector,
|
|
|
|
| 667 |
"shared_moe": SharedMoEAudioProjector,
|
| 668 |
"qformer": QFormerAudioProjector,
|
| 669 |
+
"transformer": TransformerAudioProjector,
|
| 670 |
}
|