Training in progress - step 500

Browse files

Files changed (4)

asr_config.py +0 -2
asr_modeling.py +0 -53
model.safetensors +2 -2
projectors.py +44 -66

asr_config.py CHANGED

@@ -30,7 +30,6 @@ class ASRConfig(transformers.PretrainedConfig):
         num_experts: int = 4,  # Number of experts in MoE projectors
         num_experts_per_tok: int = 2,  # Top-k experts per token
         router_aux_loss_coef: float = 0.01,  # Auxiliary loss coefficient for load balancing
-        use_specaugment: bool = True,  # Apply SpecAugment during training
         # QFormer-specific configuration (Granite defaults)
         qformer_window_size: int = 15,  # Window size for QFormer processing
         qformer_hidden_size: Optional[int] = None,  # QFormer hidden size (defaults to encoder_dim)
@@ -79,7 +78,6 @@ class ASRConfig(transformers.PretrainedConfig):
         self.num_experts = num_experts
         self.num_experts_per_tok = num_experts_per_tok
         self.router_aux_loss_coef = router_aux_loss_coef
-        self.use_specaugment = use_specaugment
         # QFormer-specific configuration
         self.qformer_window_size = qformer_window_size
         self.qformer_hidden_size = qformer_hidden_size

         num_experts: int = 4,  # Number of experts in MoE projectors
         num_experts_per_tok: int = 2,  # Top-k experts per token
         router_aux_loss_coef: float = 0.01,  # Auxiliary loss coefficient for load balancing
         # QFormer-specific configuration (Granite defaults)
         qformer_window_size: int = 15,  # Window size for QFormer processing
         qformer_hidden_size: Optional[int] = None,  # QFormer hidden size (defaults to encoder_dim)
         self.num_experts = num_experts
         self.num_experts_per_tok = num_experts_per_tok
         self.router_aux_loss_coef = router_aux_loss_coef
         # QFormer-specific configuration
         self.qformer_window_size = qformer_window_size
         self.qformer_hidden_size = qformer_hidden_size

asr_modeling.py CHANGED

@@ -13,9 +13,6 @@ from transformers import (
 )
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.models.whisper.modeling_whisper import (
-    _compute_mask_indices,
-)
 try:
     from .asr_config import ASRConfig
@@ -269,53 +266,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         """Only save trainable projector weights."""
         return {f"projector.{k}": v for k, v in self.projector.state_dict().items()}
-    def _apply_specaugment(self, input_features: torch.Tensor) -> torch.Tensor:
-        if not getattr(self.config, "use_specaugment", False):
-            return input_features
-        if not self.training:
-            return input_features
-        # Input shape: (batch_size, num_mel_bins, sequence_length) for Whisper
-        batch_size, hidden_size, sequence_length = input_features.size()
-        mask_time_prob = getattr(self.config, "mask_time_prob", 0.05)
-        mask_time_length = getattr(self.config, "mask_time_length", 10)
-        mask_feature_prob = getattr(self.config, "mask_feature_prob", 0.0)
-        mask_feature_length = getattr(self.config, "mask_feature_length", 10)
-        # Time masking
-        if mask_time_prob > 0:
-            mask_time_np = _compute_mask_indices(
-                (batch_size, sequence_length),
-                mask_prob=mask_time_prob,
-                mask_length=mask_time_length,
-                min_masks=2,
-            )
-            mask_time_indices = torch.tensor(
-                mask_time_np, device=input_features.device, dtype=torch.bool
-            )
-            # Expand to cover all features: (batch, seq) -> (batch, features, seq)
-            mask_time_expanded = mask_time_indices[:, None].expand(-1, hidden_size, -1)
-            input_features = input_features.masked_fill(mask_time_expanded, 0.0)
-        # Feature masking
-        if mask_feature_prob > 0:
-            mask_feature_np = _compute_mask_indices(
-                (batch_size, hidden_size),
-                mask_prob=mask_feature_prob,
-                mask_length=mask_feature_length,
-                min_masks=2,
-            )
-            mask_feature_indices = torch.tensor(
-                mask_feature_np, device=input_features.device, dtype=torch.bool
-            )
-            # Expand: (batch, features) -> (batch, features, seq)
-            mask_feature_expanded = mask_feature_indices[:, :, None].expand(-1, -1, sequence_length)
-            input_features = input_features.masked_fill(mask_feature_expanded, 0.0)
-        return input_features
     def _encode_audio(
         self,
         audio_features: torch.Tensor,
@@ -330,9 +280,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         Returns:
             Flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
         """
-        # Apply SpecAugment during training (before encoding)
-        audio_features = self._apply_specaugment(audio_features)
         with torch.no_grad():
             encoder_out = self.audio_tower(input_features=audio_features)
             hidden_states = encoder_out.last_hidden_state

 )
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import CausalLMOutputWithPast
 try:
     from .asr_config import ASRConfig
         """Only save trainable projector weights."""
         return {f"projector.{k}": v for k, v in self.projector.state_dict().items()}
     def _encode_audio(
         self,
         audio_features: torch.Tensor,
         Returns:
             Flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
         """
         with torch.no_grad():
             encoder_out = self.audio_tower(input_features=audio_features)
             hidden_states = encoder_out.last_hidden_state

model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:141150eea397b1108cb83d033075f10dfdce3d3d652d48d647d34cf7b804bb50
-size 721963128

 version https://git-lfs.github.com/spec/v1
+oid sha256:82969523f91eb82b594c3669afa1c1cc8d1c49d5e3414997e659c8214b8f5942
+size 124082792

projectors.py CHANGED

@@ -222,28 +222,23 @@ class MOSAProjector(nn.Module):
 class SwiGLU(nn.Module):
-    def __init__(self, in_features, hidden_features, out_features, bias=False, dropout=0.0):
         super().__init__()
-        self.w1 = nn.Linear(in_features, hidden_features, bias=bias)
-        self.w2 = nn.Linear(in_features, hidden_features, bias=bias)
-        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
         self.act = nn.SiLU()
-        self.dropout = nn.Dropout(dropout)
     def forward(self, x):
-        x_gate = self.act(self.w1(x))
-        x_val = self.w2(x)
-        x = x_gate * x_val
-        x = self.dropout(x)
-        return self.w3(x)
 class SwiGLUAudioProjector(nn.Module):
     """
-    SwiGLU projector with:
-    1. C-Abstractor style dual-path context (Conv + AvgPool).
-    2. Llama 3 style hidden dimension calculation.
-    3. RMSNorm for training stability.
     """
     def __init__(self, config):
@@ -252,69 +247,58 @@ class SwiGLUAudioProjector(nn.Module):
         encoder_dim = config.encoder_dim
         llm_dim = config.llm_dim
-        # 1. C-Abstractor Style Dual-Path Context
-        # Path A: Depthwise Conv (Phonetic features)
-        self.local_context = nn.Conv1d(
-            encoder_dim, encoder_dim, kernel_size=3, padding=1, groups=encoder_dim, bias=False
         )
-        # Path B: Mean Pooling (Energy/Prosody features)
-        # We use a kernel of 3 to match the Conv1d's receptive field
-        self.energy_pool = nn.AvgPool1d(kernel_size=3, stride=1, padding=1)
-        # 2. Llama 3 Style Dimension Calculation
-        d_model = encoder_dim * self.k
-        hidden_dim = int(2 * (d_model * 4) / 3)
-        multiple_of = 256
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-        # 3. Normalization and SwiGLU
-        self.pre_norm = LlamaRMSNorm(d_model, eps=1e-8)
-        self.proj1 = SwiGLU(d_model, hidden_dim, hidden_dim)
-        self.proj2 = nn.Linear(hidden_dim, llm_dim, bias=False)
         self.apply(self._init_weights)
     def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
             nn.init.trunc_normal_(m.weight, std=0.02)
             if m.bias is not None:
-                nn.init.zeros_(m.bias)
     def forward(self, x):
         # x: [Batch, Seq, Dim]
         batch, seq, dim = x.shape
-        # --- Dual-Path Context Injection ---
-        x_trans = x.transpose(1, 2)  # [B, D, S]
-        # Branch A: Convolutional Detail
-        x_conv = self.local_context(x_trans)
-        # Branch B: Energy Abstraction
-        x_energy = self.energy_pool(x_trans)
-        # Combine and Add Residual (Summing the branches)
-        # This gives the model a multi-resolution view of the audio
-        x = (x_conv + x_energy).transpose(1, 2) + x
-        # --- Frame Concatenation ---
-        if seq % self.k:
-            x = F.pad(x, (0, 0, 0, self.k - (seq % self.k)))
-        x = x.reshape(batch, -1, dim * self.k)
-        # --- Projection ---
-        x = self.pre_norm(x)
-        x = self.proj1(x)
-        return self.proj2(x)
     def get_output_length(self, input_length: int) -> int:
-        remainder = input_length % self.k
-        return (input_length + self.k - 1) // self.k if remainder else input_length // self.k
-# Alias for backwards compatibility
-AudioProjector = SwiGLUAudioProjector
 # =============================================================================
 # Residual Projector
@@ -324,20 +308,17 @@ AudioProjector = SwiGLUAudioProjector
 class ResidualMLP(nn.Module):
     """MLP block with residual connection: Output = x + MLP(x)."""
-    def __init__(self, dim, hidden_dim, dropout=0.0):
         super().__init__()
         self.fc1 = nn.Linear(dim, hidden_dim)
         self.fc2 = nn.Linear(hidden_dim, dim)
         self.act = nn.GELU()
-        self.dropout = nn.Dropout(dropout)
     def forward(self, x):
         residual = x
         x = self.fc1(x)
         x = self.act(x)
-        x = self.dropout(x)
         x = self.fc2(x)
-        x = self.dropout(x)
         return residual + x
@@ -352,19 +333,17 @@ class ResidualAudioProjector(nn.Module):
         out_dim = config.llm_dim
         hidden_dim = getattr(config, "projector_hidden_dim", None) or out_dim * 4
         self.num_layers = getattr(config, "projector_num_layers", 2)
-        dropout_rate = getattr(config, "projector_dropout", 0.0)
         self.input_proj = nn.Linear(in_dim, out_dim)
         self.ln_input = LlamaRMSNorm(out_dim, eps=1e-8)
         self.layers = nn.ModuleList(
-            [ResidualMLP(out_dim, hidden_dim, dropout=dropout_rate) for _ in range(self.num_layers)]
         )
         self.layer_norms = nn.ModuleList(
             [LlamaRMSNorm(out_dim, eps=1e-8) for _ in range(self.num_layers)]
         )
-        self.output_dropout = nn.Dropout(dropout_rate)
         self._init_weights(config)
     def _init_weights(self, config):
@@ -415,7 +394,7 @@ class ResidualAudioProjector(nn.Module):
             x = layer(x)
             x = ln(x)
-        return self.output_dropout(x)
 # =============================================================================
@@ -526,7 +505,6 @@ class SharedMoEAudioProjector(nn.Module):
     def __init__(self, config):
         super().__init__()
-        # Default stride is now 2 (was 4)
         self.k = getattr(config, "projector_pool_stride", 4)
         encoder_dim = config.encoder_dim

 class SwiGLU(nn.Module):
+    """SwiGLU activation block (Llama-style: SiLU(Gate) * Value -> Output)."""
+    def __init__(self, in_features, hidden_features, out_features):
         super().__init__()
+        self.w1 = nn.Linear(in_features, hidden_features, bias=False)  # Gate
+        self.w2 = nn.Linear(in_features, hidden_features, bias=False)  # Value
+        self.w3 = nn.Linear(hidden_features, out_features, bias=False)  # Output
         self.act = nn.SiLU()
     def forward(self, x):
+        return self.w3(self.act(self.w1(x)) * self.w2(x))
 class SwiGLUAudioProjector(nn.Module):
     """
+    Optimized for Frozen LLM + 2500h Data.
+    Target: 12.5 Hz Output (Stride 4) with 8/3 SwiGLU Expansion.
     """
     def __init__(self, config):
         encoder_dim = config.encoder_dim
         llm_dim = config.llm_dim
+        # Conv Expansion (Compensating for Time Compression)
+        # We compress time by 4x, so we expand width by 2x to preserve info density.
+        hidden_dim = int(encoder_dim * 2)
+        # SwiGLU Internal Expansion (The 8/3 Ratio)
+        # To match standard FFN capacity: 4 * (2/3) = 8/3
+        swiglu_inner = int(hidden_dim * 8 / 3)
+        self.downsample = nn.Conv1d(
+            in_channels=encoder_dim,
+            out_channels=hidden_dim,
+            kernel_size=self.k,
+            stride=self.k,
+            padding=0,
         )
+        self.norm = LlamaRMSNorm(hidden_dim, eps=1e-8)
+        self.proj = SwiGLU(hidden_dim, swiglu_inner, llm_dim)
         self.apply(self._init_weights)
     def _init_weights(self, m):
+        if isinstance(m, (nn.Linear, nn.Conv1d)):
             nn.init.trunc_normal_(m.weight, std=0.02)
             if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
     def forward(self, x):
         # x: [Batch, Seq, Dim]
         batch, seq, dim = x.shape
+        # Manual Padding (prevents frame dropping)
+        if seq % self.k != 0:
+            pad_len = self.k - (seq % self.k)
+            x = F.pad(x, (0, 0, 0, pad_len))
+        # [B, S, D] -> [B, D, S]
+        x = x.transpose(1, 2)
+        # Downsample (50Hz -> 12.5Hz)
+        x = self.downsample(x)
+        # [B, D, S] -> [B, S, D]
+        x = x.transpose(1, 2)
+        # Norm & Project
+        x = self.norm(x)
+        return self.proj(x)
     def get_output_length(self, input_length: int) -> int:
+        return (input_length + self.k - 1) // self.k
 # =============================================================================
 # Residual Projector
 class ResidualMLP(nn.Module):
     """MLP block with residual connection: Output = x + MLP(x)."""
+    def __init__(self, dim, hidden_dim):
         super().__init__()
         self.fc1 = nn.Linear(dim, hidden_dim)
         self.fc2 = nn.Linear(hidden_dim, dim)
         self.act = nn.GELU()
     def forward(self, x):
         residual = x
         x = self.fc1(x)
         x = self.act(x)
         x = self.fc2(x)
         return residual + x
         out_dim = config.llm_dim
         hidden_dim = getattr(config, "projector_hidden_dim", None) or out_dim * 4
         self.num_layers = getattr(config, "projector_num_layers", 2)
         self.input_proj = nn.Linear(in_dim, out_dim)
         self.ln_input = LlamaRMSNorm(out_dim, eps=1e-8)
         self.layers = nn.ModuleList(
+            [ResidualMLP(out_dim, hidden_dim) for _ in range(self.num_layers)]
         )
         self.layer_norms = nn.ModuleList(
             [LlamaRMSNorm(out_dim, eps=1e-8) for _ in range(self.num_layers)]
         )
         self._init_weights(config)
     def _init_weights(self, config):
             x = layer(x)
             x = ln(x)
+        return x
 # =============================================================================
     def __init__(self, config):
         super().__init__()
         self.k = getattr(config, "projector_pool_stride", 4)
         encoder_dim = config.encoder_dim