Training in progress - step 1000

Browse files

Files changed (5) hide show

asr_config.py +8 -13
asr_modeling.py +0 -56
config.json +1 -2
model.safetensors +1 -1
projectors.py +7 -0

asr_config.py CHANGED Viewed

@@ -50,6 +50,13 @@ class ASRConfig(transformers.PretrainedConfig):
         projector_pool_stride: int = 4,
         downsample_rate: int = 5,  # Granite default
         projector_hidden_dim: Optional[int] = None,
         projector_type: str = "mlp",  # "mlp", "mosa", "moe", "qformer"
         # MoE-specific configuration
         num_experts: int = 4,  # Number of experts in MoE projectors
@@ -69,17 +76,6 @@ class ASRConfig(transformers.PretrainedConfig):
         lora_target_modules: Optional[list] = None,  # Default: all linear layers
         freeze_projector: bool = False,  # True for Stage 2 (LoRA-only training)
         freeze_language_model: bool = True,  # False = full decoder fine-tuning
-        # Encoder-output time masking — SpecAugment-style time masking applied
-        # AFTER the frozen encoder, BEFORE the projector. The actual SOTA-
-        # equivalent regularizer for frozen-encoder projector training: mel-
-        # side SpecAugment (the NeMo / OWSM default at ~F=2,T=10 for trainable
-        # encoders) would push a frozen Whisper encoder OOD, so we instead
-        # mask the encoder's output features and let the projector learn to
-        # reconstruct missing time positions. Disabled by default (0 / 0.0);
-        # canonical setting is num=5 masks of max_width_ratio=0.04 (up to
-        # ~20% of encoder output time positions masked per sample).
-        encoder_output_time_mask_num: int = 0,
-        encoder_output_time_mask_max_width_ratio: float = 0.0,
         do_sample: bool = False,
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
@@ -127,6 +123,7 @@ class ASRConfig(transformers.PretrainedConfig):
         self.projector_pool_stride = projector_pool_stride
         self.downsample_rate = downsample_rate
         self.projector_hidden_dim = projector_hidden_dim
         self.projector_type = projector_type
         # MoE-specific configuration
         self.num_experts = num_experts
@@ -154,8 +151,6 @@ class ASRConfig(transformers.PretrainedConfig):
         ]
         self.freeze_projector = freeze_projector
         self.freeze_language_model = freeze_language_model
-        self.encoder_output_time_mask_num = encoder_output_time_mask_num
-        self.encoder_output_time_mask_max_width_ratio = encoder_output_time_mask_max_width_ratio
         explicit_generation_args = {
             "num_beams": num_beams,

         projector_pool_stride: int = 4,
         downsample_rate: int = 5,  # Granite default
         projector_hidden_dim: Optional[int] = None,
+        # Projector dropout — applied between activation and the second
+        # linear in MLPAudioProjector. Matches Granite-Speech 4.1's
+        # Q-Former dropout (hidden_dropout_prob=0.1) used in its frozen-
+        # encoder + LoRA-LLM training stage. Default 0.0 for backward
+        # compatibility with existing checkpoints; experiment configs
+        # opt in to 0.1.
+        projector_dropout: float = 0.0,
         projector_type: str = "mlp",  # "mlp", "mosa", "moe", "qformer"
         # MoE-specific configuration
         num_experts: int = 4,  # Number of experts in MoE projectors
         lora_target_modules: Optional[list] = None,  # Default: all linear layers
         freeze_projector: bool = False,  # True for Stage 2 (LoRA-only training)
         freeze_language_model: bool = True,  # False = full decoder fine-tuning
         do_sample: bool = False,
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
         self.projector_pool_stride = projector_pool_stride
         self.downsample_rate = downsample_rate
         self.projector_hidden_dim = projector_hidden_dim
+        self.projector_dropout = projector_dropout
         self.projector_type = projector_type
         # MoE-specific configuration
         self.num_experts = num_experts
         ]
         self.freeze_projector = freeze_projector
         self.freeze_language_model = freeze_language_model
         explicit_generation_args = {
             "num_beams": num_beams,

asr_modeling.py CHANGED Viewed

@@ -44,41 +44,6 @@ def _gather_audio_embeds(audio_embeds: torch.Tensor, token_counts: torch.Tensor)
     return audio_embeds[mask]
-def _time_mask_encoder_output(
-    hidden_states: torch.Tensor,
-    num_masks: int,
-    max_width_ratio: float,
-) -> torch.Tensor:
-    """SpecAugment-style time masking on encoder output features.
-    Zero-fills ``num_masks`` random contiguous time spans per sample. Each
-    span has width sampled uniformly from ``[0, max_width]`` where
-    ``max_width = max(1, int(time_len * max_width_ratio))``. Returns the
-    input unchanged when ``num_masks <= 0`` or ``max_width_ratio <= 0``.
-    Args:
-        hidden_states: ``(batch, time, dim)`` encoder output.
-        num_masks: Number of time-mask spans applied per sample.
-        max_width_ratio: Maximum mask width as a fraction of ``time``.
-    """
-    if num_masks <= 0 or max_width_ratio <= 0.0:
-        return hidden_states
-    batch, time_len, _ = hidden_states.shape
-    max_width = max(1, int(time_len * max_width_ratio))
-    device = hidden_states.device
-    widths = torch.randint(0, max_width + 1, (batch, num_masks), device=device)
-    max_starts = (time_len - widths).clamp(min=1)
-    starts = (torch.rand(batch, num_masks, device=device) * max_starts).long()
-    indices = torch.arange(time_len, device=device).view(1, 1, -1)
-    starts_e = starts.unsqueeze(-1)
-    ends_e = (starts + widths).unsqueeze(-1)
-    in_any_mask = ((indices >= starts_e) & (indices < ends_e)).any(dim=1)
-    keep = (~in_any_mask).to(dtype=hidden_states.dtype).unsqueeze(-1)
-    return hidden_states * keep
 class ASRModel(PreTrainedModel, GenerationMixin):
     """Audio-to-text model combining an audio encoder, projector, and language model."""
@@ -466,26 +431,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             self.config.encoder_conv_layers,
         )
-    def _apply_encoder_output_time_masking(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """SpecAugment-style time masking on encoder OUTPUT features.
-        For frozen-encoder projector training, mel-side SpecAugment would
-        push the encoder OOD (it was never trained with masked mels), so
-        we mask the encoder output instead. The projector learns to be
-        robust to missing encoder-output time positions. Disabled outside
-        training; otherwise delegates to ``_time_mask_encoder_output`` with
-        the config knobs.
-        """
-        if not self.training:
-            return hidden_states
-        return _time_mask_encoder_output(
-            hidden_states,
-            num_masks=int(getattr(self.config, "encoder_output_time_mask_num", 0)),
-            max_width_ratio=float(
-                getattr(self.config, "encoder_output_time_mask_max_width_ratio", 0.0)
-            ),
-        )
     def _encode_audio(
         self,
         audio_features: torch.Tensor,
@@ -504,7 +449,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             encoder_out = self.audio_tower(input_features=audio_features)
             hidden_states = encoder_out.last_hidden_state
-        hidden_states = self._apply_encoder_output_time_masking(hidden_states)
         audio_embeds = self.projector(hidden_states)
         token_counts = expected_token_counts.to(device=audio_embeds.device, dtype=torch.long)

     return audio_embeds[mask]
 class ASRModel(PreTrainedModel, GenerationMixin):
     """Audio-to-text model combining an audio encoder, projector, and language model."""
             self.config.encoder_conv_layers,
         )
     def _encode_audio(
         self,
         audio_features: torch.Tensor,
             encoder_out = self.audio_tower(input_features=audio_features)
             hidden_states = encoder_out.last_hidden_state
         audio_embeds = self.projector(hidden_states)
         token_counts = expected_token_counts.to(device=audio_embeds.device, dtype=torch.long)

config.json CHANGED Viewed

@@ -234,8 +234,6 @@
     ]
   ],
   "encoder_dim": 1280,
-  "encoder_output_time_mask_max_width_ratio": 0.04,
-  "encoder_output_time_mask_num": 5,
   "eos_token_id": 151645,
   "freeze_language_model": false,
   "freeze_projector": false,
@@ -264,6 +262,7 @@
   "pad_token_id": 151643,
   "pipeline_tag": "automatic-speech-recognition",
   "pretrained_model_path": "mazesmazes/tiny-audio-next",
   "projector_hidden_dim": 2048,
   "projector_pool_stride": 4,
   "projector_type": "mlp",

     ]
   ],
   "encoder_dim": 1280,
   "eos_token_id": 151645,
   "freeze_language_model": false,
   "freeze_projector": false,
   "pad_token_id": 151643,
   "pipeline_tag": "automatic-speech-recognition",
   "pretrained_model_path": "mazesmazes/tiny-audio-next",
+  "projector_dropout": 0.1,
   "projector_hidden_dim": 2048,
   "projector_pool_stride": 4,
   "projector_type": "mlp",

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f20696bf71ed993d8320bb51ae89bd1f3dee0392c2f69fd0578b7095e4ee9d2b
 size 2433494416

 version https://git-lfs.github.com/spec/v1
+oid sha256:97b1eda3c22a7e702033952c30ab1de35166bb22100f243c283d980978e1a8bd
 size 2433494416

projectors.py CHANGED Viewed

@@ -55,6 +55,12 @@ class MLPAudioProjector(nn.Module):
         self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
         self.norm.weight.data.fill_(self._NORM_INIT)
         self.act = nn.GELU()
         self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
         # Output norm aligns the projector's RMS with the LM's embed_tokens
         # distribution. See _NORM_INIT comment above for the magnitude
@@ -80,6 +86,7 @@ class MLPAudioProjector(nn.Module):
         x = self.linear_1(x)
         x = self.norm(x)
         x = self.act(x)
         x = self.linear_2(x)
         return self.norm_2(x)

         self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
         self.norm.weight.data.fill_(self._NORM_INIT)
         self.act = nn.GELU()
+        # Dropout matches Granite-Speech 4.1's Q-Former hidden_dropout_prob=0.1
+        # in its frozen-encoder modality-alignment stage — the closest
+        # published precedent for our regime. Default 0.0 in config means
+        # nn.Dropout(0.0) is a no-op for existing experiments.
+        projector_dropout = float(getattr(config, "projector_dropout", 0.0))
+        self.dropout = nn.Dropout(projector_dropout)
         self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
         # Output norm aligns the projector's RMS with the LM's embed_tokens
         # distribution. See _NORM_INIT comment above for the magnitude
         x = self.linear_1(x)
         x = self.norm(x)
         x = self.act(x)
+        x = self.dropout(x)
         x = self.linear_2(x)
         return self.norm_2(x)