Training in progress - step 1000

Browse files

Files changed (5) hide show

asr_config.py +0 -7
asr_modeling.py +0 -24
config.json +0 -1
model.safetensors +1 -1
projectors.py +16 -3

asr_config.py CHANGED Viewed

@@ -51,12 +51,6 @@ class ASRConfig(transformers.PretrainedConfig):
         downsample_rate: int = 5,  # Granite default
         projector_hidden_dim: Optional[int] = None,
         projector_type: str = "mlp",  # "mlp", "mosa", "moe", "qformer"
-        # Per-time-step Bernoulli zero-mask on encoder output before the
-        # projector (training-only). 0.05–0.15 is the SpecAugment-equivalent
-        # range for frozen-encoder setups; drops whole encoder frames so
-        # the projector learns robustness to missing context. No magnitude
-        # rescaling. 0.0 disables.
-        audio_token_dropout: float = 0.0,
         # MoE-specific configuration
         num_experts: int = 4,  # Number of experts in MoE projectors
         num_experts_per_tok: int = 2,  # Top-k experts per token
@@ -123,7 +117,6 @@ class ASRConfig(transformers.PretrainedConfig):
         self.downsample_rate = downsample_rate
         self.projector_hidden_dim = projector_hidden_dim
         self.projector_type = projector_type
-        self.audio_token_dropout = audio_token_dropout
         # MoE-specific configuration
         self.num_experts = num_experts
         self.num_experts_per_tok = num_experts_per_tok

         downsample_rate: int = 5,  # Granite default
         projector_hidden_dim: Optional[int] = None,
         projector_type: str = "mlp",  # "mlp", "mosa", "moe", "qformer"
         # MoE-specific configuration
         num_experts: int = 4,  # Number of experts in MoE projectors
         num_experts_per_tok: int = 2,  # Top-k experts per token
         self.downsample_rate = downsample_rate
         self.projector_hidden_dim = projector_hidden_dim
         self.projector_type = projector_type
         # MoE-specific configuration
         self.num_experts = num_experts
         self.num_experts_per_tok = num_experts_per_tok

asr_modeling.py CHANGED Viewed

@@ -449,35 +449,11 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             encoder_out = self.audio_tower(input_features=audio_features)
             hidden_states = encoder_out.last_hidden_state
-        hidden_states = self._maybe_drop_audio_tokens(hidden_states)
         audio_embeds = self.projector(hidden_states)
         token_counts = expected_token_counts.to(device=audio_embeds.device, dtype=torch.long)
         return _gather_audio_embeds(audio_embeds, token_counts)
-    def _maybe_drop_audio_tokens(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """Per-time-step Bernoulli zero-mask on encoder output (train-only).
-        SpecAugment-equivalent for frozen-encoder setups: drops whole frames
-        from the encoder output sequence so the projector learns robustness
-        to missing context. Length-preserving (zeros, not deletions) so
-        audio token counts in the prompt stay consistent. No magnitude
-        rescaling — the projector should not learn to compensate.
-        """
-        p = float(getattr(self.config, "audio_token_dropout", 0.0))
-        if not self.training or p <= 0.0:
-            return hidden_states
-        keep = 1.0 - p
-        mask = torch.bernoulli(
-            torch.full(
-                hidden_states.shape[:-1],
-                keep,
-                device=hidden_states.device,
-                dtype=hidden_states.dtype,
-            )
-        ).unsqueeze(-1)
-        return hidden_states * mask
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,

             encoder_out = self.audio_tower(input_features=audio_features)
             hidden_states = encoder_out.last_hidden_state
         audio_embeds = self.projector(hidden_states)
         token_counts = expected_token_counts.to(device=audio_embeds.device, dtype=torch.long)
         return _gather_audio_embeds(audio_embeds, token_counts)
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,

config.json CHANGED Viewed

@@ -103,7 +103,6 @@
   },
   "audio_model_id": "zai-org/GLM-ASR-Nano-2512",
   "audio_sample_rate": 16000,
-  "audio_token_dropout": 0.1,
   "auto_map": {
     "AutoConfig": "asr_config.ASRConfig",
     "AutoModel": "asr_modeling.ASRModel",

   },
   "audio_model_id": "zai-org/GLM-ASR-Nano-2512",
   "audio_sample_rate": 16000,
   "auto_map": {
     "AutoConfig": "asr_config.ASRConfig",
     "AutoModel": "asr_modeling.ASRModel",

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a78c24b6c8640d257dee04d0bf3c63c2020dcc24ed38b1112de8c0d77d930384
 size 2433494416

 version https://git-lfs.github.com/spec/v1
+oid sha256:69814e5212595ce41dbec818e0a9fcbc59fb014dd15e4470c5ecc9acb33fff17
 size 2433494416

projectors.py CHANGED Viewed

@@ -23,6 +23,18 @@ from transformers.models.llama.modeling_llama import LlamaRMSNorm
 class MLPAudioProjector(nn.Module):
     """2-layer MLP projector with frame-stacking downsampling (matches GLM-ASR)."""
     def __init__(self, config):
         """Initialize MLP projector.
@@ -41,13 +53,14 @@ class MLPAudioProjector(nn.Module):
         hidden_dim = getattr(config, "projector_hidden_dim", None) or llm_dim
         self.linear_1 = nn.Linear(in_dim, hidden_dim, bias=False)
         self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
         self.act = nn.GELU()
         self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
         # Output norm aligns the projector's RMS with the LM's embed_tokens
-        # distribution. Without it, linear_2's Kaiming-uniform init produces
-        # outputs ~30× quieter than embed rows, which saturates softmax at
-        # audio positions and starves them of gradient.
         self.norm_2 = LlamaRMSNorm(llm_dim, eps=1e-6)
     def get_output_length(self, input_length: int) -> int:
         """Calculate output sequence length given input length (matches GLM-ASR)."""

 class MLPAudioProjector(nn.Module):
     """2-layer MLP projector with frame-stacking downsampling (matches GLM-ASR)."""
+    # RMSNorm init weight chosen to match Qwen3-0.6B's embed_tokens median
+    # RMS (empirically 0.0292 across 151,936 tokens × dim=1024). With this
+    # init the projector outputs enter the LM at the same per-position
+    # residual-stream magnitude as text embed_tokens — avoiding the
+    # ~34× over-magnitude that LlamaRMSNorm's default weight=1.0 produces.
+    # Adam's per-parameter normalization means this small init does NOT
+    # starve projector gradient flow; the norm-before-GELU placement keeps
+    # gradients healthy regardless of init magnitude. If you swap to a
+    # different LM, re-measure the median per-token RMS and update:
+    # `model.get_input_embeddings().weight.pow(2).mean(-1).sqrt().median()`
+    _NORM_INIT = 0.029
     def __init__(self, config):
         """Initialize MLP projector.
         hidden_dim = getattr(config, "projector_hidden_dim", None) or llm_dim
         self.linear_1 = nn.Linear(in_dim, hidden_dim, bias=False)
         self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
+        self.norm.weight.data.fill_(self._NORM_INIT)
         self.act = nn.GELU()
         self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
         # Output norm aligns the projector's RMS with the LM's embed_tokens
+        # distribution. See _NORM_INIT comment above for the magnitude
+        # derivation.
         self.norm_2 = LlamaRMSNorm(llm_dim, eps=1e-6)
+        self.norm_2.weight.data.fill_(self._NORM_INIT)
     def get_output_length(self, input_length: int) -> int:
         """Calculate output sequence length given input length (matches GLM-ASR)."""