Training in progress - step 1000

Browse files

Files changed (3) hide show

config.json +1 -1
model.safetensors +2 -2
projectors.py +9 -18

config.json CHANGED Viewed

@@ -262,7 +262,7 @@
   "pad_token_id": 151643,
   "pipeline_tag": "automatic-speech-recognition",
   "pretrained_model_path": "mazesmazes/tiny-audio-next",
-  "projector_hidden_dim": 2048,
   "projector_pool_stride": 4,
   "projector_type": "mlp",
   "qformer_hidden_size": null,

   "pad_token_id": 151643,
   "pipeline_tag": "automatic-speech-recognition",
   "pretrained_model_path": "mazesmazes/tiny-audio-next",
+  "projector_hidden_dim": 4096,
   "projector_pool_stride": 4,
   "projector_type": "mlp",
   "qformer_hidden_size": null,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2877043b9b29a7fa8dd432ea544360d012f401fdc6f29ec07847d65a7d206413
-size 2433494416

 version https://git-lfs.github.com/spec/v1
+oid sha256:1a0b0a7491589ab19652cb41bb5aad27c3a78296bb3a8a66ea5beb3f99a3a81a
+size 2483834256

projectors.py CHANGED Viewed

@@ -21,19 +21,15 @@ from transformers.models.llama.modeling_llama import LlamaRMSNorm
 class MLPAudioProjector(nn.Module):
-    """2-layer MLP projector with frame-stacking downsampling (matches GLM-ASR)."""
-    # RMSNorm init weight chosen to match Qwen3-0.6B's embed_tokens median
-    # RMS (empirically 0.0292 across 151,936 tokens × dim=1024). With this
-    # init the projector outputs enter the LM at the same per-position
-    # residual-stream magnitude as text embed_tokens — avoiding the
-    # ~34× over-magnitude that LlamaRMSNorm's default weight=1.0 produces.
-    # Adam's per-parameter normalization means this small init does NOT
-    # starve projector gradient flow; the norm-before-GELU placement keeps
-    # gradients healthy regardless of init magnitude. If you swap to a
-    # different LM, re-measure with
-    # `model.get_input_embeddings().weight.pow(2).mean().sqrt()` and update.
-    _NORM_INIT = 0.029
     def __init__(self, config):
         """Initialize MLP projector.
@@ -53,14 +49,9 @@ class MLPAudioProjector(nn.Module):
         hidden_dim = getattr(config, "projector_hidden_dim", None) or llm_dim
         self.linear_1 = nn.Linear(in_dim, hidden_dim, bias=False)
         self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
-        self.norm.weight.data.fill_(self._NORM_INIT)
         self.act = nn.GELU()
         self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
-        # Output norm aligns the projector's RMS with the LM's embed_tokens
-        # distribution. See _NORM_INIT comment above for the magnitude
-        # derivation.
         self.norm_2 = LlamaRMSNorm(llm_dim, eps=1e-6)
-        self.norm_2.weight.data.fill_(self._NORM_INIT)
     def get_output_length(self, input_length: int) -> int:
         """Calculate output sequence length given input length (matches GLM-ASR)."""

 class MLPAudioProjector(nn.Module):
+    """2-layer MLP projector with frame-stacking downsampling (matches GLM-ASR).
+    Both RMSNorms use LlamaRMSNorm's default weight=1.0 init. A prior version
+    initialized both to 0.029 (Qwen3-0.6B's embed_tokens RMS) to put projector
+    outputs at residual-stream scale on step 1. Empirically, training drove
+    both norm weights back to ~1.0 (norm) and ~1.2 (norm_2), so the small
+    init only wasted compute on a ~34× scale-correction phase that the
+    default init skips entirely.
+    """
     def __init__(self, config):
         """Initialize MLP projector.
         hidden_dim = getattr(config, "projector_hidden_dim", None) or llm_dim
         self.linear_1 = nn.Linear(in_dim, hidden_dim, bias=False)
         self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
         self.act = nn.GELU()
         self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
         self.norm_2 = LlamaRMSNorm(llm_dim, eps=1e-6)
     def get_output_length(self, input_length: int) -> int:
         """Calculate output sequence length given input length (matches GLM-ASR)."""