Training in progress - step 1000

Files changed (4)

asr_config.py CHANGED Viewed

@@ -50,13 +50,6 @@ class ASRConfig(transformers.PretrainedConfig):
         projector_pool_stride: int = 4,
         downsample_rate: int = 5,  # Granite default
         projector_hidden_dim: Optional[int] = None,
-        # Projector dropout — applied between activation and the second
-        # linear in MLPAudioProjector. Matches Granite-Speech 4.1's
-        # Q-Former dropout (hidden_dropout_prob=0.1) used in its frozen-
-        # encoder + LoRA-LLM training stage. Default 0.0 for backward
-        # compatibility with existing checkpoints; experiment configs
-        # opt in to 0.1.
-        projector_dropout: float = 0.0,
         projector_type: str = "mlp",  # "mlp", "mosa", "moe", "qformer"
         # MoE-specific configuration
         num_experts: int = 4,  # Number of experts in MoE projectors
@@ -123,7 +116,6 @@ class ASRConfig(transformers.PretrainedConfig):
         self.projector_pool_stride = projector_pool_stride
         self.downsample_rate = downsample_rate
         self.projector_hidden_dim = projector_hidden_dim
-        self.projector_dropout = projector_dropout
         self.projector_type = projector_type
         # MoE-specific configuration
         self.num_experts = num_experts

         projector_pool_stride: int = 4,
         downsample_rate: int = 5,  # Granite default
         projector_hidden_dim: Optional[int] = None,
         projector_type: str = "mlp",  # "mlp", "mosa", "moe", "qformer"
         # MoE-specific configuration
         num_experts: int = 4,  # Number of experts in MoE projectors
         self.projector_pool_stride = projector_pool_stride
         self.downsample_rate = downsample_rate
         self.projector_hidden_dim = projector_hidden_dim
         self.projector_type = projector_type
         # MoE-specific configuration
         self.num_experts = num_experts

config.json CHANGED Viewed

@@ -262,7 +262,6 @@
   "pad_token_id": 151643,
   "pipeline_tag": "automatic-speech-recognition",
   "pretrained_model_path": "mazesmazes/tiny-audio-next",
-  "projector_dropout": 0.1,
   "projector_hidden_dim": 2048,
   "projector_pool_stride": 4,
   "projector_type": "mlp",

   "pad_token_id": 151643,
   "pipeline_tag": "automatic-speech-recognition",
   "pretrained_model_path": "mazesmazes/tiny-audio-next",
   "projector_hidden_dim": 2048,
   "projector_pool_stride": 4,
   "projector_type": "mlp",

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e6a3611eaa107e17233a3b4b06d53d1bcd0076b0bd76b436bac9cf4156a6db0
 size 2433494416

 version https://git-lfs.github.com/spec/v1
+oid sha256:fa0caab16d99a3c3d7ef0289332b36eee96b9a3602c1e02f8d2bf2c9f38e7b21
 size 2433494416

projectors.py CHANGED Viewed

@@ -55,12 +55,6 @@ class MLPAudioProjector(nn.Module):
         self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
         self.norm.weight.data.fill_(self._NORM_INIT)
         self.act = nn.GELU()
-        # Dropout matches Granite-Speech 4.1's Q-Former hidden_dropout_prob=0.1
-        # in its frozen-encoder modality-alignment stage — the closest
-        # published precedent for our regime. Default 0.0 in config means
-        # nn.Dropout(0.0) is a no-op for existing experiments.
-        projector_dropout = float(getattr(config, "projector_dropout", 0.0))
-        self.dropout = nn.Dropout(projector_dropout)
         self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
         # Output norm aligns the projector's RMS with the LM's embed_tokens
         # distribution. See _NORM_INIT comment above for the magnitude
@@ -86,7 +80,6 @@ class MLPAudioProjector(nn.Module):
         x = self.linear_1(x)
         x = self.norm(x)
         x = self.act(x)
-        x = self.dropout(x)
         x = self.linear_2(x)
         return self.norm_2(x)

         self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
         self.norm.weight.data.fill_(self._NORM_INIT)
         self.act = nn.GELU()
         self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
         # Output norm aligns the projector's RMS with the LM's embed_tokens
         # distribution. See _NORM_INIT comment above for the magnitude
         x = self.linear_1(x)
         x = self.norm(x)
         x = self.act(x)
         x = self.linear_2(x)
         return self.norm_2(x)