mazesmazes committed on
Commit
ae8ce74
·
verified ·
1 Parent(s): d2cc415

Training in progress - step 500

Browse files
Files changed (2) hide show
  1. asr_config.py +1 -1
  2. projectors.py +3 -3
asr_config.py CHANGED
@@ -20,7 +20,7 @@ class ASRConfig(transformers.PretrainedConfig):
20
  llm_dim: Optional[int] = None,
21
  audio_sample_rate: int = 16000,
22
  projector_init_std: float = 0.02,
23
- projector_pool_stride: int = 2,
24
  downsample_rate: int = 16,
25
  projector_hidden_dim: Optional[int] = None,
26
  projector_type: str = "moe", # "moe", "swiglu", "residual", "shared_moe", "mlp", "qformer"
 
20
  llm_dim: Optional[int] = None,
21
  audio_sample_rate: int = 16000,
22
  projector_init_std: float = 0.02,
23
+ projector_pool_stride: int = 4,
24
  downsample_rate: int = 16,
25
  projector_hidden_dim: Optional[int] = None,
26
  projector_type: str = "moe", # "moe", "swiglu", "residual", "shared_moe", "mlp", "qformer"
projectors.py CHANGED
@@ -522,7 +522,7 @@ class SharedMoEAudioProjector(nn.Module):
522
  super().__init__()
523
 
524
  # Default stride is now 2 (was 4)
525
- self.k = getattr(config, "projector_pool_stride", 2)
526
  encoder_dim = config.encoder_dim
527
 
528
  # Depthwise Conv for temporal mixing
@@ -617,8 +617,8 @@ class QFormerAudioProjector(nn.Module):
617
  # QFormer hidden size (matches encoder for cross-attention)
618
  qformer_hidden = getattr(config, "qformer_hidden_size", None) or encoder_dim
619
  qformer_num_layers = getattr(config, "qformer_num_layers", 2)
620
- # Default heads must divide hidden size evenly (1280 / 8 = 160)
621
- qformer_num_heads = getattr(config, "qformer_num_heads", 8)
622
  qformer_intermediate = getattr(config, "qformer_intermediate_size", None) or (qformer_hidden * 4)
623
 
624
  # Learnable query embeddings (Granite uses std=1.0)
 
522
  super().__init__()
523
 
524
  # Default stride is now 4 (was 2)
525
+ self.k = getattr(config, "projector_pool_stride", 4)
526
  encoder_dim = config.encoder_dim
527
 
528
  # Depthwise Conv for temporal mixing
 
617
  # QFormer hidden size (matches encoder for cross-attention)
618
  qformer_hidden = getattr(config, "qformer_hidden_size", None) or encoder_dim
619
  qformer_num_layers = getattr(config, "qformer_num_layers", 2)
620
+ # Default heads must divide hidden size evenly (1280 / 16 = 80)
621
+ qformer_num_heads = getattr(config, "qformer_num_heads", 16)
622
  qformer_intermediate = getattr(config, "qformer_intermediate_size", None) or (qformer_hidden * 4)
623
 
624
  # Learnable query embeddings (Granite uses std=1.0)