Training in progress - step 500
Browse files
- asr_config.py +1 -1
- projectors.py +3 -3
asr_config.py
CHANGED
|
@@ -20,7 +20,7 @@ class ASRConfig(transformers.PretrainedConfig):
|
|
| 20 |
llm_dim: Optional[int] = None,
|
| 21 |
audio_sample_rate: int = 16000,
|
| 22 |
projector_init_std: float = 0.02,
|
| 23 |
-
projector_pool_stride: int = 2,
|
| 24 |
downsample_rate: int = 16,
|
| 25 |
projector_hidden_dim: Optional[int] = None,
|
| 26 |
projector_type: str = "moe", # "moe", "swiglu", "residual", "shared_moe", "mlp", "qformer"
|
|
|
|
| 20 |
llm_dim: Optional[int] = None,
|
| 21 |
audio_sample_rate: int = 16000,
|
| 22 |
projector_init_std: float = 0.02,
|
| 23 |
+
projector_pool_stride: int = 4,
|
| 24 |
downsample_rate: int = 16,
|
| 25 |
projector_hidden_dim: Optional[int] = None,
|
| 26 |
projector_type: str = "moe", # "moe", "swiglu", "residual", "shared_moe", "mlp", "qformer"
|
projectors.py
CHANGED
|
@@ -522,7 +522,7 @@ class SharedMoEAudioProjector(nn.Module):
|
|
| 522 |
super().__init__()
|
| 523 |
|
| 524 |
# Default stride is now 2 (was 4)
|
| 525 |
-
self.k = getattr(config, "projector_pool_stride", 2)
|
| 526 |
encoder_dim = config.encoder_dim
|
| 527 |
|
| 528 |
# Depthwise Conv for temporal mixing
|
|
@@ -617,8 +617,8 @@ class QFormerAudioProjector(nn.Module):
|
|
| 617 |
# QFormer hidden size (matches encoder for cross-attention)
|
| 618 |
qformer_hidden = getattr(config, "qformer_hidden_size", None) or encoder_dim
|
| 619 |
qformer_num_layers = getattr(config, "qformer_num_layers", 2)
|
| 620 |
-
# Default heads must divide hidden size evenly (1280 /
|
| 621 |
-
qformer_num_heads = getattr(config, "qformer_num_heads",
|
| 622 |
qformer_intermediate = getattr(config, "qformer_intermediate_size", None) or (qformer_hidden * 4)
|
| 623 |
|
| 624 |
# Learnable query embeddings (Granite uses std=1.0)
|
|
|
|
| 522 |
super().__init__()
|
| 523 |
|
| 524 |
# Default stride is 4 (was 2)
|
| 525 |
+
self.k = getattr(config, "projector_pool_stride", 4)
|
| 526 |
encoder_dim = config.encoder_dim
|
| 527 |
|
| 528 |
# Depthwise Conv for temporal mixing
|
|
|
|
| 617 |
# QFormer hidden size (matches encoder for cross-attention)
|
| 618 |
qformer_hidden = getattr(config, "qformer_hidden_size", None) or encoder_dim
|
| 619 |
qformer_num_layers = getattr(config, "qformer_num_layers", 2)
|
| 620 |
+
# Default heads must divide hidden size evenly (1280 / 16 = 80)
|
| 621 |
+
qformer_num_heads = getattr(config, "qformer_num_heads", 16)
|
| 622 |
qformer_intermediate = getattr(config, "qformer_intermediate_size", None) or (qformer_hidden * 4)
|
| 623 |
|
| 624 |
# Learnable query embeddings (Granite uses std=1.0)
|