mazesmazes committed
Commit 4f761fe · verified · 1 Parent(s): ccb5c19

Training in progress - step 1000

Files changed (1)
  1. projectors.py +5 -5
projectors.py CHANGED
@@ -28,7 +28,7 @@ class MLPAudioProjector(nn.Module):
 
         encoder_dim = getattr(config, "encoder_dim", 768)
         llm_dim = getattr(config, "llm_dim", 2048)
-        self.k = getattr(config, "projector_pool_stride", 4)
+        self.k = getattr(config, "projector_pool_stride", 2)
 
         # Frame stacking: concat k adjacent frames then project
         # Matches GLM-ASR: in_dim -> 2*llm_dim -> llm_dim
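The only change in this hunk is the default pool stride, k=4 → k=2, which halves the temporal downsampling: the projector now stacks pairs of adjacent encoder frames instead of groups of four, doubling the number of audio tokens handed to the LLM. A minimal sketch of what frame stacking does to the shapes (a hypothetical helper for illustration, not the repo's actual forward):

import torch

def stack_frames(x: torch.Tensor, k: int = 2) -> torch.Tensor:
    # Concatenate k adjacent frames along the feature dim, so a (B, T, 768)
    # encoder output becomes (B, T // k, k * 768); trailing frames that do
    # not fill a complete group are dropped.
    B, T, D = x.shape
    T = (T // k) * k
    return x[:, :T].reshape(B, T // k, k * D)

x = torch.randn(1, 100, 768)       # (batch, frames, encoder_dim)
print(stack_frames(x, k=2).shape)  # torch.Size([1, 50, 1536])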
@@ -63,12 +63,12 @@ class MLPAudioProjector(nn.Module):
 
 
 class SimpleAdapter(nn.Module):
-    """Simple 2-layer ReLU adapter (from MOSA paper)."""
+    """Simple 2-layer GELU adapter (from MOSA paper)."""
 
     def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
-        self.act = nn.ReLU()
+        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
 
    def forward(self, x: torch.Tensor) -> torch.Tensor:
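Taken together, the - and + lines give the whole adapter except the forward body, which the diff cuts off at the signature. Filling that in the obvious way (an assumption, though the three submodules only compose one way), the post-commit class reads:

import torch
import torch.nn as nn

class SimpleAdapter(nn.Module):
    """Simple 2-layer GELU adapter (from MOSA paper)."""

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Linear -> GELU -> Linear; this body is assumed, since the diff
        # truncates at the signature.
        return self.fc2(self.act(self.fc1(x)))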
@@ -89,7 +89,7 @@ class SwiGLUExpert(nn.Module):
 
 
 class MOSAProjector(nn.Module):
-    """MOSA-Base projector: simple 2-layer router with 4 simple adapters.
+    """MOSA-Base projector: simple 2-layer ReLU router with 4 simple adapters.
 
    Based on "MOSA: Mixtures of Simple Adapters" (arXiv:2508.18998).
    Uses softmax gating over all experts (dense MoE) with only cross-entropy loss.
@@ -116,7 +116,7 @@ class MOSAProjector(nn.Module):
            nn.Linear(router_hidden, self.num_experts),
        )
 
-        # --- 2. Experts (Simple 2-layer ReLU adapters per MOSA paper) ---
+        # --- 2. Experts (Simple 2-layer GELU adapters) ---
        # Each expert: in_dim (stacked frames) -> hidden -> llm_dim
        self.experts = nn.ModuleList(
            [SimpleAdapter(in_dim, adapter_hidden, self.llm_dim) for _ in range(self.num_experts)]
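The docstring above describes dense-MoE behavior: every token is processed by all four experts, and the router's softmax weights mix the outputs, with no top-k selection and no auxiliary balancing loss. A sketch of that combine step, assuming only the module names visible in the diff (the router and experts) and standard (B, T, dim) shapes; the actual forward is not shown in this commit:

import torch
import torch.nn as nn

def dense_moe_combine(router: nn.Module, experts: nn.ModuleList, x: torch.Tensor) -> torch.Tensor:
    # Softmax gate over ALL experts (dense MoE), then a weighted sum of
    # every expert's output; this wiring is inferred from the docstring,
    # not from code in this diff.
    gates = torch.softmax(router(x), dim=-1)             # (B, T, num_experts)
    outs = torch.stack([e(x) for e in experts], dim=-1)  # (B, T, llm_dim, num_experts)
    return (outs * gates.unsqueeze(-2)).sum(dim=-1)      # (B, T, llm_dim)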
 