Training in progress - step 1000
projectors.py (+5 -5)
```diff
@@ -28,7 +28,7 @@ class MLPAudioProjector(nn.Module):
 
         encoder_dim = getattr(config, "encoder_dim", 768)
         llm_dim = getattr(config, "llm_dim", 2048)
-        self.k = getattr(config, "projector_pool_stride",
+        self.k = getattr(config, "projector_pool_stride", 2)
 
         # Frame stacking: concat k adjacent frames then project
         # Matches GLM-ASR: in_dim -> 2*llm_dim -> llm_dim
@@ -63,12 +63,12 @@ class MLPAudioProjector(nn.Module):
 
 
 class SimpleAdapter(nn.Module):
-    """Simple 2-layer
+    """Simple 2-layer GELU adapter (from MOSA paper)."""
 
     def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
         super().__init__()
         self.fc1 = nn.Linear(input_dim, hidden_dim)
-        self.act = nn.
+        self.act = nn.GELU()
         self.fc2 = nn.Linear(hidden_dim, output_dim)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -89,7 +89,7 @@ class SwiGLUExpert(nn.Module):
 
 
 class MOSAProjector(nn.Module):
-    """MOSA-Base projector: simple 2-layer router with 4 simple adapters.
+    """MOSA-Base projector: simple 2-layer ReLU router with 4 simple adapters.
 
     Based on "MOSA: Mixtures of Simple Adapters" (arXiv:2508.18998).
     Uses softmax gating over all experts (dense MoE) with only cross-entropy loss.
@@ -116,7 +116,7 @@ class MOSAProjector(nn.Module):
             nn.Linear(router_hidden, self.num_experts),
         )
 
-        # --- 2. Experts (Simple 2-layer
+        # --- 2. Experts (Simple 2-layer GELU adapters) ---
         # Each expert: in_dim (stacked frames) -> hidden -> llm_dim
         self.experts = nn.ModuleList(
            [SimpleAdapter(in_dim, adapter_hidden, self.llm_dim) for _ in range(self.num_experts)]
```