Yuchan
committed on
Update AlphaS2S.py
Browse files- AlphaS2S.py +19 -17
AlphaS2S.py
CHANGED
|
@@ -191,48 +191,50 @@ class CrossBlock(layers.Layer):
|
|
| 191 |
a = self.alpha(x)
|
| 192 |
y = a * x + (1.0 - a) * z
|
| 193 |
return y
|
| 194 |
-
|
| 195 |
class gMLPBlock(layers.Layer):
    """gMLP block: pre-norm channel expansion, a Spatial Gating Unit (SGU)
    that mixes information along the sequence axis, an output projection,
    and a residual connection.

    Args:
        d_model: channel dimension of the input/output.
        seq_len: sequence length (the SGU projects along this axis, so
            inputs must have exactly this length).
        dropout: dropout rate applied before the output projection.
    """

    def __init__(self, d_model, seq_len, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.norm = layers.LayerNormalization(epsilon=1e-6)

        # Expand channels to d_model * 4; the split in call() yields two
        # halves of width 2*d_model each.
        self.channel_proj = layers.Dense(d_model * 4, use_bias=True)
        self.dropout = layers.Dropout(dropout)

        # Spatial Gating Unit (SGU): Dense over the sequence axis.
        self.sgu_norm = layers.LayerNormalization(epsilon=1e-6)
        self.sgu_proj = layers.Dense(seq_len, use_bias=False)
        # BUG FIX: call() uses self.sgu_final, but it was never defined
        # here, which raised AttributeError on the first forward pass.
        # Output width d_model * 2 matches u so the elementwise gate
        # below is shape-compatible.
        self.sgu_final = layers.Dense(d_model * 2, use_bias=True)

        self.out_proj = layers.Dense(d_model, use_bias=True)

    def call(self, x, training=False):
        # 1. Channel projection (expansion)
        residual = x
        x = self.norm(x)
        x = self.channel_proj(x)                          # (B, L, 4*D)

        u, v = tf.split(x, 2, axis=-1)                    # each (B, L, 2*D)

        # 2. SGU: normalize v, then project along the sequence axis.
        #    Transposing makes Dense(seq_len) act on the L axis.
        v_norm = self.sgu_norm(v)
        v_norm_T = tf.transpose(v_norm, perm=[0, 2, 1])   # (B, 2*D, L)
        v_proj = self.sgu_proj(v_norm_T)                  # (B, 2*D, L)
        v_proj_T = tf.transpose(v_proj, perm=[0, 2, 1])   # (B, L, 2*D)
        v_gate = self.sgu_final(v_proj_T)                 # (B, L, 2*D)

        # 3. Elementwise gating
        z = u * v_gate

        # 4. Output projection (contraction) with dropout
        z = self.dropout(z, training=training)
        out = self.out_proj(z)                            # (B, L, D)

        # 5. Residual connection
        return residual + out
|
| 235 |
|
|
|
|
| 236 |
class LoU(layers.Layer):
|
| 237 |
def __init__(self, d_model, clip_value=5.0, eps=1e-6):
|
| 238 |
super().__init__()
|
|
|
|
| 191 |
a = self.alpha(x)
|
| 192 |
y = a * x + (1.0 - a) * z
|
| 193 |
return y
|
|
|
|
| 194 |
class gMLPBlock(layers.Layer):
    """gMLP block: pre-norm channel expansion, a Spatial Gating Unit (SGU)
    that mixes information along the sequence axis, an output projection,
    and a residual connection.

    Args:
        d_model: channel dimension of the input/output.
        seq_len: sequence length (the SGU projects along this axis, so
            inputs must have exactly this length).
        dropout: dropout rate applied before the output projection.
    """

    def __init__(self, d_model, seq_len, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.norm = layers.LayerNormalization(epsilon=1e-6)

        # Expand channels to d_model * 4
        self.channel_proj = layers.Dense(d_model * 4, use_bias=True)
        self.dropout = layers.Dropout(dropout)

        # Spatial Gating Unit (SGU)
        self.sgu_norm = layers.LayerNormalization(epsilon=1e-6)
        self.sgu_proj = layers.Dense(seq_len, use_bias=False)

        # Fix: output dimension set to d_model * 2 (u's width) so the
        # gate is shape-compatible with u in the elementwise product.
        self.sgu_final = layers.Dense(d_model * 2, use_bias=True)

        self.out_proj = layers.Dense(d_model, use_bias=True)

    def call(self, x, training=False):
        residual = x
        x = self.norm(x)
        x = self.channel_proj(x)  # Shape: (B, L, 4*D)

        u, v = tf.split(x, 2, axis=-1)  # u, v Shape: (B, L, 2*D)

        # SGU: project along the sequence axis (transpose so the Dense
        # acts on L rather than the channel dimension).
        v_norm = self.sgu_norm(v)
        v_norm_T = tf.transpose(v_norm, perm=[0, 2, 1])  # Shape: (B, 2*D, L)
        v_proj = self.sgu_proj(v_norm_T)  # Shape: (B, 2*D, L)
        v_proj_T = tf.transpose(v_proj, perm=[0, 2, 1])  # Shape: (B, L, 2*D)
        v_gate = self.sgu_final(v_proj_T)  # Shape: (B, L, 2*D)

        # Gating (Shape: (B, L, 2*D) * (B, L, 2*D) -> (B, L, 2*D))
        z = u * v_gate

        # Output Projection (Contraction)
        z = self.dropout(z, training=training)
        out = self.out_proj(z)  # Shape: (B, L, D)

        return residual + out
|
| 236 |
|
| 237 |
+
|
| 238 |
class LoU(layers.Layer):
|
| 239 |
def __init__(self, d_model, clip_value=5.0, eps=1e-6):
|
| 240 |
super().__init__()
|