Yuchan
committed on
Update AlphaS2S.py
Browse files- AlphaS2S.py +47 -41
AlphaS2S.py
CHANGED
|
@@ -182,62 +182,68 @@ class SwiGLU(layers.Layer):
|
|
| 182 |
x_proj = self.proj(x)
|
| 183 |
x_val, x_gate = tf.split(x_proj, 2, axis=-1)
|
| 184 |
return self.out(x_val * tf.nn.silu(x_gate))
|
| 185 |
-
|
| 186 |
-
class
|
| 187 |
-
def __init__(self, d_model,
|
| 188 |
super().__init__()
|
| 189 |
self.d_model = d_model
|
| 190 |
-
self.
|
| 191 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
-
#
|
| 194 |
-
|
|
|
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
kernel_size=kernel_size,
|
| 201 |
-
padding='same',
|
| 202 |
-
activation='relu',
|
| 203 |
-
dilation_rate=rate, # ๋๋ ์ด์
๋ ์ดํธ ์ ์ฉ
|
| 204 |
-
name=f"dconv_{i+1}_rate_{rate}"
|
| 205 |
-
)
|
| 206 |
-
self.conv_layers.append(conv)
|
| 207 |
|
| 208 |
-
def call(self, x):
|
| 209 |
-
#
|
| 210 |
residual = x
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
#
|
| 213 |
-
|
| 214 |
|
| 215 |
-
#
|
| 216 |
-
|
|
|
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
|
| 222 |
-
#
|
| 223 |
-
#
|
| 224 |
-
|
|
|
|
|
|
|
| 225 |
|
| 226 |
-
#
|
| 227 |
-
|
|
|
|
|
|
|
| 228 |
|
| 229 |
-
|
|
|
|
| 230 |
|
| 231 |
class CrossBlock(layers.Layer):
|
| 232 |
-
def __init__(self
|
| 233 |
super().__init__()
|
| 234 |
-
# ๐ก ์์ : ์ถ๋ ฅ ์ฐจ์์ 1์์ d_model๋ก ๋ณ๊ฒฝ
|
| 235 |
-
self.alpha = layers.Dense(d_model, activation='sigmoid', dtype='float32')
|
| 236 |
def call(self, x, z):
|
| 237 |
# a์ shape: (Batch, Seq_len, D_model)
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
y =
|
| 241 |
return y
|
| 242 |
|
| 243 |
class LoU(layers.Layer):
|
|
@@ -254,7 +260,7 @@ class LoU(layers.Layer):
|
|
| 254 |
|
| 255 |
self.alpha_linear = layers.Dense(1, activation='sigmoid', dtype='float32')
|
| 256 |
self.glu = SwiGLU(d_model, d_model)
|
| 257 |
-
self.cross = CrossBlock(
|
| 258 |
|
| 259 |
def _ema_over_time(self, score, alpha_dynamic):
|
| 260 |
seq = tf.transpose(score, perm=[1, 0, 2])
|
|
@@ -320,7 +326,7 @@ class AlphaS2S(tf.keras.Model):
|
|
| 320 |
self.dec_pos_embedding = layers.Embedding(max_len, d_model)
|
| 321 |
|
| 322 |
# EncoderBlock๊ณผ LoU๋ ๊ธฐ์กด ์ฝ๋์ ๋์ผํ ๊ตฌ์กฐ
|
| 323 |
-
self.enc_layers = [
|
| 324 |
self.dec_layers = [LoU(d_model) for _ in range(num_layers)]
|
| 325 |
|
| 326 |
self.final_layer = layers.Dense(target_vocab_size, use_bias=False)
|
|
|
|
| 182 |
x_proj = self.proj(x)
|
| 183 |
x_val, x_gate = tf.split(x_proj, 2, axis=-1)
|
| 184 |
return self.out(x_val * tf.nn.silu(x_gate))
|
| 185 |
+
|
| 186 |
+
class gMLPBlock(layers.Layer):
    """gMLP-style encoder block with a Spatial Gating Unit (SGU).

    Maps (batch, seq_len, d_model) -> (batch, seq_len, d_model) and wraps
    the whole transform in a residual connection.

    NOTE(review): `sgu_proj` is a Dense applied along the sequence axis,
    so inputs must have a time dimension of exactly `seq_len` — confirm
    this holds at every call site (e.g. padded-to-max-len batches).
    """

    def __init__(self, d_model, seq_len, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len

        # Pre-norm followed by channel expansion to 4 * d_model.
        self.norm = layers.LayerNormalization(epsilon=1e-6)
        self.channel_proj = layers.Dense(d_model * 4, use_bias=True)
        self.dropout = layers.Dropout(dropout)

        # Spatial Gating Unit: normalize the gate stream, mix tokens along
        # the sequence axis, then project the gate to 2 * d_model so it
        # matches the width of the value stream U.
        self.sgu_norm = layers.LayerNormalization(epsilon=1e-6)
        self.sgu_proj = layers.Dense(seq_len, use_bias=False)
        self.sgu_final = layers.Dense(d_model * 2, use_bias=True)

        # Contraction back to d_model.
        self.out_proj = layers.Dense(d_model, use_bias=True)

    def call(self, x, training=False):
        shortcut = x

        # 1. Norm + channel expansion: (B, L, D) -> (B, L, 4D).
        expanded = self.channel_proj(self.norm(x))

        # 2. Split into value stream `u` and gate stream `v`, each (B, L, 2D).
        u, v = tf.split(expanded, 2, axis=-1)

        # 3. SGU token mixing: apply the Dense over the sequence axis via a
        # transpose sandwich, (B, L, 2D) -> (B, 2D, L) -> (B, 2D, L) -> (B, L, 2D).
        mixed = tf.transpose(
            self.sgu_proj(tf.transpose(self.sgu_norm(v), perm=[0, 2, 1])),
            perm=[0, 2, 1],
        )

        # 4. GELU on the value stream; linear gate derived from the mixed
        # stream. (Standard gMLP gates with V directly; this variant adds
        # an extra projection `sgu_final` first.)
        gated = tf.nn.gelu(u) * self.sgu_final(mixed)

        # 5. Dropout, then contract to (B, L, D).
        gated = self.dropout(gated, training=training)

        # 6. Residual connection around the whole block.
        return shortcut + self.out_proj(gated)
| 237 |
|
| 238 |
class CrossBlock(layers.Layer):
    """Stateless cross-gating layer: rescales `z` by elementwise gates.

    Both gates lie in (0, 1); note (tanh(t) + 1) / 2 == sigmoid(2t).
    Shapes: x and z are broadcast-compatible, typically
    (batch, seq_len, d_model), and the output matches z.
    """

    def __init__(self):
        super().__init__()

    def call(self, x, z):
        # Gate derived from the query-side stream x.
        gate_q = (tf.nn.tanh(x) + 1.0) / 2.0
        # Gate derived from the key-side stream z itself.
        gate_k = (tf.nn.tanh(z) + 1.0) / 2.0
        # Multiplicative gating of z by both gates.
        return (gate_q * gate_k) * z
| 248 |
|
| 249 |
class LoU(layers.Layer):
|
|
|
|
| 260 |
|
| 261 |
self.alpha_linear = layers.Dense(1, activation='sigmoid', dtype='float32')
|
| 262 |
self.glu = SwiGLU(d_model, d_model)
|
| 263 |
+
self.cross = CrossBlock()
|
| 264 |
|
| 265 |
def _ema_over_time(self, score, alpha_dynamic):
|
| 266 |
seq = tf.transpose(score, perm=[1, 0, 2])
|
|
|
|
| 326 |
self.dec_pos_embedding = layers.Embedding(max_len, d_model)
|
| 327 |
|
| 328 |
# EncoderBlock and LoU keep the same structure as the original code
|
| 329 |
+
self.enc_layers = [gMLPBlock(d_model, seq_len=max_len) for _ in range(num_layers)]
|
| 330 |
self.dec_layers = [LoU(d_model) for _ in range(num_layers)]
|
| 331 |
|
| 332 |
self.final_layer = layers.Dense(target_vocab_size, use_bias=False)
|