Yuchan committed on
Commit
ac9ca0c
·
verified ·
1 Parent(s): 87b968d

Update AlphaS2S.py

Files changed (1)
  1. AlphaS2S.py +36 -35
AlphaS2S.py CHANGED
@@ -182,49 +182,51 @@ class SwiGLU(layers.Layer):
         x_proj = self.proj(x)
         x_val, x_gate = tf.split(x_proj, 2, axis=-1)
         return self.out(x_val * tf.nn.silu(x_gate))
-
-class gMLPBlock(layers.Layer):
-    def __init__(self, d_model, seq_len, dropout=0.1):
+
+class DilatedConvBlock(layers.Layer):
+    def __init__(self, d_model, num_layers=5, kernel_size=3):
         super().__init__()
         self.d_model = d_model
-        self.seq_len = seq_len
-        self.norm = layers.LayerNormalization(epsilon=1e-6)
-
-        # Expand to d_model * 4
-        self.channel_proj = layers.Dense(d_model * 4, use_bias=True)
-        self.dropout = layers.Dropout(dropout)
+        self.kernel_size = kernel_size
+        self.conv_layers = []
 
-        # Spatial Gating Unit (SGU)
-        self.sgu_norm = layers.LayerNormalization(epsilon=1e-6)
-        self.sgu_proj = layers.Dense(seq_len, use_bias=False)
+        # Dilation rates: 1, 2, 4, 8, 16
+        dilation_rates = [2**i for i in range(num_layers)]
 
-        # 💡 Fix: project to d_model * 2 (the dimension of u) so the gate can broadcast with u
-        self.sgu_final = layers.Dense(d_model * 2, use_bias=True)
-
-        self.out_proj = layers.Dense(d_model, use_bias=True)
+        for i, rate in enumerate(dilation_rates):
+            # 💡 Keep filters=d_model, kernel_size=3, padding='same', activation='relu'
+            conv = layers.Conv1D(
+                filters=d_model,
+                kernel_size=kernel_size,
+                padding='same',
+                activation='relu',
+                dilation_rate=rate,  # apply the dilation rate
+                name=f"dconv_{i+1}_rate_{rate}"
+            )
+            self.conv_layers.append(conv)
 
-    def call(self, x, training=False):
+    def call(self, x):
+        # Keep the input x as the residual to add to the summed layer outputs
         residual = x
-        x = self.norm(x)
-        x = self.channel_proj(x)  # Shape: (B, L, 4*D)
 
-        u, v = tf.split(x, 2, axis=-1)  # u, v shape: (B, L, 2*D)
-
-        # SGU
-        v_norm = self.sgu_norm(v)
-        v_norm_T = tf.transpose(v_norm, perm=[0, 2, 1])  # Shape: (B, 2*D, L)
-        v_proj = self.sgu_proj(v_norm_T)                 # Shape: (B, 2*D, L)
-        v_proj_T = tf.transpose(v_proj, perm=[0, 2, 1])  # Shape: (B, L, 2*D)
-        v_gate = self.sgu_final(v_proj_T)                # Shape: (B, L, 2*D)
+        # Apply the five dilated convolution layers and collect their outputs
+        outputs = []
 
-        # Gating: (B, L, 2*D) * (B, L, 2*D) -> (B, L, 2*D)
-        z = u * v_gate
+        # Each layer's output is not fed into the next layer; every Conv runs
+        # independently on the original input x and the outputs are summed
+        # (a parallel/residual arrangement)
 
-        # Output projection (contraction)
-        z = self.dropout(z, training=training)
-        out = self.out_proj(z)  # Shape: (B, L, D)
+        for conv in self.conv_layers:
+            conv_out = conv(x)
+            outputs.append(conv_out)
 
-        return residual + out
+        # Sum the outputs of all five layers to get the final result
+        # (a common form of skip/residual connection)
+        final_output = tf.add_n(outputs)
 
+        # If needed, a residual connection from the input can also be added
+        final_output = final_output + residual
+
+        return final_output
 
 class CrossBlock(layers.Layer):
     def __init__(self, d_model):  # 💡 d_model argument added
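Note on the new encoder block: the five Conv1D branches all read the original input x in parallel and are summed with tf.add_n, so the receptive field is set by the widest branch alone: with kernel_size=3 and dilation_rate=16 that is 1 + 2*16 = 33 positions, versus 1 + 2*(1+2+4+8+16) = 63 if the same five layers were chained sequentially. A minimal shape check, assuming the DilatedConvBlock from the hunk above is in scope (d_model=8 and the random input are illustrative values, not from this repo):

import tensorflow as tf

block = DilatedConvBlock(d_model=8)      # branches at rates 1, 2, 4, 8, 16
x = tf.random.normal((2, 50, 8))         # (batch, seq_len, d_model)
y = block(x)
print(y.shape)                           # (2, 50, 8): padding='same' keeps seq_len,
                                         # filters=d_model keeps the channel width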
@@ -238,7 +240,6 @@ class CrossBlock(layers.Layer):
         y = a * x + (1.0 - a) * z
         return y
 
-
 class LoU(layers.Layer):
     def __init__(self, d_model, clip_value=5.0, eps=1e-6):
         super().__init__()
@@ -319,7 +320,7 @@ class AlphaS2S(tf.keras.Model):
         self.dec_pos_embedding = layers.Embedding(max_len, d_model)
 
         # EncoderBlock and LoU keep the same structure as the existing code
-        self.enc_layers = [gMLPBlock(d_model, seq_len=max_len) for _ in range(num_layers)]
+        self.enc_layers = [DilatedConvBlock(d_model) for _ in range(num_layers)]
         self.dec_layers = [LoU(d_model) for _ in range(num_layers)]
 
         self.final_layer = layers.Dense(target_vocab_size, use_bias=False)
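This swap also changes the constructor contract: gMLPBlock needed seq_len=max_len because its SGU applies Dense(seq_len) across the length axis, tying those weights to one fixed sequence length, whereas DilatedConvBlock shares its conv kernels across positions and accepts any length. A small sketch of that property, under the same assumptions as above (illustrative shapes only):

block = DilatedConvBlock(d_model=8)
for length in (10, 37, 128):                       # arbitrary test lengths
    out = block(tf.random.normal((1, length, 8)))
    print(out.shape)                               # (1, length, 8) for every length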
@@ -405,7 +406,7 @@ with strategy.scope():
             masked_perplexity
         ]
     )
-
+    chat_model.summary()
     print("✅ Model compiled, starting training...")
     # ⚠️ Run training
    history = chat_model.fit(dataset, epochs=1, verbose=1)
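One caveat on the added chat_model.summary(): a subclassed tf.keras.Model such as AlphaS2S raises a ValueError from summary() if its variables have not been built yet, so this call assumes the model was already run on a batch before compile. A hedged sketch of the usual workaround; the (1, 10) token shapes and the (enc, dec) call signature are assumptions, not taken from this file:

dummy_enc = tf.zeros((1, 10), dtype=tf.int32)    # hypothetical encoder token ids
dummy_dec = tf.zeros((1, 10), dtype=tf.int32)    # hypothetical decoder token ids
_ = chat_model((dummy_enc, dummy_dec))           # one forward pass builds the weights
chat_model.summary()                             # now prints the layer table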
 