Yuchan committed
Update AlphaS2S.py

AlphaS2S.py CHANGED (+6 -6)
@@ -13,7 +13,7 @@ tf.get_logger().setLevel("ERROR")
 SEED = 42
 tf.random.set_seed(SEED)
 np.random.seed(SEED)
-max_len = 200
+max_len = 128  # was set to 200 in the existing code
 batch_size = 48
 
 # TPU initialization (same as the existing code)
@@ -180,7 +180,7 @@ class SwiGLU(layers.Layer):
 class EncoderBlock(layers.Layer):
     def __init__(self, d_model, num_heads, dff, dropout=0.1):
         super().__init__()
-        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
+        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model//num_heads)
         self.ffn = SwiGLU(d_model, dff)
         self.norm1 = layers.LayerNormalization(epsilon=1e-6)
         self.norm2 = layers.LayerNormalization(epsilon=1e-6)
@@ -195,8 +195,8 @@ class EncoderBlock(layers.Layer):
 class DecoderBlock(layers.Layer):
     def __init__(self, d_model, num_heads, dff, dropout=0.1):
         super().__init__()
-        self.self_mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
-        self.cross_mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
+        self.self_mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model//num_heads)
+        self.cross_mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model//num_heads)
         self.ffn = SwiGLU(d_model, dff)
         self.norm1 = layers.LayerNormalization(epsilon=1e-6)
         self.norm2 = layers.LayerNormalization(epsilon=1e-6)
@@ -213,7 +213,7 @@ class DecoderBlock(layers.Layer):
         return self.norm3(out2 + ffn_out)
 
 class Transformer(tf.keras.Model):
-    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, max_len=
+    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, max_len=128, dropout=0.1):
         super().__init__()
         self.max_len = max_len
         self.d_model = d_model
@@ -271,7 +271,7 @@ def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):
 
 with strategy.scope():
     # ⚠️ Fix: use the defined vocab_size instead of chat_vocab_size
-    chat_model = Transformer(num_layers=2, d_model=
+    chat_model = Transformer(num_layers=2, d_model=160, num_heads=4, dff=640, input_vocab_size=vocab_size, target_vocab_size=evocab_size, max_len=128, dropout=0.1)
 
     dummy_input = {
         "enc_inputs": tf.zeros((1, max_len), dtype=tf.int32),
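The change that repeats across EncoderBlock and DecoderBlock is the key_dim argument. In Keras, MultiHeadAttention's key_dim is the size of each attention head, not the model width, so passing d_model made every head as wide as the whole model and inflated the query/key/value projections by roughly a factor of num_heads; d_model // num_heads restores the usual per-head split. A minimal sketch of the corrected usage, reusing the d_model=160, num_heads=4, and max_len=128 values from this commit (the dummy tensor is only for illustration):

```python
import tensorflow as tf
from tensorflow.keras import layers

d_model, num_heads, max_len = 160, 4, 128   # values from the chat_model call in this commit

# key_dim is the per-head projection size, so d_model // num_heads keeps the
# total attention width at d_model (4 heads x 40 dims = 160).
mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)

x = tf.zeros((1, max_len, d_model))   # (batch, seq_len, d_model) dummy activations
out = mha(query=x, value=x, key=x)    # self-attention over the dummy sequence
print(out.shape)                      # (1, 128, 160): output width stays at d_model
```

Both settings produce the same output shape, since the layer projects back to the query's last dimension; the fix changes parameter count and head geometry rather than the interface.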
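As a quick consistency check on the new chat_model configuration: d_model=160 splits evenly into 4 heads of 40 dimensions, dff=640 is the conventional 4 × d_model feed-forward width, and max_len=128 now agrees with the module-level max_len used for the (1, max_len) dummy enc_inputs. A small sketch of that arithmetic (the assertions are illustrative, not part of the commit):

```python
# Hyperparameters as passed to Transformer(...) in this commit
num_layers, d_model, num_heads, dff, max_len = 2, 160, 4, 640, 128

assert d_model % num_heads == 0   # 160 / 4 = 40 dimensions per head
assert dff == 4 * d_model         # feed-forward width follows the usual 4x convention
assert max_len == 128             # matches the (1, max_len) dummy enc_inputs shape
```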