Yuchan
committed on
Update Model.py
Model.py
CHANGED
@@ -158,7 +158,6 @@ class LoSoU(layers.Layer):
         self.K = layers.Dense(96, dtype='float32')
         self.V = Lo(d_model) # Lo already handles casting to model dtype; we'll cast back to float32
         self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
-        self.O = layers.Dense(d_model, dtype='float32')
         self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
 
         # layers for computing the dynamic alpha
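Context for the dtype pins in this hunk: under a Keras mixed-precision policy, layers compute in float16 by default, and LayerNormalization is numerically sensitive to that. A minimal standalone sketch (not from Model.py; assumes TF 2.4+ with tf.keras.mixed_precision) of how a float32-pinned layer behaves under a mixed_float16 policy:

import tensorflow as tf
from tensorflow.keras import layers, mixed_precision

mixed_precision.set_global_policy('mixed_float16')   # compute dtype becomes float16

dense = layers.Dense(64)                                         # follows the global policy
norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')  # pinned, as in this commit

x = tf.random.normal((2, 16, 64))
h = dense(x)   # output is float16 under the policy
y = norm(h)    # input auto-cast to float32; output stays float32
print(h.dtype, y.dtype)                              # float16 float32

mixed_precision.set_global_policy('float32')         # reset so the sketch has no side effects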
@@ -242,17 +241,7 @@ class LoSoU(layers.Layer):
         x_comb = score_clipped * V # (B, L, d_model)
 
         out = self.proj(x_comb) # (B, L, d_model)
-
-        # ensure out dim even for split
-        d = out.shape[-1] # this is an int (static shape)
-        if d is not None and d % 2 == 1:
-            out = tf.pad(out, [[0,0],[0,0],[0,1]])
-
-        a, b = tf.split(out, 2, axis=-1)
-        gated = tf.nn.silu(a) * b
-        out = self.O(gated)
-
-        out = self.norm(out + residual)
+        out = self.norm(out)
 
         # cast back to original dtype for downstream layers
         return tf.cast(out, x.dtype)
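For reference, the path removed above was a GLU-style gate: pad the projection output to an even channel width, split it in half, gate one half with SiLU of the other, re-project through self.O, and apply a residual LayerNorm. Note the replacement line also drops the + residual term, not just the gate, so this is a behavioral change rather than a pure simplification. A minimal standalone sketch of just the gating step (the surrounding projections and residual are omitted):

import tensorflow as tf

def silu_gate(out: tf.Tensor) -> tf.Tensor:
    """GLU-style gate over the channel dim, matching the removed logic."""
    d = out.shape[-1]                                # static channel count
    if d is not None and d % 2 == 1:
        out = tf.pad(out, [[0, 0], [0, 0], [0, 1]])  # pad last dim to an even width
    a, b = tf.split(out, 2, axis=-1)
    return tf.nn.silu(a) * b                         # (B, L, d_even // 2)

x = tf.random.normal((2, 8, 7))    # odd channel dim exercises the padding branch
print(silu_gate(x).shape)          # (2, 8, 4)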
@@ -271,10 +260,9 @@ class ReLaM(tf.keras.Model):
     def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
         super().__init__()
         self.token_embedding = layers.Embedding(vocab_size, 128)
-        self.pos_embedding = layers.Embedding(max_seq_len,
+        self.pos_embedding = layers.Embedding(max_seq_len, 128)
         self.blocks = [Block(d_model, hyper_n=1) for _ in range(n_layers)]
         self.proj = layers.Dense(128)
-        # do LayerNormalization in float32 to avoid precision issues
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype="float32")
 
     def call(self, x, training=False):
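The second fix completes the previously truncated pos_embedding line so it matches token_embedding's width of 128. The call() body is outside this diff's context, so how the two embeddings are combined is not shown; the sketch below is a typical pattern for learned token plus position embeddings, with all sizes and the addition itself being assumptions rather than code from Model.py:

import tensorflow as tf
from tensorflow.keras import layers

vocab_size, max_seq_len = 1000, 128                 # hypothetical sizes
token_embedding = layers.Embedding(vocab_size, 128)
pos_embedding = layers.Embedding(max_seq_len, 128)  # the line this commit completes

x = tf.random.uniform((2, 16), maxval=vocab_size, dtype=tf.int32)  # (B, L) token ids
positions = tf.range(tf.shape(x)[1])                               # (L,)
h = token_embedding(x) + pos_embedding(positions)                  # broadcasts to (B, L, 128)
print(h.shape)                                                     # (2, 16, 128)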