Yuchan committed
Commit feb187f · verified · 1 parent: cfbfe0f

Update Model.py

Files changed (1)
  1. Model.py +2 -14
Model.py CHANGED
@@ -158,7 +158,6 @@ class LoSoU(layers.Layer):
         self.K = layers.Dense(96, dtype='float32')
         self.V = Lo(d_model)  # Lo already handles casting to model dtype; we'll cast back to float32
         self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
-        self.O = layers.Dense(d_model, dtype='float32')
         self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
 
         # layers for computing the dynamic alpha
@@ -242,17 +241,7 @@ class LoSoU(layers.Layer):
         x_comb = score_clipped * V  # (B, L, d_model)
 
         out = self.proj(x_comb)  # (B, L, d_model)
-
-        # ensure out dim is even for the split
-        d = out.shape[-1]  # this is an int (static shape)
-        if d is not None and d % 2 == 1:
-            out = tf.pad(out, [[0, 0], [0, 0], [0, 1]])
-
-        a, b = tf.split(out, 2, axis=-1)
-        gated = tf.nn.silu(a) * b
-        out = self.O(gated)
-
-        out = self.norm(out + residual)
+        out = self.norm(out)
 
         # cast back to original dtype for downstream layers
         return tf.cast(out, x.dtype)
@@ -271,10 +260,9 @@ class ReLaM(tf.keras.Model):
     def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
         super().__init__()
         self.token_embedding = layers.Embedding(vocab_size, 128)
-        self.pos_embedding = layers.Embedding(max_seq_len, d_model)
+        self.pos_embedding = layers.Embedding(max_seq_len, 128)
         self.blocks = [Block(d_model, hyper_n=1) for _ in range(n_layers)]
         self.proj = layers.Dense(128)
-        # keep LayerNormalization in float32 to avoid precision problems
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype="float32")
 
     def call(self, x, training=False):
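
For context, the block removed in the second hunk was a SwiGLU-style gate: the projection output is padded to an even channel count, split into two halves, and one half, passed through SiLU, multiplicatively gates the other before a final Dense projection. Below is a minimal standalone sketch of that pattern; the shapes are illustrative and the name O mirrors the removed self.O layer, not the current Model.py:

import tensorflow as tf
from tensorflow.keras import layers

d_model = 95                                 # illustrative width, odd on purpose
O = layers.Dense(d_model, dtype='float32')   # output projection, as in the removed code

out = tf.random.normal([2, 8, d_model])      # hypothetical (B, L, d_model) activations

# pad the last axis to an even size so tf.split can halve it
if out.shape[-1] % 2 == 1:
    out = tf.pad(out, [[0, 0], [0, 0], [0, 1]])

a, b = tf.split(out, 2, axis=-1)             # two (B, L, 48) halves
gated = tf.nn.silu(a) * b                    # SiLU(a) gates b elementwise
out = O(gated)                               # back to (B, L, d_model)

The commit replaces all of this with a plain out = self.norm(out), which also drops the + residual skip connection that the old normalization applied.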
 
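The pos_embedding change in the last hunk makes the positional table 128-wide to match the token embedding, which was already 128-dimensional, instead of d_model. Assuming call() adds the two embeddings (the usual pattern; its body is not shown in this diff), the last dimensions must agree for the elementwise sum, as this hypothetical shape check illustrates:

import tensorflow as tf
from tensorflow.keras import layers

vocab_size, max_seq_len = 1000, 64           # hypothetical sizes
tok_emb = layers.Embedding(vocab_size, 128)
pos_emb = layers.Embedding(max_seq_len, 128) # was (max_seq_len, d_model); widths now match

ids = tf.zeros([2, 16], dtype=tf.int32)      # (B, L) token ids
positions = tf.range(16)[tf.newaxis, :]      # (1, L) position ids, broadcast over batch

x = tok_emb(ids) + pos_emb(positions)        # (2, 16, 128): the add would fail if the widths differed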