Yuchan committed on
Commit
feb8044
·
verified ·
1 Parent(s): d4d9fe3

Update Mo.py

Browse files
Files changed (1) hide show
  1. Mo.py +22 -23
Mo.py CHANGED
@@ -130,13 +130,22 @@ class LoU(layers.Layer):
130
  self.d_model = d_model
131
  self.clip_value = float(clip_value)
132
  self.eps = float(eps)
 
 
133
  self.Q = layers.Dense(d_model, dtype='float32')
134
  self.K = layers.Dense(d_model, dtype='float32')
135
  self.V = layers.Dense(d_model, dtype='float32')
 
 
136
  self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
137
  self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
138
-
 
139
  self.glu = SwiGLU(d_model, 3500)
 
 
 
 
140
  def call(self, x):
141
  x_f32 = tf.cast(x, tf.float32)
142
  residual = x_f32
@@ -145,34 +154,24 @@ class LoU(layers.Layer):
145
  q = self.Q(x_f32)
146
  k = self.K(x_f32)
147
  V = self.V(x_f32)
 
148
  g_q = (tf.nn.tanh(q) + 1.0) / 2.0
149
  g_k = (tf.nn.tanh(k) + 1.0) / 2.0
150
- score = g_q * g_k
151
 
152
- score = tf.cumsum(score, axis=1) # (B, L, D)
153
-
154
- # 💡 수정된 부분: 현재 토큰까지의 누적합 평균으로 정규화 (modified part: normalize by the mean of the cumulative sum up to the current token)
155
- seq_len = tf.shape(score)[1]
156
- # [1, 2, 3, ..., L]을 D_model 차원으로 확장 (expand [1, 2, 3, ..., L] to the D_model dimension)
157
- count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
158
- count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))
159
-
160
- # 누적합을 현재까지의 토큰 개수로 나누어 평균 누적합 계산 (B, L, D) (divide the cumulative sum by the number of tokens so far to get the mean cumulative sum)
161
- score_mean = score / count_for_mean
162
-
163
- # 정규화 분모 설정 (set the normalization denominator)
164
- denom = tf.maximum(score_mean, self.eps)
165
- score_norm = score / denom
166
- # -----------------------------------------------
167
-
168
- score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
169
- x_comb = score_clipped * V
170
-
171
- out = self.norm(x_comb + residual)
172
  out = self.glu(out)
173
  return tf.cast(out, x.dtype)
174
 
175
-
176
  class Lo(layers.Layer):
177
  def __init__(self, d_model):
178
  super().__init__()
 
130
  self.d_model = d_model
131
  self.clip_value = float(clip_value)
132
  self.eps = float(eps)
133
+
134
+ # Q/K/V 변환 (Q/K/V projections)
135
  self.Q = layers.Dense(d_model, dtype='float32')
136
  self.K = layers.Dense(d_model, dtype='float32')
137
  self.V = layers.Dense(d_model, dtype='float32')
138
+
139
+ # 정규화 (normalization)
140
  self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
141
  self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
142
+
143
+ # 비선형 표현력 (nonlinear expressiveness)
144
  self.glu = SwiGLU(d_model, 3500)
145
+
146
+ # 학습 가능한 과거 토큰 가중치 (learnable weights for past tokens)
147
+ self.alpha = self.add_weight(shape=(d_model,), initializer='ones', trainable=True)
148
+
149
  def call(self, x):
150
  x_f32 = tf.cast(x, tf.float32)
151
  residual = x_f32
 
154
  q = self.Q(x_f32)
155
  k = self.K(x_f32)
156
  V = self.V(x_f32)
157
+
158
  g_q = (tf.nn.tanh(q) + 1.0) / 2.0
159
  g_k = (tf.nn.tanh(k) + 1.0) / 2.0
 
160
 
161
+ # 과거 토큰 가중치 반영 점수 (score incorporating the past-token weights)
162
+ score = g_q * g_k * self.alpha # element-wise scaling
163
+ # 누적합 대신 가중 평균 (weighted average instead of a plain cumulative sum)
164
+ # score_t = sum_{i=0}^{t} alpha_i * V_i / sum_{i=0}^{t} alpha_i
165
+ score_cum = tf.math.cumsum(score * V, axis=1)
166
+ alpha_cum = tf.math.cumsum(score, axis=1)
167
+ score_weighted = score_cum / tf.maximum(alpha_cum, self.eps)
168
+
169
+ # 정규화 + 클리핑 (normalization + clipping)
170
+ score_norm = tf.clip_by_value(score_weighted, -self.clip_value, self.clip_value)
171
+ out = self.norm(score_norm + residual)
 
 
 
 
 
 
 
 
 
172
  out = self.glu(out)
173
  return tf.cast(out, x.dtype)
174
 
 
175
  class Lo(layers.Layer):
176
  def __init__(self, d_model):
177
  super().__init__()