Yuchan committed on
Commit
fc704cf
·
verified ·
1 Parent(s): aeb1443

Update Mo.py

Browse files
Files changed (1) hide show
  1. Mo.py +16 -22
Mo.py CHANGED
@@ -125,57 +125,51 @@ class SwiGLU(layers.Layer):
125
  return self.out(x_val * tf.nn.silu(x_gate))
126
 
127
  class LoU(layers.Layer):
128
- def __init__(self, d_model, clip_value=5.0, eps=1e-6, dropout_rate=0.1):
129
  super().__init__()
130
  self.d_model = d_model
131
  self.clip_value = float(clip_value)
132
  self.eps = float(eps)
133
- self.dropout_rate = dropout_rate
134
-
135
  self.Q = layers.Dense(d_model, dtype='float32')
136
  self.K = layers.Dense(d_model, dtype='float32')
137
  self.V = layers.Dense(d_model, dtype='float32')
138
-
139
  self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
140
- self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
 
141
  self.glu = SwiGLU(d_model, 320)
142
- self.dropout = layers.Dropout(dropout_rate)
143
-
144
- def call(self, x, training=False):
145
  x_f32 = tf.cast(x, tf.float32)
146
  residual = x_f32
147
-
148
- x_f32 = self.norm1(x_f32)
149
  q = self.Q(x_f32)
150
  k = self.K(x_f32)
151
  V = self.V(x_f32)
152
-
153
- # gating
154
  g_q = (tf.nn.tanh(q) + 1.0) / 2.0
155
  g_k = (tf.nn.tanh(k) + 1.0) / 2.0
156
-
157
- # cumulative score
158
  score = g_q * g_k
159
- score = tf.cumsum(score, axis=1)
 
160
 
 
161
  seq_len = tf.shape(score)[1]
 
162
  count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
163
  count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))
 
 
164
  score_mean = score / count_for_mean
165
 
166
- # normalization + softmax-ish
167
  denom = tf.maximum(score_mean, self.eps)
168
  score_norm = score / denom
169
- score_norm = tf.nn.softmax(score_norm, axis=1)
170
-
171
- # clipping + dropout
172
  score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
173
- score_clipped = self.dropout(score_clipped, training=training)
174
-
175
  x_comb = score_clipped * V
 
176
  out = self.norm(x_comb + residual)
177
  out = self.glu(out)
178
-
179
  return tf.cast(out, x.dtype)
180
 
181
 
 
125
  return self.out(x_val * tf.nn.silu(x_gate))
126
 
127
  class LoU(layers.Layer):
128
+ def __init__(self, d_model, clip_value=5.0, eps=1e-6):
129
  super().__init__()
130
  self.d_model = d_model
131
  self.clip_value = float(clip_value)
132
  self.eps = float(eps)
 
 
133
  self.Q = layers.Dense(d_model, dtype='float32')
134
  self.K = layers.Dense(d_model, dtype='float32')
135
  self.V = layers.Dense(d_model, dtype='float32')
 
136
  self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
137
+ self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
138
+
139
  self.glu = SwiGLU(d_model, 320)
140
+ def call(self, x):
 
 
141
  x_f32 = tf.cast(x, tf.float32)
142
  residual = x_f32
143
+ x_f32 = self.norm1(x)
144
+
145
  q = self.Q(x_f32)
146
  k = self.K(x_f32)
147
  V = self.V(x_f32)
 
 
148
  g_q = (tf.nn.tanh(q) + 1.0) / 2.0
149
  g_k = (tf.nn.tanh(k) + 1.0) / 2.0
 
 
150
  score = g_q * g_k
151
+
152
+ score = tf.cumsum(score, axis=1) # (B, L, D)
153
 
154
+ # 💡 Modified part: normalize by the mean of the cumulative sum up to the current token
155
  seq_len = tf.shape(score)[1]
156
+ # Expand [1, 2, 3, ..., L] along the D_model dimension
157
  count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
158
  count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))
159
+
160
+ # Divide the cumulative sum by the number of tokens seen so far to get the mean cumulative sum (B, L, D)
161
  score_mean = score / count_for_mean
162
 
163
+ # Set the normalization denominator
164
  denom = tf.maximum(score_mean, self.eps)
165
  score_norm = score / denom
166
+ # -----------------------------------------------
167
+
 
168
  score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
 
 
169
  x_comb = score_clipped * V
170
+
171
  out = self.norm(x_comb + residual)
172
  out = self.glu(out)
 
173
  return tf.cast(out, x.dtype)
174
 
175