Yuchan
committed on
Update Mo.py
Browse files
Mo.py
CHANGED
|
@@ -125,57 +125,51 @@ class SwiGLU(layers.Layer):
|
|
| 125 |
return self.out(x_val * tf.nn.silu(x_gate))
|
| 126 |
|
| 127 |
class LoU(layers.Layer):
|
| 128 |
-
def __init__(self, d_model, clip_value=5.0, eps=1e-6
|
| 129 |
super().__init__()
|
| 130 |
self.d_model = d_model
|
| 131 |
self.clip_value = float(clip_value)
|
| 132 |
self.eps = float(eps)
|
| 133 |
-
self.dropout_rate = dropout_rate
|
| 134 |
-
|
| 135 |
self.Q = layers.Dense(d_model, dtype='float32')
|
| 136 |
self.K = layers.Dense(d_model, dtype='float32')
|
| 137 |
self.V = layers.Dense(d_model, dtype='float32')
|
| 138 |
-
|
| 139 |
self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
|
| 140 |
-
self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
|
|
|
|
| 141 |
self.glu = SwiGLU(d_model, 320)
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
def call(self, x, training=False):
|
| 145 |
x_f32 = tf.cast(x, tf.float32)
|
| 146 |
residual = x_f32
|
| 147 |
-
|
| 148 |
-
|
| 149 |
q = self.Q(x_f32)
|
| 150 |
k = self.K(x_f32)
|
| 151 |
V = self.V(x_f32)
|
| 152 |
-
|
| 153 |
-
# gating
|
| 154 |
g_q = (tf.nn.tanh(q) + 1.0) / 2.0
|
| 155 |
g_k = (tf.nn.tanh(k) + 1.0) / 2.0
|
| 156 |
-
|
| 157 |
-
# cumulative score
|
| 158 |
score = g_q * g_k
|
| 159 |
-
|
|
|
|
| 160 |
|
|
|
|
| 161 |
seq_len = tf.shape(score)[1]
|
|
|
|
| 162 |
count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
|
| 163 |
count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))
|
|
|
|
|
|
|
| 164 |
score_mean = score / count_for_mean
|
| 165 |
|
| 166 |
-
#
|
| 167 |
denom = tf.maximum(score_mean, self.eps)
|
| 168 |
score_norm = score / denom
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
# clipping + dropout
|
| 172 |
score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
|
| 173 |
-
score_clipped = self.dropout(score_clipped, training=training)
|
| 174 |
-
|
| 175 |
x_comb = score_clipped * V
|
|
|
|
| 176 |
out = self.norm(x_comb + residual)
|
| 177 |
out = self.glu(out)
|
| 178 |
-
|
| 179 |
return tf.cast(out, x.dtype)
|
| 180 |
|
| 181 |
|
|
|
|
| 125 |
return self.out(x_val * tf.nn.silu(x_gate))
|
| 126 |
|
| 127 |
class LoU(layers.Layer):
    """Gated running-score mixing layer followed by a SwiGLU block.

    The input is layer-normalised and projected to query/key/value tensors.
    Query and key are squashed to (0, 1) gates, multiplied element-wise, and
    accumulated along axis 1 with a cumulative sum, so each position only
    depends on itself and earlier positions. The accumulated score is
    normalised, clipped, and used to scale the value projection; the result
    is added to the residual, normalised, and passed through ``SwiGLU``.

    All internal computation is done in float32; the output is cast back to
    the dtype of the input.

    Args:
        d_model: Width of the Q/K/V projections and of the SwiGLU block.
        clip_value: The normalised score is clipped to
            ``[-clip_value, clip_value]``.
        eps: Lower bound applied to the normalisation denominator.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, d_model, clip_value=5.0, eps=1e-6, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.clip_value = float(clip_value)
        self.eps = float(eps)

        # Projections are pinned to float32 regardless of the global policy.
        self.Q = layers.Dense(d_model, dtype='float32')
        self.K = layers.Dense(d_model, dtype='float32')
        self.V = layers.Dense(d_model, dtype='float32')

        # `norm1` is applied to the input, `norm` after the residual add.
        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
        self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')

        self.glu = SwiGLU(d_model, 320)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            'd_model': self.d_model,
            'clip_value': self.clip_value,
            'eps': self.eps,
        })
        return config

    def call(self, x):
        """Apply the layer.

        Args:
            x: Input tensor; assumed to be (batch, seq_len, d_model), with
                axis 1 as the sequence axis -- TODO confirm against callers.

        Returns:
            A tensor with the same dtype as ``x``.
        """
        x_f32 = tf.cast(x, tf.float32)
        residual = x_f32
        # Normalise the float32 copy (not the raw input) so the explicit cast
        # above is what feeds the rest of the computation.
        x_f32 = self.norm1(x_f32)

        q = self.Q(x_f32)
        k = self.K(x_f32)
        v = self.V(x_f32)

        # Map tanh's (-1, 1) range onto (0, 1) gates.
        g_q = (tf.nn.tanh(q) + 1.0) / 2.0
        g_k = (tf.nn.tanh(k) + 1.0) / 2.0

        # Running (inclusive) sum of the gate product along the sequence axis.
        score = tf.cumsum(g_q * g_k, axis=1)  # (B, L, D)

        # Normalise by the mean of the cumulative sum up to the current token.
        seq_len = tf.shape(score)[1]
        # Token counts [1, 2, ..., L], shaped to broadcast over batch and
        # feature axes.
        count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
        count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))

        # Cumulative sum divided by the number of tokens seen so far.
        score_mean = score / count_for_mean

        # Normalisation denominator, floored at eps to avoid division by ~0.
        denom = tf.maximum(score_mean, self.eps)
        # NOTE(review): whenever score_mean >= eps this ratio is
        # score / (score / count) == count, i.e. the 1-based position index,
        # so after clipping the gate is min(position, clip_value) and barely
        # depends on q/k. Confirm this is intended; behaviour is left as-is.
        score_norm = score / denom

        score_clipped = tf.clip_by_value(
            score_norm, -self.clip_value, self.clip_value
        )

        x_comb = score_clipped * v

        out = self.norm(x_comb + residual)
        out = self.glu(out)
        return tf.cast(out, x.dtype)
|
| 174 |
|
| 175 |
|