Yuchan committed on
Update Model.py
Model.py CHANGED

@@ -120,118 +120,154 @@ dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
  print("✅ TF Dataset creation complete!")

- class
      def __init__(self, d_model):
          super().__init__()
          # keep internal computations in float32
          self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
          self.p = layers.Dense(128, use_bias=True, dtype='float32')
          self._out_dtype = 'float32'
-
-     self.ln1 = layers.LayerNormalization(epsilon=1e-5, dtype="float32")
-
      def call(self, x):
          # x may be bfloat16; cast to float32 for stable intermediate computation
          x_f32 = tf.cast(x, tf.float32)
-
-         x_f32 = self.ln(x_f32)
-         x = self.p(x_f32)
          x = tf.nn.gelu(x)
-         x = self.
-         x = self.ln1(x) + re
          # cast back to model dtype for consistency
          return tf.cast(x, self._out_dtype)

- class
          super().__init__()
-
-         self.
-         self.
-         self.

      def call(self, x):
-         x
-         #
-         #
-     def
-
-         return config
-
- class Respiso(tf.keras.Model):
      def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
          super().__init__()
          self.token_embedding = layers.Embedding(vocab_size, d_model)
-         self.
-         self.
-
          self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype="float32")
-         self.lm_head = layers.Dense(vocab_size, use_bias=False)

      def call(self, x, training=False):
-
-         x = self.
          x = self.ln_f(x)
-
          return tf.cast(logits, tf.float32)

- loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

  def masked_loss(y_true, y_pred):
      loss = loss_fn(y_true, y_pred)

@@ -254,7 +290,7 @@ def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):
  )

  # create the model
- model =
      vocab_size=vocab_size,
      max_seq_len=max_len,
      d_model=256,
  print("✅ TF Dataset creation complete!")

+ class Lo(layers.Layer):
      def __init__(self, d_model):
          super().__init__()
          # keep internal computations in float32
          self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
          self.p = layers.Dense(128, use_bias=True, dtype='float32')
          self._out_dtype = 'float32'
+
      def call(self, x):
          # x may be bfloat16; cast to float32 for stable intermediate computation
          x_f32 = tf.cast(x, tf.float32)
+         x = self.proj(x_f32)
          x = tf.nn.gelu(x)
+         x = self.p(x)
          # cast back to model dtype for consistency
          return tf.cast(x, self._out_dtype)
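# Note (editor's illustration, not part of the commit): Lo's output width is set
# by the second Dense, not by d_model; the data path is Dense(d_model) -> GELU
# -> Dense(128), so Lo always emits 128 features. LoSoU below relies on this to
# align V with its 128-dim Q/K gates.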

+ class LoSoU(layers.Layer):
+     """
+     Stabilized LoSoU layer
+     - Uses an exponential moving average (EMA) over time instead of a cumulative sum (alpha: smoothing factor)
+     - Runs internal computations in float32 (better stability with TPU bfloat16)
+     - Clips the EMA result and applies a small epsilon
+     - Safe split handling (assumes an even dimension; otherwise the last dimension needs padding)
+     """
+     def __init__(self, d_model, alpha=0.15, clip_value=5.0, eps=1e-6):
          super().__init__()
+         # most operations run in float32
+         self.d_model = d_model
+         self.alpha = float(alpha)
+         self.clip_value = float(clip_value)
+         self.eps = float(eps)
+
+         # projection / gating layers in float32
+         self.Q = layers.Dense(128, dtype='float32')
+         self.K = layers.Dense(128, dtype='float32')
+         # V is a Lo block; its Dense(128) bottleneck matches the 128-dim Q/K gates
+         self.V = Lo(d_model)  # Lo already handles casting to model dtype; we'll cast back to float32
+         self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
+         self.O = layers.Dense(d_model, dtype='float32')
+         self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
+
+     def _ema_over_time(self, score):
+         # score: (B, L, D) float32, roughly in [0, 1]
+         alpha = tf.constant(self.alpha, dtype=score.dtype)
+
+         # transpose to (L, B, D) to scan over time steps
+         seq = tf.transpose(score, perm=[1, 0, 2])
+
+         def step(prev_ema, x_t):
+             # prev_ema: (B, D), x_t: (B, D)
+             new = alpha * x_t + (1.0 - alpha) * prev_ema
+             return new
+
+         # initialize the EMA with the value of the first step
+         init = seq[0]
+
+         ema_seq = tf.scan(fn=step, elems=seq[1:], initializer=init)
+         ema_seq = tf.concat([tf.expand_dims(init, 0), ema_seq], axis=0)  # (L, B, D)
+
+         # transpose back to (B, L, D)
+         ema = tf.transpose(ema_seq, perm=[1, 0, 2])
+         return ema
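# Worked example of the EMA recurrence above (illustrative numbers, not from the
# commit): with alpha = 0.15 and a single scalar feature x = [1.0, 0.0, 0.0],
#   ema_0 = x_0                     = 1.0
#   ema_1 = 0.15*0.0 + 0.85*ema_0   = 0.85
#   ema_2 = 0.15*0.0 + 0.85*ema_1   = 0.7225
# so past scores decay geometrically instead of growing without bound, as the
# cumulative sum in the previous revision could.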
+
      def call(self, x):
+         # x: (B, L, d_model), possibly bfloat16 or float32
+         # cast to float32 for all internal computations
+         x_f32 = tf.cast(x, tf.float32)
+         residual = x_f32
+
+         # Q, K, V
+         q = self.Q(x_f32)  # (B, L, 128)
+         k = self.K(x_f32)  # (B, L, 128)
+         V = tf.cast(self.V(x), tf.float32)  # ensure V's output is float32; shape (B, L, 128)
+
+         # gating signals in (0, 1)
+         g_q = tf.nn.sigmoid(q)
+         g_k = tf.nn.sigmoid(k)
+
+         # elementwise product -> bounded roughly to [0, 1]
+         score = g_q * g_k
+
+         # EMA across time (stable alternative to cumsum)
+         score_ema = self._ema_over_time(score)
+
+         # optionally normalize by (mean + eps) across the last dim to reduce scale variations
+         mean_last = tf.reduce_mean(score_ema, axis=-1, keepdims=True)  # (B, L, 1)
+         denom = tf.maximum(mean_last, self.eps)
+         score_norm = score_ema / denom
+
+         # clip to avoid extremes
+         score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
+
+         # combine with V
+         x_comb = score_clipped * V  # (B, L, 128)
+
+         out = self.proj(x_comb)  # (B, L, d_model)
+
+         # ensure the output dim is even for the split
+         d = out.shape[-1]  # this is an int (static shape)
+         if d is not None and d % 2 == 1:
+             out = tf.pad(out, [[0, 0], [0, 0], [0, 1]])
+
+         a, b = tf.split(out, 2, axis=-1)
+         gated = tf.nn.silu(a) * b
+         out = self.O(gated)
+
+         out = self.norm(out + residual)
+
+         # cast back to the original dtype for downstream layers
+         return tf.cast(out, x.dtype)
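# Shape walk for one LoSoU call (derived from the code above): x (B, L, d_model)
# -> q, k (B, L, 128) -> score = sigmoid(q) * sigmoid(k) (B, L, 128) -> EMA over
# L -> * V (B, L, 128) -> proj (B, L, d_model) -> split + SiLU gate -> O
# -> (B, L, d_model), then residual add and LayerNorm. A minimal smoke test,
# assuming Model.py's usual imports (import tensorflow as tf; from
# tensorflow.keras import layers):
#   y = LoSoU(d_model=256)(tf.random.normal([2, 16, 256]))  # -> (2, 16, 256)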
+
+ class Block(layers.Layer):
+     def __init__(self, d_model, r, hyper_n, num_heads, num_groups):
+         super().__init__()
+         # r, num_heads, and num_groups are accepted but unused in this revision
+         self.losou = [LoSoU(d_model) for _ in range(hyper_n)]
+
+     def call(self, x):
+         for losou in self.losou:
+             x = losou(x)
+         return x
+
+ class ReLaM(tf.keras.Model):
      def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
          super().__init__()
          self.token_embedding = layers.Embedding(vocab_size, d_model)
+         self.pos_embedding = layers.Embedding(max_seq_len, d_model)
+         self.blocks = [Block(d_model, r=204, hyper_n=3, num_heads=8, num_groups=2) for _ in range(n_layers)]
+
+         # keep LayerNormalization in float32 to avoid precision issues
          self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype="float32")

      def call(self, x, training=False):
+         batch_size, seq_len = tf.shape(x)[0], tf.shape(x)[1]
+         positions = tf.range(seq_len)[tf.newaxis, :]
+
+         x = self.token_embedding(x) + self.pos_embedding(positions)
+         for block in self.blocks:
+             x = block(x)
+
          x = self.ln_f(x)
+
+         embedding_matrix = tf.cast(self.token_embedding.embeddings, x.dtype)
+         logits = tf.matmul(x, embedding_matrix, transpose_b=True)
          return tf.cast(logits, tf.float32)
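# Note: the removed lm_head Dense is replaced by weight tying; logits are
# computed against the transposed token-embedding matrix, which saves
# vocab_size * d_model head parameters while keeping the logits in float32.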

  def masked_loss(y_true, y_pred):
      loss = loss_fn(y_true, y_pred)

@@ -254,7 +290,7 @@ def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):
  )

  # create the model
+ model = ReLaM(
      vocab_size=vocab_size,
      max_seq_len=max_len,
      d_model=256,
|