Yuchan
committed on
Update Mo.py
Mo.py
CHANGED
@@ -6,7 +6,11 @@ import requests
 from tensorflow import keras
 from tensorflow.keras import layers
 import tensorflow.keras.backend as K
-
+# ===============================
+from tensorflow.keras import mixed_precision
+policy = mixed_precision.Policy('mixed_float16')  # fp16
+mixed_precision.set_global_policy(policy)
+print("✅ Mixed precision enabled:", policy)
 print('1')
 tf.get_logger().setLevel("ERROR")
 SEED = 42
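Note for reviewers: with mixed_float16 active, fp16 gradients can underflow, so the optimizer should be wrapped in a loss-scaling optimizer (Model.fit does this automatically under mixed_float16; a custom training loop must do it by hand). A minimal sketch, assuming an Adam optimizer and a hand-written train step, neither of which appears in this diff:

    opt = mixed_precision.LossScaleOptimizer(tf.keras.optimizers.Adam(3e-4))  # 3e-4 is an assumed LR

    @tf.function
    def train_step(x, y):
        with tf.GradientTape() as tape:
            loss = smoothed_loss_keras(y, model(x, training=True))
            scaled = opt.get_scaled_loss(loss)            # scale up so fp16 grads don't underflow
        scaled_grads = tape.gradient(scaled, model.trainable_variables)
        grads = opt.get_unscaled_gradients(scaled_grads)  # undo the scaling before applying
        opt.apply_gradients(zip(grads, model.trainable_variables))
        return loss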
@@ -63,8 +67,8 @@ unk_id = sp.piece_to_id("<unk>")
 vocab_size = sp.get_piece_size()
 print(f"✅ Vocabulary size: {vocab_size}")
 
-max_len =
-batch_size =
+max_len = 200
+batch_size = 96
 
 def text_to_ids(text):
     return sp.encode(text, out_type=int)
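For context, max_len and batch_size would typically feed a padded tf.data pipeline built on text_to_ids. The actual pipeline is outside this diff, so the sketch below (including the texts variable and the pad id of 0) is purely illustrative:

    ids = [text_to_ids(t)[:max_len] for t in texts]  # truncate to max_len
    ds = tf.data.Dataset.from_generator(
        lambda: iter(ids),
        output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32))
    ds = ds.padded_batch(batch_size, padded_shapes=(max_len,), padding_values=0)  # assumes pad id 0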
@@ -169,20 +173,22 @@ class MHLA(layers.Layer):
 class Lo(layers.Layer):
     def __init__(self, d_model):
         super().__init__()
-        self.d = layers.Dense(64, activation='silu')
-        self.w = layers.Dense(d_model)
-        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
+        self.d = layers.Dense(64, activation='silu', dtype='float16')  # fp16 compute
+        self.w = layers.Dense(d_model, dtype='float16')  # fp16 compute
+        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')  # fp32
 
     def call(self, x):
         p = self.d(x)
         p = self.w(p)
-
+        p = self.norm(p)  # fp32
+        return tf.cast(p, x.dtype) + x  # cast back to fp16 so the residual add matches dtypes
+
 
 class Block(layers.Layer):
     def __init__(self, d_model):
         super().__init__()
         self.lou = MHLA(d_model, 8)
-        self.glu = SwiGLU(d_model,
+        self.glu = SwiGLU(d_model, 1048)
         self.lo = Lo(d_model)
 
     def call(self, x):
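The Lo change follows the standard mixed-precision pattern: matmuls in fp16, normalization in fp32 (the mean/variance reduction is where fp16 loses precision), then a cast back so the residual add stays in the compute dtype. A quick illustrative check of the dtype flow, assuming the Lo class above is in scope:

    x = tf.random.normal((2, 16, 256), dtype=tf.float16)  # shapes are arbitrary
    y = Lo(256)(x)
    print(y.dtype)  # float16: the norm ran in fp32, the result was cast back before the add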
@@ -193,10 +199,10 @@ class Block(layers.Layer):
 class LaSLM(tf.keras.Model):
     def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
         super().__init__()
-        self.token_embedding = layers.Embedding(vocab_size, d_model)
-        self.pos_embedding = layers.Embedding(max_seq_len, d_model)
+        self.token_embedding = layers.Embedding(vocab_size, d_model, dtype=policy.compute_dtype)
+        self.pos_embedding = layers.Embedding(max_seq_len, d_model, dtype=policy.compute_dtype)
         self.blocks = [Block(d_model) for _ in range(n_layers)]
-        self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=
+        self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype='float32')  # ln_f stays fp32
 
     def call(self, x, training=False):
         batch_size, seq_len = tf.shape(x)[0], tf.shape(x)[1]
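One caveat worth flagging: passing dtype=policy.compute_dtype (i.e. 'float16') to a layer sets its variable dtype to fp16 as well, whereas leaving dtype unset under the global mixed_float16 policy keeps fp32 weights with fp16 compute, which is usually the safer default for trainable embeddings. An illustrative comparison, assuming the policy set at the top of the file:

    emb_policy = layers.Embedding(1000, 64)                  # mixed_float16: fp32 weights, fp16 output
    emb_fp16 = layers.Embedding(1000, 64, dtype='float16')   # fp16 weights and output
    _ = emb_policy(tf.constant([[1, 2]])), emb_fp16(tf.constant([[1, 2]]))  # build both layers
    print(emb_policy.embeddings.dtype, emb_fp16.embeddings.dtype)  # float32 float16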
@@ -207,7 +213,7 @@ class LaSLM(tf.keras.Model):
         x = self.ln_f(x)
         embedding_matrix = tf.cast(self.token_embedding.embeddings, x.dtype)
         logits = tf.matmul(x, embedding_matrix, transpose_b=True)
-        return tf.cast(logits, tf.float32)
+        return tf.cast(logits, tf.float32)  # cast to fp32 for the loss computation
 
 def smoothed_loss_keras(y_true, y_pred, eps=0.1):
     y_true = tf.cast(y_true, tf.int32)
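Returning fp32 logits is the right call: log-softmax over a vocabulary-wide axis is prone to overflow and underflow in fp16. As a cross-check for smoothed_loss_keras, label smoothing can also be expressed with Keras built-ins; this reference version is illustrative only, with eps mapping to label_smoothing:

    def smoothed_loss_reference(y_true, y_pred, eps=0.1):
        # y_pred: fp32 logits from the model; y_true: integer token ids
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=eps)
        return cce(tf.one_hot(tf.cast(y_true, tf.int32), vocab_size), y_pred)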