!pip install sentencepiece
import sentencepiece as spm
import os, json, numpy as np, tensorflow as tf
from tensorflow.keras import layers, Model
import requests
from tensorflow import keras
import tensorflow.keras.backend as K
# ===============================
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16') # fp16
mixed_precision.set_global_policy(policy)
print("βœ… Mixed precision 적용:", policy)
print('1')
tf.get_logger().setLevel("ERROR")
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
# GPU / distribution strategy setup
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        strategy = tf.distribute.MirroredStrategy(devices=[f"/GPU:{i}" for i in range(len(gpus))])
        print(f"✅ Using {len(gpus)} GPU(s): {strategy.num_replicas_in_sync} replicas")
    except RuntimeError as e:
        print("⚠️ GPU configuration error:", e)
else:
    strategy = tf.distribute.get_strategy()
    print("⚠️ No GPU found, using CPU")
# =======================
# 1) File downloads
# =======================
def download_file(url, save_path):
    r = requests.get(url, stream=True)
    r.raise_for_status()
    with open(save_path, "wb") as f:
        for chunk in r.iter_content(8192 * 2):
            f.write(chunk)
    print(f"✅ {save_path} saved")
DATA_PATH = "corpus.txt"
TOKENIZER_PATH = "ko_unigram.model"
if not os.path.exists(DATA_PATH):
    download_file(
        "https://huggingface.co/datasets/Yuchan5386/1/resolve/main/shuffled_corpus.txt?download=true",
        DATA_PATH
    )
if not os.path.exists(TOKENIZER_PATH):
    download_file(
        "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
        TOKENIZER_PATH
    )
sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
start_id = sp.piece_to_id("<start>")
sep_id = sp.piece_to_id("<sep>")
end_id = sp.piece_to_id("<end>")
unk_id = sp.piece_to_id("<unk>")
vocab_size = sp.get_piece_size()
print(f"βœ… Vocabulary size: {vocab_size}")
max_len = 200
batch_size = 96
def text_to_ids(text):
    return sp.encode(text, out_type=int)

def ids_to_text(ids):
    return sp.decode(ids)

def txt_stream(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            text = line.strip()
            if not text:
                continue
            ids = text_to_ids(text)
            ids = ids[:max_len - 1]  # leave one slot for the trailing <end>
            full_input = ids + [end_id]
            pad_len = max_len - len(full_input)
            full_input += [pad_id] * pad_len
            # target = next-token shifted sequence
            target = full_input[1:] + [pad_id]
            yield (
                tf.convert_to_tensor(full_input, dtype=tf.int32),
                tf.convert_to_tensor(target, dtype=tf.int32)
            )
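# Optional sanity check: pull a single example from the stream and confirm that the
# (input, target) pair is padded/shifted to exactly max_len tokens each.
_sample_inp, _sample_tgt = next(txt_stream(DATA_PATH))
print("sample shapes:", _sample_inp.shape, _sample_tgt.shape)  # expected: (200,) (200,)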
LIMIT = 23119910  # example count used both as the take() limit and to size one epoch
steps_per_epoch = LIMIT // batch_size
dataset = tf.data.Dataset.from_generator(
    lambda: txt_stream(DATA_PATH),
    output_signature=(
        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
    )
)
dataset = dataset.take(LIMIT).shuffle(2000, seed=SEED).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
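# With MirroredStrategy, the batch built here (batch_size=96) is the *global* batch;
# experimental_distribute_dataset below splits each batch across the replicas, so
# every GPU processes batch_size / num_replicas examples per step.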
with strategy.scope():
    dist_dataset = strategy.experimental_distribute_dataset(dataset)
class SwiGLU(layers.Layer):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.proj = layers.Dense(d_ff)   # projected once, then split into value / gate halves
        self.out = layers.Dense(d_model)

    def call(self, x):
        x_proj = self.proj(x)
        x_val, x_gate = tf.split(x_proj, 2, axis=-1)
        return self.out(x_val * tf.nn.silu(x_gate))
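# Note on widths: proj emits d_ff units that are split into a value half and a gate
# half, so the effective hidden width of this SwiGLU is d_ff // 2. A tiny shape check
# (arbitrary small dims, purely illustrative):
_demo_ffn = SwiGLU(8, 16)
print("SwiGLU demo output shape:", _demo_ffn(tf.zeros((2, 3, 8))).shape)  # (2, 3, 8)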
class MHLA(layers.Layer):
    def __init__(self, embed_dim, num_heads=8, dropout=0.0):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.Wq = layers.Dense(embed_dim, use_bias=False)
        self.Wk = layers.Dense(embed_dim, use_bias=False)
        self.Wv = layers.Dense(embed_dim, use_bias=False)
        self.out = layers.Dense(embed_dim)
        self.dropout = layers.Dropout(dropout)

    def split_heads(self, x):
        # [B, L, D] -> [B, num_heads, L, head_dim]
        B, L, D = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]
        x = tf.reshape(x, (B, L, self.num_heads, self.head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def combine_heads(self, x):
        # [B, num_heads, L, head_dim] -> [B, L, D]
        x = tf.transpose(x, perm=[0, 2, 1, 3])
        B, L, H, D = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2], tf.shape(x)[3]
        return tf.reshape(x, (B, L, H * D))

    def call(self, x, training=False):
        q = tf.nn.elu(self.Wq(x)) + 1
        k = tf.nn.elu(self.Wk(x)) + 1
        v = self.Wv(x)
        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)
        # causal linear attention cumulative sum
        k_cum = tf.cumsum(k, axis=2)
        kv_cum = tf.cumsum(k * v, axis=2)
        z = 1.0 / tf.reduce_sum(q * k_cum, axis=-1, keepdims=True)
        out = (q * kv_cum) * z
        out = self.combine_heads(out)
        out = self.dropout(out, training=training)
        return self.out(out)
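# The call() above is a causal linear-attention variant: with the positive feature map
# phi(x) = elu(x) + 1, each position i computes
#     out_i = phi(q_i) * cumsum_{j<=i}(phi(k_j) * v_j) / (phi(q_i) . cumsum_{j<=i} phi(k_j)),
# where "*" is element-wise. The running cumulative sums replace the explicit L x L
# attention matrix, so cost grows linearly in sequence length, and causality comes from
# the cumsum only covering positions <= i.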
class Lo(layers.Layer):
    def __init__(self, d_model):
        super().__init__()
        self.d = layers.Dense(64, activation='silu', dtype='float16')   # fp16 compute
        self.w = layers.Dense(d_model, dtype='float16')                 # fp16 compute
        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')  # fp32 for stability

    def call(self, x):
        p = self.d(x)
        p = self.w(p)
        p = self.norm(p)  # fp32
        return tf.cast(p, x.dtype) + x  # cast back to the input dtype (fp16) before the residual add
class Block(layers.Layer):
    def __init__(self, d_model):
        super().__init__()
        self.lou = MHLA(d_model, 8)
        self.glu = SwiGLU(d_model, 1048)
        self.lo = Lo(d_model)

    def call(self, x, training=False):
        x = self.lou(x, training=training)
        x = self.glu(x)  # feed-forward stage (was constructed but never called)
        x = self.lo(x)
        return x
class LaSLM(tf.keras.Model):
    def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
        super().__init__()
        self.token_embedding = layers.Embedding(vocab_size, d_model, dtype=policy.compute_dtype)
        self.pos_embedding = layers.Embedding(max_seq_len, d_model, dtype=policy.compute_dtype)
        self.blocks = [Block(d_model) for _ in range(n_layers)]
        self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype='float32')  # final LayerNorm in fp32

    def call(self, x, training=False):
        seq_len = tf.shape(x)[1]
        positions = tf.range(seq_len)[tf.newaxis, :]
        x = self.token_embedding(x) + self.pos_embedding(positions)
        for block in self.blocks:
            x = block(x, training=training)
        x = self.ln_f(x)
        # weight tying: reuse the (transposed) token embedding matrix as the output projection
        embedding_matrix = tf.cast(self.token_embedding.embeddings, x.dtype)
        logits = tf.matmul(x, embedding_matrix, transpose_b=True)
        return tf.cast(logits, tf.float32)  # cast to fp32 for the loss computation
def smoothed_loss_keras(y_true, y_pred, eps=0.1):
    y_true = tf.cast(y_true, tf.int32)
    mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
    vocab = tf.shape(y_pred)[-1]
    y_true_oh = tf.one_hot(y_true, depth=vocab, dtype=tf.float32)
    y_true_ls = (1.0 - eps) * y_true_oh + eps / tf.cast(vocab, tf.float32)
    log_probs = tf.nn.log_softmax(y_pred, axis=-1)
    per_tok = -tf.reduce_sum(y_true_ls * log_probs, axis=-1)
    per_tok = per_tok * mask
    return tf.reduce_sum(per_tok) / (tf.reduce_sum(mask) + 1e-8)

def masked_perplexity(y_true, y_pred, eps=0.1):
    y_true = tf.cast(y_true, tf.int32)
    mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
    vocab = tf.shape(y_pred)[-1]
    y_true_oh = tf.one_hot(y_true, depth=vocab, dtype=tf.float32)
    y_true_ls = (1.0 - eps) * y_true_oh + eps / tf.cast(vocab, tf.float32)
    log_probs = tf.nn.log_softmax(y_pred, axis=-1)
    per_tok = -tf.reduce_sum(y_true_ls * log_probs, axis=-1)
    per_tok = per_tok * mask
    mean_loss = tf.reduce_sum(per_tok) / (tf.reduce_sum(mask) + 1e-8)
    return tf.exp(mean_loss)
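# Worked example of the label smoothing above: with eps=0.1 and a vocabulary of size V,
# the target distribution assigns 0.9 + 0.1/V to the correct token and 0.1/V to every
# other token, and only non-<pad> positions contribute to the masked mean loss. The
# perplexity metric is simply exp() of that same smoothed, masked mean loss.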
with strategy.scope():
    model = LaSLM(vocab_size=vocab_size, max_seq_len=max_len, d_model=384, n_layers=3)
    dummy_input = tf.zeros((batch_size, max_len), dtype=tf.int32)
    _ = model(dummy_input, training=False)  # build the variables with a dummy forward pass
    model.summary()
    optimizer = tf.keras.optimizers.Adam(1e-4, beta_1=0.9, beta_2=0.95, epsilon=1e-8, clipnorm=1.0)
    # Under the mixed_float16 policy, Model.fit handles fp16 loss scaling automatically.
    model.compile(optimizer=optimizer, loss=smoothed_loss_keras, metrics=[masked_perplexity])

# Training
history = model.fit(dist_dataset, epochs=1, steps_per_epoch=steps_per_epoch, verbose=1)
model.save_weights("tf_model.weights.h5")
print("βœ… λͺ¨λΈ κ°€μ€‘μΉ˜ μ €μž₯ μ™„λ£Œ!")
def generate_text_topp(model, prompt, max_len=200, max_gen=500, p=0.9, temperature=0.8, min_len=20):
    # max_len must not exceed the model's max_seq_len (the positional embedding size),
    # so it defaults to the training-time value of 200 rather than 500.
    model_input = text_to_ids(f"<start> {prompt}")
    model_input = model_input[:max_len]
    generated = list(model_input)
    for step in range(max_gen):
        if len(generated) > max_len:
            input_seq = generated[-max_len:]
        else:
            input_seq = generated
        input_padded = np.pad(input_seq, (0, max_len - len(input_seq)), constant_values=pad_id)
        input_tensor = tf.convert_to_tensor([input_padded])
        logits = model(input_tensor, training=False)
        next_token_logits = logits[0, len(input_seq) - 1].numpy()
        next_token_logits[end_id] -= 5.0
        next_token_logits[pad_id] -= 10.0
        probs = tf.nn.softmax(next_token_logits / temperature).numpy()
        sorted_indices = np.argsort(probs)[::-1]
        sorted_probs = probs[sorted_indices]
        cumulative_probs = np.cumsum(sorted_probs)
        cutoff = np.searchsorted(cumulative_probs, p)
        top_indices = sorted_indices[:cutoff + 1]
        top_probs = sorted_probs[:cutoff + 1]
        top_probs /= np.sum(top_probs)
        next_token_id = np.random.choice(top_indices, p=top_probs)
        if next_token_id == end_id and len(generated) >= min_len:
            break
        generated.append(int(next_token_id))
    return ids_to_text(generated)
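# How the sampler above works: the logits for <end> and <pad> are pushed down so the
# model does not stop or emit padding too early, the remaining distribution is
# sharpened by the temperature, and nucleus (top-p) sampling keeps only the smallest
# set of tokens whose cumulative probability reaches p before renormalizing and
# sampling the next token from that set.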
print("\n\n===== 생성 κ²°κ³Ό =====")
print(generate_text_topp(model, "μ§€λ‚œ 2λ…„ λ™μ•ˆ μΆœμ—°μ—°μ΄ κ΅­κ°€κ°€ ν•„μš”ν•œ 연ꡬλ₯Ό", p=0.9))