import tensorflow as tf from tensorflow.keras import layers, Model !pip install sentencepiece import sentencepiece as spm import os, json, numpy as np, tensorflow as tf from tensorflow.keras import layers, Model import requests from tensorflow import keras from tensorflow.keras import layers import tensorflow.keras.backend as K print('1') tf.get_logger().setLevel("ERROR") SEED = 42 tf.random.set_seed(SEED) np.random.seed(SEED) # TPU 초기화 try: resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local") tf.tpu.experimental.initialize_tpu_system(resolver) strategy = tf.distribute.TPUStrategy(resolver) print("✅ TPU 초기화 완료:", resolver.cluster_spec().as_dict()) on_tpu = True except Exception as e: print("⚠️ TPU 미사용, GPU/CPU로 진행:", e) strategy = tf.distribute.get_strategy() on_tpu = False # Mixed precision from tensorflow.keras import mixed_precision policy = mixed_precision.Policy("mixed_bfloat16" if on_tpu else "float32") mixed_precision.set_global_policy(policy) print("✅ Mixed precision:", policy) # ======================= # 1) 파일 다운로드 # ======================= def download_file(url, save_path): r = requests.get(url, stream=True) r.raise_for_status() with open(save_path, "wb") as f: for chunk in r.iter_content(8192*2): f.write(chunk) print(f"✅ {save_path} 저장됨") DATA_PATH = "converted.jsonl" TOKENIZER_PATH = "ko_unigram.model" if not os.path.exists(DATA_PATH): download_file( "https://huggingface.co/datasets/Yuchan5386/SFT/resolve/main/data_shuffled_1.jsonl?download=true", DATA_PATH ) if not os.path.exists(TOKENIZER_PATH): download_file( "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true", TOKENIZER_PATH ) sp = spm.SentencePieceProcessor(TOKENIZER_PATH) pad_id = sp.piece_to_id("") if sp.piece_to_id("") != -1 else 0 start_id = sp.piece_to_id("") sep_id = sp.piece_to_id("") end_id = sp.piece_to_id("") unk_id = sp.piece_to_id("") vocab_size = sp.get_piece_size() print(f"✅ Vocabulary size: {vocab_size}") max_len = 200 
batch_size = 128


def text_to_ids(text):
    """Encode ``text`` into a list of integer piece ids with the tokenizer."""
    return sp.encode(text, out_type=int)


def ids_to_text(ids):
    """Decode a sequence of piece ids back into text with the tokenizer."""
    return sp.decode(ids)


def jsonl_stream(file_path):
    """Yield ``(full_input, masked_target)`` int32 tensors of length ``max_len``.

    Each line of ``file_path`` is parsed as a JSON object whose
    ``"conversations"`` list is walked in consecutive pairs.  Only pairs
    whose first turn has ``from == "human"`` and whose second has
    ``from == "gpt"`` produce an example.  ``full_input`` is prompt ids
    followed by response ids, padded with ``pad_id``; ``masked_target`` is
    the input shifted left by one with every position outside the response
    mask replaced by ``pad_id``.

    Blank lines are skipped; a non-blank line that is not valid JSON still
    raises ``json.JSONDecodeError``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines in the JSONL file
            data = json.loads(line)
            conversations = data.get("conversations", [])

            # Turns 0-1, 2-3, ...; a dangling last turn is ignored.
            for human_msg, gpt_msg in zip(conversations[0::2], conversations[1::2]):
                if human_msg.get("from") != "human" or gpt_msg.get("from") != "gpt":
                    continue

                prompt = human_msg.get("value", "").strip()
                response = gpt_msg.get("value", "").strip()

                # NOTE(review): the marker literals below are empty strings,
                # so the "missing separator" guard never fires and the split
                # always happens at index 0.  Presumably the special-token
                # names were lost from this file -- confirm and restore.
                full = f" {prompt} {response} "
                if "" not in full:
                    continue
                sep_index = full.index("")
                input_text = full[:sep_index + len("")].strip()
                target_text = full[sep_index + len(""):].strip()

                input_ids = text_to_ids(input_text)
                target_ids = text_to_ids(target_text + " ")

                available_len = max_len - len(input_ids)
                if available_len <= 0:
                    # Prompt alone fills the window: keep its tail, no target.
                    input_ids = input_ids[-max_len:]
                    target_ids = []
                    target_mask = [0] * len(input_ids)
                else:
                    target_ids = target_ids[:available_len]
                    target_mask = [0] * len(input_ids) + [1] * len(target_ids)

                full_input = input_ids + target_ids
                pad_len = max_len - len(full_input)
                full_input += [pad_id] * pad_len
                target_mask += [0] * pad_len

                # Next-token targets: the input shifted left by one position.
                target_seq = full_input[1:] + [end_id]
                target_seq = target_seq[:max_len]

                # NOTE(review): target_mask is aligned with full_input while
                # target_seq is shifted by one, so the first response token
                # never appears as a label and the last masked position
                # points one token past the response.  Confirm whether the
                # mask should be shifted as well.
                masked_target = [
                    t if m == 1 else pad_id
                    for t, m in zip(target_seq, target_mask)
                ]

                yield (
                    tf.convert_to_tensor(full_input, dtype=tf.int32),
                    tf.convert_to_tensor(masked_target, dtype=tf.int32),
                )


# Streaming input pipeline: generator -> shuffle buffer -> fixed-size batches.
dataset = tf.data.Dataset.from_generator(
    lambda: jsonl_stream(DATA_PATH),
    output_signature=(
        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
    ),
)
dataset = (
    dataset.shuffle(1000, seed=SEED)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

with strategy.scope():
    dist_dataset = strategy.experimental_distribute_dataset(dataset)


class SwiGLU(layers.Layer):
    """Gated feed-forward block.

    Projects to ``d_ff`` features, splits them into two equal halves along
    the last axis, multiplies one half by the SiLU of the other, and
    projects the product to ``d_model``.  ``d_ff`` must be even for the
    split to succeed.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.proj = layers.Dense(d_ff)
        self.out = layers.Dense(d_model)

    def call(self, x):
        x_proj = self.proj(x)
        x_val, x_gate = tf.split(x_proj, 2, axis=-1)
        return self.out(x_val * tf.nn.silu(x_gate))


class CrossBlock(layers.Layer):
    """Blend two tensors with a learned sigmoid gate computed from the first.

    Returns ``a * x + (1 - a) * z`` where ``a = sigmoid(Dense(1)(x))``.
    """

    def __init__(self):
        # Run in float32 like the gate layer: under a mixed-precision policy
        # the float32 gate could not be multiplied with lower-precision
        # inputs (TensorFlow does not promote mismatched float dtypes).
        super().__init__(dtype='float32')
        self.alpha = layers.Dense(1, activation='sigmoid', dtype='float32')

    def call(self, x, z):
        # Both operands must share the gate's dtype; these casts are no-ops
        # when the inputs are already float32.
        x = tf.cast(x, tf.float32)
        z = tf.cast(z, tf.float32)
        a = self.alpha(x)
        y = a * x + (1.0 - a) * z
        return y


class EncoderBlock(layers.Layer):
    """Post-norm encoder block: self-attention then SwiGLU, each with a
    dropout, a residual connection and a LayerNormalization."""

    def __init__(self, d_model, num_heads, dff, dropout=0.1):
        super().__init__()
        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        # NOTE(review): ``dff`` is accepted but not used; the feed-forward
        # width is fixed at 512.  Confirm whether ``dff`` should be passed.
        self.ffn = SwiGLU(d_model, 512)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, x, mask=None, training=False):
        attn_out = self.dropout1(self.mha(x, x, x, attention_mask=mask), training=training)
        out1 = self.norm1(x + attn_out)
        ffn_out = self.dropout2(self.ffn(out1), training=training)
        return self.norm2(out1 + ffn_out)


class LoU(layers.Layer):
    """Gated mixing block that replaces attention with a running average.

    Element-wise gates built from the Q and K projections are smoothed along
    axis 1 with an input-dependent exponential moving average, normalised by
    their mean over the last axis, clipped, and used to scale V.  The result
    gets a residual connection and LayerNormalization, is blended with ``z``
    through ``CrossBlock``, and goes through a SwiGLU.  The internal maths
    runs in float32; the output is cast back to the dtype of ``x``.
    """

    def __init__(self, d_model, clip_value=5.0, eps=1e-6):
        super().__init__()
        self.d_model = d_model
        self.clip_value = float(clip_value)
        self.eps = float(eps)

        self.Q = layers.Dense(d_model, dtype='float32')
        self.K = layers.Dense(d_model, dtype='float32')
        self.V = layers.Dense(d_model, dtype='float32')
        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
        self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')

        self.alpha_linear = layers.Dense(1, activation='sigmoid', dtype='float32')
        self.cross = CrossBlock()
        self.glu = SwiGLU(d_model, 512)

    def _ema_over_time(self, score, alpha_dynamic):
        """Return the running average of ``score`` along axis 1.

        Both arguments are rank-3 tensors.  The first step is passed through
        unchanged; every later step is
        ``alpha_t * x_t + (1 - alpha_t) * previous``, so the smoothing
        factor of the first step is not used.
        """
        # Move axis 1 to the front because tf.scan iterates over axis 0.
        seq = tf.transpose(score, perm=[1, 0, 2])
        alpha_seq = tf.transpose(alpha_dynamic, perm=[1, 0, 2])

        def step(prev_ema, inputs):
            x_t, alpha_t = inputs
            return alpha_t * x_t + (1.0 - alpha_t) * prev_ema

        init = seq[0]
        elems = (seq[1:], alpha_seq[1:])
        ema_seq = tf.scan(fn=step, elems=elems, initializer=init)
        ema_seq = tf.concat([tf.expand_dims(init, 0), ema_seq], axis=0)
        return tf.transpose(ema_seq, perm=[1, 0, 2])

    def call(self, x, z, training=False):
        """Apply the block to ``x`` and blend the result with ``z``.

        ``training`` is accepted so the layer can be called like the other
        blocks in the model; nothing in this layer depends on it.
        """
        x_f32 = tf.cast(x, tf.float32)
        residual = x_f32
        x_f32 = self.norm1(x_f32)

        q = self.Q(x_f32)
        k = self.K(x_f32)
        V = self.V(x_f32)

        # Gates in (0, 1) from a rescaled tanh; an earlier revision of this
        # code used tf.nn.sigmoid(q) and tf.nn.sigmoid(k) here.
        g_q = (tf.nn.tanh(q) + 1.0) / 2.0
        g_k = (tf.nn.tanh(k) + 1.0) / 2.0
        score = g_q * g_k

        alpha_dynamic = self.alpha_linear(x_f32)
        score_ema = self._ema_over_time(score, alpha_dynamic)

        # Normalise by the mean over the last axis, guarding against
        # division by a value below ``eps``.
        mean_last = tf.reduce_mean(score_ema, axis=-1, keepdims=True)
        denom = tf.maximum(mean_last, self.eps)
        score_norm = score_ema / denom
        score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)

        x_comb = score_clipped * V
        out = self.norm(x_comb + residual)
        out = self.cross(out, z)
        out = self.glu(out)
        return tf.cast(out, x.dtype)


class Transformer(tf.keras.Model):
    """Encoder/decoder model: ``EncoderBlock`` stack feeding a ``LoU`` stack.

    ``call`` expects a dict with ``"enc_inputs"`` and ``"dec_inputs"`` id
    tensors and returns the output of the final Dense layer, which has
    ``target_vocab_size`` units.  Both inputs get learned positional
    embeddings with ``max_len`` entries, so sequences must not be longer
    than ``max_len``.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, max_len=100, dropout=0.1):
        super().__init__()
        self.max_len = max_len
        self.d_model = d_model

        self.enc_embedding = layers.Embedding(input_vocab_size, d_model)
        self.enc_pos_embedding = layers.Embedding(max_len, d_model)
        self.dec_embedding = layers.Embedding(target_vocab_size, d_model)
        self.dec_pos_embedding = layers.Embedding(max_len, d_model)

        self.enc_layers = [EncoderBlock(d_model, num_heads, dff, dropout) for _ in range(num_layers)]
        self.dec_layers = [LoU(d_model) for _ in range(num_layers)]

        self.final_layer = layers.Dense(target_vocab_size)

    def call(self, inputs, training=False):
        enc_inputs = inputs["enc_inputs"]
        dec_inputs = inputs["dec_inputs"]

        # Position indices 0..length-1 with a leading axis for broadcasting.
        enc_pos = tf.range(tf.shape(enc_inputs)[1])[tf.newaxis, :]
        dec_pos = tf.range(tf.shape(dec_inputs)[1])[tf.newaxis, :]

        # The encoder blocks are called without an attention mask.
        x = self.enc_embedding(enc_inputs) + self.enc_pos_embedding(enc_pos)
        for layer in self.enc_layers:
            x = layer(x, training=training)
        enc_out = x

        y = self.dec_embedding(dec_inputs) + self.dec_pos_embedding(dec_pos)
        for layer in self.dec_layers:
            y = layer(y, enc_out, training=training)

        return self.final_layer(y)