OpenLab-NLP commited on
Commit
b321543
·
verified ·
1 Parent(s): dba50a9

Create .py

Browse files
Files changed (1) hide show
  1. .py +278 -0
.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sentencepiece as spm
2
+ import os, json, numpy as np, tensorflow as tf
3
+ from tensorflow.keras import layers, Model
4
+ import requests
5
+ from tensorflow.keras import mixed_precision
6
+ import glob
7
+
8
+ tf.get_logger().setLevel("ERROR")
9
+ SEED = 42
10
+ tf.random.set_seed(SEED)
11
+ np.random.seed(SEED)
12
+
13
+ try:
14
+     resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
15
+     tf.tpu.experimental.initialize_tpu_system(resolver)
16
+     strategy = tf.distribute.TPUStrategy(resolver)
17
+     print("✅ TPU 초기화 완료")
18
+     on_tpu = True
19
+
20
+ except Exception as e:
21
+     print("⚠️ TPU 미사용, GPU/CPU로 진행:", e)
22
+     strategy = tf.distribute.get_strategy()
23
+     on_tpu = False
24
+
25
+ policy_name = "mixed_bfloat16" if on_tpu else "float32"
26
+ policy = mixed_precision.Policy(policy_name)
27
+ mixed_precision.set_global_policy(policy)
28
+ print(f"✅ Mixed precision: {policy_name}")
29
+
30
+ def download_file(url, save_path):
31
+     if not os.path.exists(save_path):
32
+         r = requests.get(url, stream=True)
33
+         r.raise_for_status()
34
+         with open(save_path, "wb") as f:
35
+             for chunk in r.iter_content(8192*2):
36
+                 f.write(chunk)
37
+         print(f"✅ {save_path} 저장됨")
38
+
39
+ TOKENIZER_PATH = "tokenizer.model"
40
+ download_file("https://huggingface.co/datasets/OpenLab-NLP/tiny-corpus/resolve/main/tokenizer.model?download=true", TOKENIZER_PATH)
41
+ DATA_DIR = "/kaggle/input/lm-pretrain"
42
+ FILE_PATTERN = os.path.join(DATA_DIR, "tokenized_variable_part_*.txt")
43
+ sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
44
+ pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
45
+ end_id = sp.piece_to_id("[EOS]")
46
+ vocab_size = sp.get_piece_size()
47
+
48
+ max_len = 512
49
+ batch_size = 768
50
+
51
+ import random
52
+
53
+ def prepare_packed_dataset(file_pattern, max_len, batch_size):
54
+     file_list = tf.io.gfile.glob(file_pattern)
55
+    
56
+     # [수정] 파일 경로 리스트 자체를 무작위로 섞습니다.
57
+     # 이렇게 하면 매 에포크마다 파일을 읽는 순서가 달라집니다.
58
+     random.shuffle(file_list)
59
+     print(f"🔄 파일 로드 순서 섞기 완료 (첫 번째 파일: {file_list[0]})")
60
+
61
+     dataset = tf.data.TextLineDataset(file_list)
62
+    
63
+     def parse_tokens(line):
64
+         return tf.strings.to_number(tf.strings.split(line), tf.int32)
65
+
66
+     dataset = dataset.map(parse_tokens, num_parallel_calls=tf.data.AUTOTUNE)
67
+     dataset = dataset.unbatch()
68
+     dataset = dataset.batch(max_len + 1, drop_remainder=True)
69
+    
70
+     def split_input_target(chunk):
71
+         return chunk[:-1], chunk[1:]
72
+
73
+     dataset = dataset.map(split_input_target, num_parallel_calls=tf.data.AUTOTUNE)
74
+     dataset = dataset.shuffle(20000)
75
+     dataset = dataset.batch(batch_size, drop_remainder=True)  
76
+     return dataset.prefetch(tf.data.AUTOTUNE)
77
+
78
+ with strategy.scope():
79
+     dataset = prepare_packed_dataset(FILE_PATTERN, max_len, batch_size)
80
+     dist_dataset = strategy.experimental_distribute_dataset(dataset)
81
+     print("✅ 데이터 패킹 및 TPU 분산 파이프라인 준비 완료")
82
+
83
+ class TimeMix(layers.Layer):
84
+     def __init__(self, d_model, layer_id, n_layers):
85
+         super().__init__()
86
+         self.d_model = d_model
87
+         ratio = (layer_id / (n_layers - 1)) if n_layers > 1 else 0.5
88
+         decay_speed = np.arange(d_model)
89
+         self.time_decay = tf.Variable(
90
+             -5 + 8 * (decay_speed / (d_model - 1)) ** (0.7 + 1.3 * ratio),
91
+             dtype=tf.float32, name="time_decay"
92
+         )
93
+         self.time_first = tf.Variable(
94
+             np.ones(d_model) * np.log(0.3),
95
+             dtype=tf.float32, name="time_first"
96
+         )
97
+         self.time_mix_k = tf.Variable(1 - (ratio ** 0.5), dtype=tf.float32)
98
+         self.time_mix_v = tf.Variable(1 - (ratio ** 0.5), dtype=tf.float32)
99
+         self.time_mix_r = tf.Variable(1 - (ratio ** 0.2), dtype=tf.float32)
100
+         self.key = layers.Dense(d_model, use_bias=False)
101
+         self.value = layers.Dense(d_model, use_bias=False)
102
+         self.receptance = layers.Dense(d_model, use_bias=False)
103
+         self.output_projection = layers.Dense(d_model, use_bias=False)
104
+     def call(self, x, training=False):
105
+         t_type = x.dtype
106
+         tm_k = tf.cast(self.time_mix_k, t_type)
107
+         tm_v = tf.cast(self.time_mix_v, t_type)
108
+         tm_r = tf.cast(self.time_mix_r, t_type)
109
+         xx = tf.pad(x[:, :-1, :], [[0, 0], [1, 0], [0, 0]])
110
+         k = self.key(x * tm_k + xx * (1 - tm_k))
111
+         v = self.value(x * tm_v + xx * (1 - tm_v))
112
+         r = self.receptance(x * tm_r + xx * (1 - tm_r))
113
+         wkv = self.parallel_wkv(k, v)
114
+         return self.output_projection(tf.nn.sigmoid(r) * wkv)
115
+
116
+     def parallel_wkv(self, k, v):
117
+         t_type = k.dtype
118
+         w = tf.cast(tf.exp(self.time_decay), t_type)
119
+         u = tf.cast(self.time_first, t_type)
120
+         t = tf.shape(k)[1]
121
+         t_index = tf.cast(tf.range(t), t_type)[:, None]
122
+         s = k - (t_index * w)
123
+         kv = tf.exp(s) * v
124
+         k_exp = tf.exp(s)
125
+         num = tf.cumsum(kv, axis=1) - kv + tf.exp(s + u) * v
126
+         den = tf.cumsum(k_exp, axis=1) - k_exp + tf.exp(s + u)
127
+         return num / (den + 1e-8)
128
+
129
+ class ChannelMix(layers.Layer):
130
+ def __init__(self, d_model, layer_id, n_layers, num_experts=8, top_k=2):
131
+ super().__init__()
132
+ self.d_model = d_model
133
+ self.num_experts = num_experts
134
+ self.top_k = top_k
135
+
136
+ ratio = (layer_id / (n_layers - 1)) if n_layers > 1 else 0.5
137
+ self.time_mix_k = tf.Variable(1 - (ratio ** 0.5), dtype=tf.float32)
138
+ self.time_mix_r = tf.Variable(1 - (ratio ** 0.5), dtype=tf.float32)
139
+
140
+ # Gate: 전문가 선택을 위한 로직
141
+ self.gate = layers.Dense(num_experts, use_bias=False)
142
+
143
+ # Experts 가중치
144
+ self.key_weight = self.add_weight(
145
+ name="expert_key",
146
+ shape=(num_experts, d_model, int(d_model * 4)),
147
+ initializer="glorot_uniform"
148
+ )
149
+ self.value_weight = self.add_weight(
150
+ name="expert_value",
151
+ shape=(num_experts, int(d_model * 4), d_model),
152
+ initializer="glorot_uniform"
153
+ )
154
+ self.receptance = layers.Dense(d_model, use_bias=False)
155
+
156
+ def call(self, x, training=False):
157
+ t_type = x.dtype
158
+ b, t, d = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]
159
+ xx = tf.pad(x[:, :-1, :], [[0, 0], [1, 0], [0, 0]])
160
+
161
+ k_in = x * tf.cast(self.time_mix_k, t_type) + xx * (1 - tf.cast(self.time_mix_k, t_type))
162
+
163
+ # 1. Gate Logits 및 Top-K 선택
164
+ gate_logits = self.gate(k_in) # (B, T, num_experts)
165
+
166
+ # Top-K 전문가와 가중치 추출
167
+ raw_weights, indices = tf.math.top_k(gate_logits, k=self.top_k)
168
+ gate_weights = tf.nn.softmax(tf.cast(raw_weights, tf.float32))
169
+ gate_weights = tf.cast(gate_weights, t_type) # (B, T, top_k)
170
+
171
+ # 2. Sparse 연산을 위한 Mask 생성 (수치적 안정성 및 로드 밸런싱용)
172
+ # 실제로는 모든 전문가를 다 계산한 뒤 Masking하는 방식이 TPU MXU 활용에 유리할 수 있음
173
+ # 여기서는 einsum을 활용하되 선택된 전문가의 영향력만 남김
174
+ masks = tf.one_hot(indices, depth=self.num_experts, dtype=t_type) # (B, T, top_k, num_experts)
175
+ final_mask = tf.reduce_sum(masks * tf.expand_dims(gate_weights, -1), axis=2) # (B, T, num_experts)
176
+
177
+ # 3. Auxiliary Loss (전문가 균등 분배)
178
+ if training:
179
+ # Load Balancing Loss: gate_logits의 확률 분포가 균등하도록 유도
180
+ prob_dist = tf.nn.softmax(tf.cast(gate_logits, tf.float32), axis=-1)
181
+ importance = tf.reduce_sum(prob_dist, axis=[0, 1])
182
+ load = tf.reduce_sum(tf.cast(final_mask > 0, tf.float32), axis=[0, 1])
183
+ aux_loss = tf.reduce_sum(importance * load) * (self.num_experts / (tf.cast(b * t, tf.float32) ** 2))
184
+ self.add_loss(0.01 * aux_loss)
185
+
186
+ # 4. 전문가 연산 (Einsum 활용)
187
+ # 모든 전문가를 계산하되, mask를 통해 필요한 정보만 남김 (Sparse Approximation)
188
+ k_experts = tf.einsum('btd,edh->bteh', k_in, self.key_weight)
189
+ k_experts = tf.square(tf.nn.relu(k_experts))
190
+ v_experts = tf.einsum('bteh,ehd->bted', k_experts, self.value_weight) # (B, T, E, D)
191
+
192
+ # 5. 가중 합산 (최종 선택된 전문가의 결과만 결합)
193
+ kv = tf.reduce_sum(v_experts * tf.expand_dims(final_mask, -1), axis=2)
194
+
195
+ # Receptance (Gate) 연산
196
+ r_in = x * tf.cast(self.time_mix_r, t_type) + xx * (1 - tf.cast(self.time_mix_r, t_type))
197
+ r = self.receptance(r_in)
198
+
199
+ return tf.nn.sigmoid(r) * kv
200
+
201
+ class Block(layers.Layer):
202
+     def __init__(self, d_model, layer_id, n_layers):
203
+         super().__init__()
204
+         self.ln = layers.LayerNormalization(epsilon=1e-5)
205
+         self.time_mix = TimeMix(d_model, layer_id, n_layers)
206
+         self.channel_mix = ChannelMix(d_model, layer_id, n_layers)
207
+     def call(self, x, training=False):
208
+         ln_x = self.ln(x)
209
+         return x + self.time_mix(ln_x, training=training) + self.channel_mix(ln_x)
210
+
211
+ class Head(tf.keras.Model):
212
+     def __init__(self, vocab_size):
213
+         super().__init__()
214
+         self.lm_head = layers.Dense(vocab_size, use_bias=False, name="output_head", dtype=policy)
215
+     def call(self, x, training=False):
216
+         logits = self.lm_head(x)
217
+         return tf.cast(logits, tf.float32)
218
+    
219
+ class LM(tf.keras.Model):
220
+     def __init__(self, d_model, n_layers, dropout_rate=0.1):
221
+         super().__init__()
222
+         self.token_embedding = layers.Embedding(vocab_size, d_model)
223
+         self.blocks = [Block(d_model, i, n_layers) for i in range(n_layers)]
224
+         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
225
+     def call(self, x, training=False):
226
+         x = self.token_embedding(x)
227
+         for block in self.blocks:
228
+             x = block(x, training=training)      
229
+         x = tf.cast(x, tf.float32)
230
+         x = self.ln_f(x)
231
+         return x
232
+
233
+ def smoothed_loss_keras(y_true, y_pred, eps=0.1):
234
+     y_true = tf.cast(y_true, tf.int32)
235
+     mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
236
+     vocab = tf.shape(y_pred)[-1]
237
+     y_true_oh = tf.one_hot(y_true, depth=vocab, dtype=tf.float32)
238
+     y_true_ls = (1.0 - eps) * y_true_oh + eps / tf.cast(vocab, tf.float32)
239
+     log_probs = tf.nn.log_softmax(y_pred, axis=-1)
240
+     per_tok = -tf.reduce_sum(y_true_ls * log_probs, axis=-1)
241
+     return tf.reduce_sum(per_tok * mask) / (tf.reduce_sum(mask) + 1e-8)
242
+
243
+ with strategy.scope():
244
+     blocklm = LM(d_model=512, n_layers=16)
245
+     head = Head(vocab_size=vocab_size)
246
+
247
+     inputs = layers.Input(shape=(max_len,), dtype=tf.int32)
248
+     x = blocklm(inputs)
249
+     outputs = head(x)
250
+     model = tf.keras.Model(inputs=inputs, outputs=outputs)
251
+     optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-4, weight_decay=0.01)
252
+     model.compile(optimizer=optimizer, loss=smoothed_loss_keras)
253
+     dummy_input = np.zeros((1, max_len), dtype=np.int32)
254
+     model(dummy_input)
255
+     model.summary()
256
+
257
+ def get_training_stats(file_pattern, max_len, batch_size):
258
+     total_tokens = 0
259
+     files = glob.glob(file_pattern)
260
+     for f in files:
261
+         with open(f, 'r') as file:
262
+             for line in file:
263
+                 total_tokens += len(line.split())
264
+     total_chunks = total_tokens // (max_len + 1)
265
+     steps_per_epoch = total_chunks // batch_size  
266
+     return total_tokens, total_chunks, steps_per_epoch
267
+
268
+ #total_tokens, total_chunks, steps_per_epoch = get_training_stats(FILE_PATTERN, max_len, batch_size)
269
+
270
+ #print(f"✅ 총 토큰 수: {total_tokens}")
271
+ #print(f"✅ 생성된 총 덩어리(Chunk) 수: {total_chunks}")
272
+ #print(f"✅ steps_per_epoch: {steps_per_epoch}")
273
+
274
+ model.fit(dist_dataset, epochs=1, steps_per_epoch=14582)
275
+ blocklm.save_weights("blocklm.weights.h5")
276
+ head.save_weights("head.weights.h5")
277
+
278
+ print("저장됨")