Yuchan committed on
Commit
77b9d70
·
verified ·
1 Parent(s): 286b189

Update Mo.py

Browse files
Files changed (1) hide show
  1. Mo.py +106 -45
Mo.py CHANGED
@@ -1,23 +1,19 @@
1
- import tensorflow as tf
2
- from tensorflow.keras import layers, Model
3
- import numpy as np
4
- import tensorflow.keras.backend as K
5
- from tensorflow.keras import mixed_precision
6
  import sentencepiece as spm
7
- import os, json
 
8
  import requests
9
- import gradio as gr
 
 
10
 
11
  print('1')
12
-
13
  tf.get_logger().setLevel("ERROR")
14
  SEED = 42
15
  tf.random.set_seed(SEED)
16
  np.random.seed(SEED)
17
- max_len = 512 # 기존 코드에서 200으로 설정됨
18
- batch_size = 128
19
 
20
- # TPU 초기화 (기존 코드와 동일)
21
  try:
22
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
23
  tf.tpu.experimental.initialize_tpu_system(resolver)
@@ -30,15 +26,15 @@ except Exception as e:
30
  strategy = tf.distribute.get_strategy()
31
  on_tpu = False
32
 
33
- # Mixed precision (기존 코드와 동일)
 
34
  policy = mixed_precision.Policy("mixed_bfloat16" if on_tpu else "float32")
35
  mixed_precision.set_global_policy(policy)
36
  print("✅ Mixed precision:", policy)
37
 
38
  # =======================
39
- # 1) 파일 다운로드 및 토크나이저 초기화 (기존 코드와 동일)
40
  # =======================
41
-
42
  def download_file(url, save_path):
43
  r = requests.get(url, stream=True)
44
  r.raise_for_status()
@@ -47,13 +43,13 @@ def download_file(url, save_path):
47
  f.write(chunk)
48
  print(f"✅ {save_path} 저장됨")
49
 
50
- MODEL_PATH = "model.weights.h5"
51
  TOKENIZER_PATH = "ko_unigram.model"
52
 
53
- if not os.path.exists(MODEL_PATH):
54
  download_file(
55
- "https://huggingface.co/Yuchan5386/Model_Prototype/resolve/main/model.weights.h5?download=true",
56
- MODEL_PATH
57
  )
58
 
59
  if not os.path.exists(TOKENIZER_PATH):
@@ -72,12 +68,52 @@ unk_id = sp.piece_to_id("<unk>")
72
  vocab_size = sp.get_piece_size()
73
  print(f"✅ Vocabulary size: {vocab_size}")
74
 
 
 
 
75
def text_to_ids(text):
    """Encode *text* into a list of SentencePiece token ids."""
    token_ids = sp.encode(text, out_type=int)
    return token_ids
77
 
78
def ids_to_text(ids):
    """Decode a list of SentencePiece token ids back into a string."""
    decoded = sp.decode(ids)
    return decoded
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  class SwiGLU(layers.Layer):
82
  def __init__(self, d_model, d_ff):
83
  super().__init__()
@@ -180,24 +216,67 @@ class ReLM(tf.keras.Model):
180
  logits = tf.matmul(x, embedding_matrix, transpose_b=True)
181
  return tf.cast(logits, tf.float32)
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
 
184
  model = ReLM(
185
  vocab_size=vocab_size,
186
  max_seq_len=max_len,
187
  d_model=256,
188
  n_layers=1
189
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  dummy_input = np.zeros((1, max_len), dtype=np.int32)
191
- _ = model(dummy_input)
192
  model.summary()
193
- model.load_weights(MODEL_PATH)
194
- print("모델 가중치 로드 완료!")
195
- # =======================
196
- # 6) 추론 함수 (기존 코드 유지)
197
- # 더미 인풋으로 모델 초기화
198
 
 
199
 
200
- def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperature=0.8, min_len=20):
 
 
 
 
 
201
  model_input = text_to_ids(f"<start> {prompt}")
202
  model_input = model_input[:max_len]
203
  generated = list(model_input)
@@ -226,23 +305,5 @@ def generate_text_topp(model, prompt, max_len=512, max_gen=512, p=0.9, temperatu
226
  generated.append(int(next_token_id))
227
  return ids_to_text(generated)
228
 
229
def gr_generate(prompt, max_len=512, max_gen=512, p=0.8, temperature=0.8):
    """Gradio callback: generate a top-p sampled continuation of *prompt*.

    NOTE(review): `max_gen` is accepted but NOT forwarded to
    generate_text_topp, which therefore uses its own default — confirm
    whether this drop is intentional before wiring it through.
    """
    result = generate_text_topp(
        model, prompt, max_len=max_len, p=p, temperature=temperature
    )
    return result
231
-
232
# Gradio interface definition.
# Bug fix: gr_generate takes (prompt, max_len, max_gen, p, temperature) but
# only four inputs were declared, so Gradio's positional mapping fed the
# Top-p slider into max_gen and the Temperature slider into p, leaving
# temperature at its default. A dedicated "Max new tokens" slider restores
# the correct one-to-one mapping.
iface = gr.Interface(
    fn=gr_generate,
    inputs=[
        gr.Textbox(label="Prompt 입력", placeholder="여기에 문장 입력...", lines=2),
        gr.Slider(20, 512, value=150, step=1, label="Max length"),
        gr.Slider(20, 512, value=150, step=1, label="Max new tokens"),
        gr.Slider(0.1, 1.0, value=0.8, step=0.05, label="Top-p"),
        gr.Slider(0.1, 2.0, value=0.8, step=0.05, label="Temperature")
    ],
    outputs=[
        gr.Textbox(label="생성 결과", lines=10)
    ],
    title="Cuma LM 텍스트 생성",
    description="간단한 Gradio UI로 Cuma 모델 텍스트 생성 테스트"
)

iface.launch()
 
1
+ !pip install sentencepiece
 
 
 
 
2
  import sentencepiece as spm
3
+ import os, json, numpy as np, tensorflow as tf
4
+ from tensorflow.keras import layers, Model
5
  import requests
6
+ from tensorflow import keras
7
+ from tensorflow.keras import layers
8
+ import tensorflow.keras.backend as K
9
 
10
  print('1')
 
11
  tf.get_logger().setLevel("ERROR")
12
  SEED = 42
13
  tf.random.set_seed(SEED)
14
  np.random.seed(SEED)
 
 
15
 
16
+ # TPU 초기화
17
  try:
18
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
19
  tf.tpu.experimental.initialize_tpu_system(resolver)
 
26
  strategy = tf.distribute.get_strategy()
27
  on_tpu = False
28
 
29
+ # Mixed precision
30
+ from tensorflow.keras import mixed_precision
31
  policy = mixed_precision.Policy("mixed_bfloat16" if on_tpu else "float32")
32
  mixed_precision.set_global_policy(policy)
33
  print("✅ Mixed precision:", policy)
34
 
35
  # =======================
36
+ # 1) 파일 다운로드
37
  # =======================
 
38
  def download_file(url, save_path):
39
  r = requests.get(url, stream=True)
40
  r.raise_for_status()
 
43
  f.write(chunk)
44
  print(f"✅ {save_path} 저장됨")
45
 
46
+ DATA_PATH = "corpus.txt"
47
  TOKENIZER_PATH = "ko_unigram.model"
48
 
49
+ if not os.path.exists(DATA_PATH):
50
  download_file(
51
+ "https://huggingface.co/datasets/Yuchan5386/Prototype/resolve/main/corpus_ko.txt?download=true",
52
+ DATA_PATH
53
  )
54
 
55
  if not os.path.exists(TOKENIZER_PATH):
 
68
  vocab_size = sp.get_piece_size()
69
  print(f"✅ Vocabulary size: {vocab_size}")
70
 
71
+ max_len = 512
72
+ batch_size = 128
73
+
74
  def text_to_ids(text):
75
  return sp.encode(text, out_type=int)
76
 
77
  def ids_to_text(ids):
78
  return sp.decode(ids)
79
 
80
def txt_stream(file_path):
    """Yield (input, target) id tensors for each non-empty line of *file_path*.

    Each line is tokenized, truncated to max_len - 1 (reserving one slot for
    the <end> id), terminated with end_id, right-padded with pad_id up to
    max_len, and paired with its next-token-shifted copy as the target.

    NOTE(review): no <start> id is prepended here, while inference prompts
    begin with "<start>" — confirm the intended training format.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            text = raw_line.strip()
            if not text:
                continue

            # Truncate so the trailing <end> token still fits.
            token_ids = text_to_ids(text)[:max_len - 1]
            inputs = token_ids + [end_id]
            inputs += [pad_id] * (max_len - len(inputs))

            # Next-token prediction: shift left by one, pad the tail.
            targets = inputs[1:] + [pad_id]
            yield (
                tf.convert_to_tensor(inputs, dtype=tf.int32),
                tf.convert_to_tensor(targets, dtype=tf.int32),
            )
100
+
101
+
102
+ LIMIT = 500000 # 원하는 만큼
103
+
104
+ dataset = tf.data.Dataset.from_generator(
105
+ lambda: txt_stream(DATA_PATH),
106
+ output_signature=(
107
+ tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
108
+ tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
109
+ )
110
+ )
111
+
112
+ dataset = dataset.take(LIMIT).shuffle(2000, seed=SEED).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
113
+
114
+ with strategy.scope():
115
+ dist_dataset = strategy.experimental_distribute_dataset(dataset)
116
+
117
  class SwiGLU(layers.Layer):
118
  def __init__(self, d_model, d_ff):
119
  super().__init__()
 
216
  logits = tf.matmul(x, embedding_matrix, transpose_b=True)
217
  return tf.cast(logits, tf.float32)
218
 
219
# Per-token cross-entropy; reduction is done manually so padding can be masked.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def masked_loss(y_true, y_pred):
    """Mean sparse cross-entropy over non-padding positions.

    y_true: int tensor of token ids; y_pred: float logits over the vocab.
    Positions equal to pad_id are excluded from the average.
    """
    per_token = loss_fn(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
    # Guard the denominator: an all-padding batch would otherwise produce
    # a 0/0 NaN that poisons the gradients.
    denom = tf.maximum(tf.reduce_sum(mask), 1.0)
    return tf.reduce_sum(per_token * mask) / denom
226
+
227
def masked_perplexity(y_true, y_pred):
    """Perplexity metric: exp of the mean masked cross-entropy.

    Padding positions (pad_id) are excluded; the average loss is clipped at
    10 before exponentiation so the metric cannot overflow early in training.
    """
    per_token = loss_fn(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
    # Guard the denominator: an all-padding batch would otherwise yield NaN.
    avg_loss = tf.reduce_sum(per_token * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)
    return tf.exp(tf.minimum(avg_loss, 10.0))
232
+
233
def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):
    """Build a smooth (non-staircase) exponential-decay LR schedule."""
    schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=initial_lr,
        decay_steps=decay_steps,
        decay_rate=decay_rate,
        staircase=False,  # decay continuously rather than in discrete steps
    )
    return schedule
240
 
241
+ # 모델 생성
242
  model = ReLM(
243
  vocab_size=vocab_size,
244
  max_seq_len=max_len,
245
  d_model=256,
246
  n_layers=1
247
  )
248
+
249
+ # 옵티마이저 설정
250
+ optimizer = tf.keras.optimizers.Adam(
251
+ learning_rate=create_lr_schedule(),
252
+ beta_1=0.9,
253
+ beta_2=0.95,
254
+ epsilon=1e-8,
255
+ clipnorm=1.0
256
+ )
257
+
258
+ # 모델 컴파일
259
+ model.compile(
260
+ optimizer=optimizer,
261
+ loss=masked_loss,
262
+ metrics=[
263
+ masked_perplexity
264
+ ]
265
+ )
266
+
267
+ # 더미 인풋으로 모델 초기화
268
  dummy_input = np.zeros((1, max_len), dtype=np.int32)
269
+ model(dummy_input)
270
  model.summary()
 
 
 
 
 
271
 
272
+ history = model.fit(dataset, epochs=1, verbose=1)
273
 
274
+
275
+ # 가중치 저장
276
+ model.save_weights("model.weights.h5")
277
+ print("모델 가중치 저장 완료!")
278
+
279
+ def generate_text_topp(model, prompt, max_len=150, max_gen=150, p=0.9, temperature=0.8, min_len=20):
280
  model_input = text_to_ids(f"<start> {prompt}")
281
  model_input = model_input[:max_len]
282
  generated = list(model_input)
 
305
  generated.append(int(next_token_id))
306
  return ids_to_text(generated)
307
 
308
+ print("\n\n===== 생성 결과 =====")
309
+ print(generate_text_topp(model, "지난 2년 동안 출연연이 국가가 필요한 연구를", p=0.9))