OpenLab-NLP
/

model-prototype

Model card Files Files and versions

xet

Community

Yuchan committed on Nov 22, 2025

Commit

f82693c

verified ·

1 Parent(s): 411d64d

Update AlphaS2S.py

Browse files

Files changed (1) hide show

AlphaS2S.py +144 -0

AlphaS2S.py CHANGED Viewed

@@ -1,6 +1,150 @@
 import tensorflow as tf
 from tensorflow.keras import layers, Model
 class SwiGLU(layers.Layer):
     def __init__(self, d_model, d_ff):
         super().__init__()

 import tensorflow as tf
 from tensorflow.keras import layers, Model
+!pip install sentencepiece
+import sentencepiece as spm
+import os, json, numpy as np, tensorflow as tf
+from tensorflow.keras import layers, Model
+import requests
+from tensorflow import keras
+from tensorflow.keras import layers
+import tensorflow.keras.backend as K
+# Runtime setup: logging level, RNG seeding, accelerator selection and the
+# global Keras dtype policy. The statements below run in order at import time.
+# Presumably a leftover progress/debug marker.
+print('1')
+# Only let TensorFlow log records of level ERROR or higher through.
+tf.get_logger().setLevel("ERROR")
+# Seed TensorFlow and NumPy with the same fixed value.
+SEED = 42
+tf.random.set_seed(SEED)
+np.random.seed(SEED)
+# TPU initialization
+# Try to attach to a local TPU. Any exception falls back to the default
+# distribution strategy; `on_tpu` records which path was taken.
+try:
+    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
+    tf.tpu.experimental.initialize_tpu_system(resolver)
+    strategy = tf.distribute.TPUStrategy(resolver)
+    print("✅ TPU 초기화 완료:", resolver.cluster_spec().as_dict())
+    on_tpu = True
+except Exception as e:
+    print("⚠️ TPU 미사용, GPU/CPU로 진행:", e)
+    strategy = tf.distribute.get_strategy()
+    on_tpu = False
+# Mixed precision
+# Use the bfloat16 mixed policy only when a TPU was initialized; otherwise
+# keep plain float32.
+from tensorflow.keras import mixed_precision
+policy = mixed_precision.Policy("mixed_bfloat16" if on_tpu else "float32")
+mixed_precision.set_global_policy(policy)
+print("✅ Mixed precision:", policy)
+# =======================
+# 1) 파일 다운로드
+# =======================
+def download_file(url, save_path):
+    r = requests.get(url, stream=True)
+    r.raise_for_status()
+    with open(save_path, "wb") as f:
+        for chunk in r.iter_content(8192*2):
+            f.write(chunk)
+    print(f"✅ {save_path} 저장됨")
+# Local file names for the training data and the SentencePiece model.
+DATA_PATH = "converted.jsonl"
+TOKENIZER_PATH = "ko_unigram.model"
+# Fetch each file only when it is not already present on disk.
+if not os.path.exists(DATA_PATH):
+    download_file(
+        "https://huggingface.co/datasets/Yuchan5386/SFT/resolve/main/data_shuffled_1.jsonl?download=true",
+        DATA_PATH
+    )
+if not os.path.exists(TOKENIZER_PATH):
+    download_file(
+        "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
+        TOKENIZER_PATH
+    )
+# Load the tokenizer and look up the ids of the special pieces.
+sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
+# Falls back to id 0 when the "<pad>" lookup yields -1.
+# NOTE(review): SentencePiece may return the <unk> id rather than -1 for a
+# piece that is not in the vocabulary — confirm this fallback can trigger.
+pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
+start_id = sp.piece_to_id("<start>")
+sep_id = sp.piece_to_id("<sep>")
+end_id = sp.piece_to_id("<end>")
+unk_id = sp.piece_to_id("<unk>")
+vocab_size = sp.get_piece_size()
+print(f"✅ Vocabulary size: {vocab_size}")
+# Fixed per-example sequence length (in token ids) and the batch size used
+# when the dataset is batched.
+max_len = 200
+batch_size = 128
+def text_to_ids(text):
+    return sp.encode(text, out_type=int)
+def ids_to_text(ids):
+    return sp.decode(ids)
+def jsonl_stream(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        for line in f:
+            data = json.loads(line)
+            conversations = data.get("conversations", [])
+            for i in range(0, len(conversations) - 1, 2):
+                human_msg = conversations[i]
+                gpt_msg   = conversations[i + 1]
+                if human_msg.get("from") != "human" or gpt_msg.get("from") != "gpt":
+                    continue
+                prompt   = human_msg.get("value", "").strip()
+                response = gpt_msg.get("value", "").strip()
+                full = f"<start> {prompt} <sep> {response} <end>"
+                if "<sep>" not in full:
+                    continue
+                sep_index  = full.index("<sep>")
+                input_text = full[:sep_index + len("<sep>")].strip()
+                target_text = full[sep_index + len("<sep>"):].strip()
+                input_ids  = text_to_ids(input_text)
+                target_ids = text_to_ids(target_text + " <end>")
+                available_len = max_len - len(input_ids)
+                if available_len <= 0:
+                    input_ids = input_ids[-max_len:]
+                    target_ids = []
+                    target_mask = [0] * len(input_ids)
+                else:
+                    target_ids = target_ids[:available_len]
+                    target_mask = [0] * len(input_ids) + [1] * len(target_ids)
+                full_input = input_ids + target_ids
+                pad_len = max_len - len(full_input)
+                full_input += [pad_id] * pad_len
+                target_mask += [0] * pad_len
+                target_seq = full_input[1:] + [end_id]
+                target_seq = target_seq[:max_len]
+                masked_target = [
+                    t if m == 1 else pad_id
+                    for t, m in zip(target_seq, target_mask)
+                ]
+                yield (
+                    tf.convert_to_tensor(full_input, dtype=tf.int32),
+                    tf.convert_to_tensor(masked_target, dtype=tf.int32)
+                )
+# Wrap the Python generator as a tf.data pipeline. Each element is a pair of
+# int32 vectors of length max_len, as declared in output_signature.
+dataset = tf.data.Dataset.from_generator(
+    lambda: jsonl_stream(DATA_PATH),
+    output_signature=(
+        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
+        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),
+    ),
+)
+# Shuffle through a 1000-element buffer, form fixed-size batches (the final
+# partial batch is dropped), and prefetch.
+dataset = dataset.shuffle(1000, seed=SEED).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
+# Hand the batched dataset to the active distribution strategy.
+with strategy.scope():
+    dist_dataset = strategy.experimental_distribute_dataset(dataset)
 class SwiGLU(layers.Layer):
     def __init__(self, d_model, d_ff):
         super().__init__()