Yuchan committed
Update AlphaS2S.py

AlphaS2S.py  +24 -14  CHANGED
@@ -62,24 +62,36 @@ if not os.path.exists(TOKENIZER_PATH):
     )

 sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
+sp_en = spm.SentencePieceProcessor(TOKENIZER_PATH1)

 pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
-start_id = sp.piece_to_id("<sos>")
-
-
-user_s_id = sp.piece_to_id("<user>")
-user_e_id = sp.piece_to_id("</user>")
-end_id = sp.piece_to_id("<eos>")
+start_id = sp.piece_to_id("<start>")
+sep_id = sp.piece_to_id("<sep>")
+end_id = sp.piece_to_id("<end>")
 unk_id = sp.piece_to_id("<unk>")
 vocab_size = sp.get_piece_size()
 print(f"✅ Vocabulary size: {vocab_size}")

+epad_id = sp_en.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
+estart_id = sp_en.piece_to_id("<start>")
+esep_id = sp_en.piece_to_id("<sep>")
+eend_id = sp_en.piece_to_id("<end>")
+eunk_id = sp_en.piece_to_id("<unk>")
+evocab_size = sp_en.get_piece_size()
+print(f"✅ Vocabulary size: {evocab_size}")
+
 def text_to_ids(text):
     return sp.encode(text, out_type=int)

 def ids_to_text(ids):
     return sp.decode(ids)

+def etext_to_ids(text):
+    return sp_en.encode(text, out_type=int)
+
+def eids_to_text(ids):
+    return sp_en.decode(ids)
+
 # =======================
 # JSONL → TF Dataset loading (ID-level special tokens included)
 # =======================
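A minimal sketch of what this hunk sets up: two independent SentencePiece processors, a Korean one (sp) for the encoder side and an English one (sp_en) for the decoder side, each resolving its own special-token IDs. The file names ko.model / en.model and the sample text are illustrative assumptions standing in for TOKENIZER_PATH / TOKENIZER_PATH1, not the repo's actual paths.

import sentencepiece as spm

# Hypothetical model files; substitute the real TOKENIZER_PATH / TOKENIZER_PATH1.
sp = spm.SentencePieceProcessor("ko.model")     # source-side (Korean) tokenizer
sp_en = spm.SentencePieceProcessor("en.model")  # target-side (English) tokenizer

# Each tokenizer maps its special pieces to its own (possibly different) IDs,
# which is why the diff keeps a separate e-prefixed set for the English side.
for piece in ("<pad>", "<start>", "<sep>", "<end>"):
    print(piece, sp.piece_to_id(piece), sp_en.piece_to_id(piece))

# Round trip: decode(encode(x)) reproduces x for plain in-vocabulary text.
ids = sp_en.encode("hello world", out_type=int)
assert sp_en.decode(ids) == "hello world"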
@@ -87,27 +99,25 @@ def jsonl_stream(file_path):
     with open(file_path, "r", encoding="utf-8") as f:
         for line in f:
             data = json.loads(line)
-
-
-            answer = data["answer"]
+            prompt = data["ko"]
+            answer = data["en"]

             # =======================
             # Encoder input: special tokens specified at the ID level
             # =======================
-            enc_ids = \
-                [user_s_id] + text_to_ids(prompt) + [user_e_id]
+            enc_ids = text_to_ids(prompt)
             enc_ids = enc_ids[:max_len]  # max_len limit

             # =======================
             # Decoder input: <sos> + answer
             # =======================
-            dec_input_ids = [start_id] + text_to_ids(answer)
+            dec_input_ids = [estart_id] + text_to_ids(answer)
             dec_input_ids = dec_input_ids[:max_len]

             # =======================
             # Target: answer + <eos>
             # =======================
-            target_ids = text_to_ids(answer) + [end_id]
+            target_ids = etext_to_ids(answer) + [eend_id]
             target_ids = target_ids[:max_len]

             # =======================
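The generator above yields variable-length ID sequences, so they need padding before batching. A hedged sketch of how it might feed a padded tf.data pipeline follows; the (enc_ids, dec_input_ids, target_ids) yield signature and the make_dataset helper are assumptions, since the diff does not show the actual dataset construction.

import tensorflow as tf

max_len = 256  # assumed to match the model's max_len below

def make_dataset(file_path, batch_size=32):
    # jsonl_stream is assumed to yield (enc_ids, dec_input_ids, target_ids).
    ds = tf.data.Dataset.from_generator(
        lambda: jsonl_stream(file_path),
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
        ),
    )
    # Encoder rows pad with the Korean pad id, decoder/target rows with the
    # English one, since the two vocabularies are independent.
    return ds.padded_batch(
        batch_size,
        padded_shapes=([max_len], [max_len], [max_len]),
        padding_values=(pad_id, epad_id, epad_id),
    ).prefetch(tf.data.AUTOTUNE)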
@@ -255,7 +265,7 @@ def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):

 with strategy.scope():
     # ⚠️ Fix: use the vocab_size defined above instead of chat_vocab_size
-    chat_model = Transformer(num_layers=2, d_model=256, num_heads=4, dff=768, input_vocab_size=vocab_size, target_vocab_size=vocab_size, max_len=256, dropout=0.1)
+    chat_model = Transformer(num_layers=2, d_model=256, num_heads=4, dff=768, input_vocab_size=vocab_size, target_vocab_size=evocab_size, max_len=256, dropout=0.1)

     dummy_input = {
         "enc_inputs": tf.zeros((1, max_len), dtype=tf.int32),
|