Yuchan committed on
Commit
19949b0
·
verified ·
1 Parent(s): 42abc8c

Update AlphaS2S.py

Browse files
Files changed (1) hide show
  1. AlphaS2S.py +8 -6
AlphaS2S.py CHANGED
@@ -51,22 +51,25 @@ TOKENIZER_PATH = "ko_unigram.model"
51
 
52
  if not os.path.exists(DATA_PATH):
53
  download_file(
54
- "https://huggingface.co/datasets/Yuchan5386/TinyInst/resolve/main/output.jsonl?download=true",
55
  DATA_PATH
56
  )
57
 
58
  if not os.path.exists(TOKENIZER_PATH):
59
  download_file(
60
- "https://huggingface.co/datasets/Yuchan5386/TinyInst/resolve/main/ko_unigram.model?download=true",
61
  TOKENIZER_PATH
62
  )
63
 
64
  sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
65
 
66
  pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
67
- start_id = sp.piece_to_id("<start>")
68
- sep_id = sp.piece_to_id("<sep>")
69
- end_id = sp.piece_to_id("<end>")
 
 
 
70
  unk_id = sp.piece_to_id("<unk>")
71
  vocab_size = sp.get_piece_size()
72
  print(f"✅ Vocabulary size: {vocab_size}")
@@ -77,7 +80,6 @@ def text_to_ids(text):
77
  def ids_to_text(ids):
78
  return sp.decode(ids)
79
 
80
-
81
  # =======================
82
  # 2) ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ ํ•จ์ˆ˜ (๊ธฐ์กด ์ฝ”๋“œ์™€ ๋™์ผ)
83
  # =======================
 
51
 
52
  if not os.path.exists(DATA_PATH):
53
  download_file(
54
+ "https://huggingface.co/datasets/Yuchan5386/Multiturn/resolve/main/dataset_shuffled.jsonl?download=true",
55
  DATA_PATH
56
  )
57
 
58
  if not os.path.exists(TOKENIZER_PATH):
59
  download_file(
60
+ "https://huggingface.co/datasets/Yuchan5386/Multiturn/resolve/main/unigram.model?download=true",
61
  TOKENIZER_PATH
62
  )
63
 
64
  sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
65
 
66
  pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
67
+ start_id = sp.piece_to_id("<sos>")
68
+ context_s_id = sp.piece_to_id("<context>")
69
+ context_e_id = sp.piece_to_id("</context>")
70
+ user_s_id = sp.piece_to_id("<user>")
71
+ user_e_id = sp.piece_to_id("</user>")
72
+ end_id = sp.piece_to_id("<eos>")
73
  unk_id = sp.piece_to_id("<unk>")
74
  vocab_size = sp.get_piece_size()
75
  print(f"✅ Vocabulary size: {vocab_size}")
 
80
  def ids_to_text(ids):
81
  return sp.decode(ids)
82
 
 
83
  # =======================
84
  # 2) ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ ํ•จ์ˆ˜ (๊ธฐ์กด ์ฝ”๋“œ์™€ ๋™์ผ)
85
  # =======================