Spaces:
Sleeping
Sleeping
File size: 1,353 Bytes
c1596ac | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | import json
import os
import sentencepiece as spm
import yaml
# Project parameters: parsed once at import time so that the script entry
# point below can look up tokenizer settings from the resulting mapping.
with open("/workspace/params.yaml", encoding="utf-8") as params_file:
    params = yaml.safe_load(params_file)
def train_sentencepiece(
    json_path,
    model_prefix="sub_tokenizer",
    vocab_size=500,
    model_type="unigram",
    txt_path="/workspace/src/dataset/sub_tokenizing_captions.txt",
):
    """Train a SentencePiece tokenizer on the captions found in a JSON file.

    The JSON document is loaded, every caption is lower-cased and written on
    its own line to ``txt_path``, and that text file is then handed to
    ``spm.SentencePieceTrainer.train`` as the training corpus.

    Args:
        json_path: Path of the JSON annotations file. The loaded document is
            iterated, and each item must provide a ``"captions"`` entry that
            is an iterable of strings.
        model_prefix: Passed through to the trainer as ``model_prefix``
            (the path prefix SentencePiece uses for its output files).
        vocab_size: Passed through to the trainer as ``vocab_size``.
        model_type: Passed through to the trainer as ``model_type``.
        txt_path: Where the intermediate one-caption-per-line corpus is
            written. The file is overwritten if it already exists.

    Raises:
        OSError: If ``json_path`` cannot be read or ``txt_path`` cannot be
            written.
        KeyError: If an item has no ``"captions"`` entry.
    """
    # Decode explicitly as UTF-8: the corpus below is written as UTF-8, and
    # relying on the platform's locale encoding here can fail or corrupt
    # non-ASCII captions.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Flatten all captions into a plain-text corpus, one caption per line.
    with open(txt_path, "w", encoding="utf-8") as f:
        for item in data:
            f.writelines(
                caption.lower() + "\n" for caption in item["captions"]
            )

    # Special pieces get fixed ids: <pad>=0, <sos>=1, <eos>=2, <unk>=3.
    spm.SentencePieceTrainer.train(
        input=txt_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        pad_piece="<pad>",
        bos_piece="<sos>",
        eos_piece="<eos>",
        unk_piece="<unk>",
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
    )
    print("tokenizer training done")
if __name__ == "__main__":
    # The vocabulary size is taken from the shared params mapping; the input
    # annotations and the output model prefix are fixed workspace paths.
    tokenizer_settings = params["captioning"]["tokenizer"]
    train_sentencepiece(
        json_path="/workspace/data/captioning/annotations/train.json",
        model_prefix="/workspace/src/dataset/sub_tokenizer2000",
        vocab_size=tokenizer_settings["sp_vocab_size"],
        model_type="unigram",
    )