tfm_reproduction_COME / tokenizer /apply_tokenizer.py
clouds125's picture
Upload folder using huggingface_hub
2d8ff8e verified
raw
history blame contribute delete
371 Bytes
from tokenizers import ByteLevelBPETokenizer
# Demo script: build a byte-level BPE tokenizer from the local vocab/merges
# files, register a set of special tokens, and print the tokens produced for
# one sample sentence.

# Paths are relative to the current working directory, not to this file.
_VOCAB_FILE = "./salesforce/codet5-vocab.json"
_MERGES_FILE = "./salesforce/codet5-merges.txt"
tokenizer = ByteLevelBPETokenizer.from_file(_VOCAB_FILE, _MERGES_FILE)

# Special tokens handed to the tokenizer before encoding.
_SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "<unk>", "<mask>"]
tokenizer.add_special_tokens(_SPECIAL_TOKENS)

# The sample mixes special tokens, an apostrophe, and an emoji.
_SAMPLE_TEXT = "<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>"
_encoding = tokenizer.encode(_SAMPLE_TEXT)
print(_encoding.tokens)