{
"tokenizer_class": "MorPieceTokenizer",
"auto_map": {
"AutoTokenizer": [
"morpiece_tokenizer.MorPieceTokenizer",
null
]
},
"bos_token": "<s>",
"eos_token": "</s>",
"unk_token": "<unk>",
"pad_token": "<pad>",
"mask_token": "<mask>",
"sep_token": "<sep>",
"cls_token": "<cls>",
"model_max_length": 512,
"padding_side": "left",
"truncation_side": "right",
"chat_template": null,
"clean_up_tokenization_spaces": false,
"split_special_tokens": false,
"strip_accents": null,
"add_prefix_space": true,
"vocab_size": 29066,
"min_frequency": 10,
"cutoff": 100,
"bf": 10,
"use_tokenizers_lib": true
}