{
"added_tokens_decoder": {
"0": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4": {
"content": "<|system|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"5": {
"content": "<|user|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"6": {
"content": "<|assistant|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"7": {
"content": "<|endofturn|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"8": {
"content": "<|thinking|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"9": {
"content": "<|/thinking|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"10": {
"content": "<|reserved_0|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"11": {
"content": "<|reserved_1|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"12": {
"content": "<|reserved_2|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"13": {
"content": "<|reserved_3|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"14": {
"content": "<|reserved_4|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"15": {
"content": "<|reserved_5|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"16": {
"content": "<|reserved_6|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"17": {
"content": "<|reserved_7|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"18": {
"content": "<|reserved_8|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"19": {
"content": "<|reserved_9|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"20": {
"content": "<|reserved_10|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"21": {
"content": "<|reserved_11|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"22": {
"content": "<|reserved_12|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"23": {
"content": "<|reserved_13|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"24": {
"content": "<|reserved_14|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"25": {
"content": "<|reserved_15|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"26": {
"content": "<|reserved_16|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"27": {
"content": "<|reserved_17|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"28": {
"content": "<|reserved_18|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"29": {
"content": "<|reserved_19|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"30": {
"content": "<|reserved_20|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"31": {
"content": "<|reserved_21|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32": {
"content": "<|reserved_22|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"33": {
"content": "<|reserved_23|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"34": {
"content": "<|reserved_24|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"35": {
"content": "<|reserved_25|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"36": {
"content": "<|reserved_26|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"37": {
"content": "<|reserved_27|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"38": {
"content": "<|reserved_28|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"39": {
"content": "<|reserved_29|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"40": {
"content": "<|reserved_30|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"41": {
"content": "<|reserved_31|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"42": {
"content": "<|reserved_32|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"43": {
"content": "<|reserved_33|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"44": {
"content": "<|reserved_34|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"45": {
"content": "<|reserved_35|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"46": {
"content": "<|reserved_36|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"47": {
"content": "<|reserved_37|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"48": {
"content": "<|reserved_38|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"49": {
"content": "<|reserved_39|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"50": {
"content": "<|reserved_40|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"51": {
"content": "<|reserved_41|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52": {
"content": "<|reserved_42|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"53": {
"content": "<|reserved_43|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"54": {
"content": "<|reserved_44|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"55": {
"content": "<|reserved_45|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"56": {
"content": "<|reserved_46|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"57": {
"content": "<|reserved_47|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"58": {
"content": "<|reserved_48|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"59": {
"content": "<|reserved_49|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"60": {
"content": "<|reserved_50|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"61": {
"content": "<|reserved_51|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"62": {
"content": "<|reserved_52|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"63": {
"content": "<|reserved_53|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<|system|>",
"<|user|>",
"<|assistant|>",
"<|endofturn|>",
"<|thinking|>",
"<|/thinking|>",
"<|reserved_0|>",
"<|reserved_1|>",
"<|reserved_2|>",
"<|reserved_3|>",
"<|reserved_4|>",
"<|reserved_5|>",
"<|reserved_6|>",
"<|reserved_7|>",
"<|reserved_8|>",
"<|reserved_9|>",
"<|reserved_10|>",
"<|reserved_11|>",
"<|reserved_12|>",
"<|reserved_13|>",
"<|reserved_14|>",
"<|reserved_15|>",
"<|reserved_16|>",
"<|reserved_17|>",
"<|reserved_18|>",
"<|reserved_19|>",
"<|reserved_20|>",
"<|reserved_21|>",
"<|reserved_22|>",
"<|reserved_23|>",
"<|reserved_24|>",
"<|reserved_25|>",
"<|reserved_26|>",
"<|reserved_27|>",
"<|reserved_28|>",
"<|reserved_29|>",
"<|reserved_30|>",
"<|reserved_31|>",
"<|reserved_32|>",
"<|reserved_33|>",
"<|reserved_34|>",
"<|reserved_35|>",
"<|reserved_36|>",
"<|reserved_37|>",
"<|reserved_38|>",
"<|reserved_39|>",
"<|reserved_40|>",
"<|reserved_41|>",
"<|reserved_42|>",
"<|reserved_43|>",
"<|reserved_44|>",
"<|reserved_45|>",
"<|reserved_46|>",
"<|reserved_47|>",
"<|reserved_48|>",
"<|reserved_49|>",
"<|reserved_50|>",
"<|reserved_51|>",
"<|reserved_52|>",
"<|reserved_53|>"
],
"bos_token": "<s>",
"clean_up_tokenization_spaces": false,
"created_by": "OvercastLab",
"description": "Quark bilingual EN+IT tokenizer — BPE byte-level 65536 vocab",
"eos_token": "</s>",
"extra_special_tokens": {},
"languages": [
"en",
"it"
],
"model_max_length": 2048,
"pad_token": "<pad>",
"padding_side": "right",
"tokenizer_class": "PreTrainedTokenizerFast",
"unk_token": "<unk>"
}