Lake-1-6B-spe / tokenizer.json
BICORP's picture
Upload 4 files
b2654cf verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
".": 5,
"T": 6,
"a": 7,
"b": 8,
"c": 9,
"d": 10,
"e": 11,
"f": 12,
"g": 13,
"h": 14,
"i": 15,
"j": 16,
"k": 17,
"l": 18,
"m": 19,
"n": 20,
"o": 21,
"p": 22,
"q": 23,
"r": 24,
"s": 25,
"t": 26,
"u": 27,
"v": 28,
"w": 29,
"x": 30,
"y": 31,
"z": 32,
"Ġ": 33,
"Ġt": 34,
"Th": 35,
"er": 36,
"fo": 37,
"in": 38,
"is": 39,
"mp": 40,
"Ġfo": 41,
"ĠTh": 42,
"az": 43,
"ain": 44,
"amp": 45,
"br": 46,
"ck": 47,
"do": 48,
"en": 49,
"ex": 50,
"he": 51,
"iz": 52,
"ick": 53,
"ju": 54,
"ken": 55,
"le": 56,
"laz": 57,
"ov": 58,
"ow": 59,
"oken": 60,
"qu": 61,
"rain": 62,
"samp": 63,
"Ġa": 64,
"Ġis": 65,
"Ġbr": 66,
"Ġdo": 67,
"Ġju": 68,
"Ġlaz": 69,
"Ġov": 70,
"Ġqu": 71,
"Ġsamp": 72,
"Ġtex": 73,
"Ġthe": 74,
"Ġtoken": 75,
"Ġtrain": 76,
"ing": 77,
"mps": 78,
"Ġfor": 79,
"Ġfox": 80,
"izer": 81,
"own": 82,
"Ġbrown": 83,
"Ġdog": 84,
"Ġjumps": 85,
"Ġlazy": 86,
"Ġover": 87,
"Ġquick": 88,
"Ġsample": 89,
"Ġtext": 90,
"Ġtokenizer": 91,
"Ġtraining": 92,
"ĠThe": 93,
"ĠThis": 94
},
"merges": [
[
"Ġ",
"t"
],
[
"T",
"h"
],
[
"e",
"r"
],
[
"f",
"o"
],
[
"i",
"n"
],
[
"i",
"s"
],
[
"m",
"p"
],
[
"Ġ",
"fo"
],
[
"Ġ",
"Th"
],
[
"a",
"z"
],
[
"a",
"in"
],
[
"a",
"mp"
],
[
"b",
"r"
],
[
"c",
"k"
],
[
"d",
"o"
],
[
"e",
"n"
],
[
"e",
"x"
],
[
"h",
"e"
],
[
"i",
"z"
],
[
"i",
"ck"
],
[
"j",
"u"
],
[
"k",
"en"
],
[
"l",
"e"
],
[
"l",
"az"
],
[
"o",
"v"
],
[
"o",
"w"
],
[
"o",
"ken"
],
[
"q",
"u"
],
[
"r",
"ain"
],
[
"s",
"amp"
],
[
"Ġ",
"a"
],
[
"Ġ",
"is"
],
[
"Ġ",
"br"
],
[
"Ġ",
"do"
],
[
"Ġ",
"ju"
],
[
"Ġ",
"laz"
],
[
"Ġ",
"ov"
],
[
"Ġ",
"qu"
],
[
"Ġ",
"samp"
],
[
"Ġt",
"ex"
],
[
"Ġt",
"he"
],
[
"Ġt",
"oken"
],
[
"Ġt",
"rain"
],
[
"in",
"g"
],
[
"mp",
"s"
],
[
"Ġfo",
"r"
],
[
"Ġfo",
"x"
],
[
"iz",
"er"
],
[
"ow",
"n"
],
[
"Ġbr",
"own"
],
[
"Ġdo",
"g"
],
[
"Ġju",
"mps"
],
[
"Ġlaz",
"y"
],
[
"Ġov",
"er"
],
[
"Ġqu",
"ick"
],
[
"Ġsamp",
"le"
],
[
"Ġtex",
"t"
],
[
"Ġtoken",
"izer"
],
[
"Ġtrain",
"ing"
],
[
"ĠTh",
"e"
],
[
"ĠTh",
"is"
]
]
}
}