speedcore / tokenizer.json
speedartificialintelligence1122's picture
Upload tokenizer.json
fda8691 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "NFD"
},
{
"type": "Lowercase"
},
{
"type": "StripAccents"
}
]
},
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "<bos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 1
}
}
],
"special_tokens": {
"<bos>": {
"id": "<bos>",
"ids": [
1
],
"tokens": [
"<bos>"
]
},
"<eos>": {
"id": "<eos>",
"ids": [
2
],
"tokens": [
"<eos>"
]
}
}
},
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<pad>": 0,
"<bos>": 1,
"<eos>": 2,
"<unk>": 3,
",": 4,
".": 5,
"a": 6,
"b": 7,
"c": 8,
"d": 9,
"e": 10,
"f": 11,
"g": 12,
"h": 13,
"i": 14,
"j": 15,
"k": 16,
"l": 17,
"m": 18,
"n": 19,
"o": 20,
"p": 21,
"r": 22,
"s": 23,
"t": 24,
"u": 25,
"v": 26,
"w": 27,
"y": 28,
"—": 29,
"’": 30,
"“": 31,
"”": 32,
"re": 33,
"an": 34,
"in": 35,
"is": 36,
"al": 37,
"ea": 38,
"on": 39,
"sp": 40,
"th": 41,
"tu": 42,
"and": 43,
"am": 44,
"bo": 45,
"bu": 46,
"co": 47,
"de": 48,
"el": 49,
"fr": 50,
"ion": 51,
"us": 52,
"we": 53,
"ai": 54,
"are": 55,
"ca": 56,
"ce": 57,
"ed": 58,
"en": 59,
"eed": 60,
"fu": 61,
"ig": 62,
"il": 63,
"lo": 64,
"lig": 65,
"no": 66,
"om": 67,
"or": 68,
"ri": 69,
"sc": 70,
"va": 71,
"ve": 72,
"all": 73,
"ear": 74,
"speed": 75,
"the": 76,
"ture": 77,
"buil": 78,
"from": 79,
"ence": 80,
"future": 81,
".”": 82,
"ah": 83,
"abo": 84,
"ade": 85,
"afr": 86,
"ava": 87,
"be": 88,
"by": 89,
"bel": 90,
"ch": 91,
"can": 92,
"dre": 93,
"din": 94,
"es": 95,
"et": 96,
"ean": 97,
"eve": 98,
"for": 99,
"hn": 100,
"his": 101,
"hel": 102,
"hear": 103,
"it": 104,
"ith": 105,
"ica": 106,
"iri": 107,
"ience": 108,
"ieve": 109,
"java": 110,
"jes": 111,
"kr": 112,
"ks": 113,
"lea": 114,
"let": 115,
"my": 116,
"made": 117,
"mean": 118,
"mfor": 119,
"ntu": 120,
"nam": 121,
"nit": 122,
"ou": 123,
"owe": 124,
"oks": 125,
"pt": 126,
"py": 127,
"pre": 128,
"powe": 129,
"rn": 130,
"rs": 131,
"rus": 132,
"ss": 133,
"sear": 134,
"to": 135,
"tal": 136,
"tel": 137,
"tion": 138,
"tor": 139,
"ut": 140,
"ure": 141,
"ubu": 142,
"unit": 143,
"vis": 144,
"with": 145,
"relig": 146,
"inno": 147,
"intel": 148,
"ishn": 149,
"eak": 150,
"spiri": 151,
"speak": 152,
"thon": 153,
"tual": 154,
"books": 155,
"born": 156,
"code": 157,
"codin": 158,
"comfor": 159,
"ders": 160,
"use": 161,
"cause": 162,
"love": 163,
"ligence": 164,
"not": 165,
"ript": 166,
"science": 167,
"script": 168,
"vation": 169,
"allah": 170,
"build": 171,
"built": 172,
"about": 173,
"africa": 174,
"because": 175,
"believe": 176,
"dream": 177,
"histor": 178,
"hello": 179,
"heart": 180,
"javascript": 181,
"jesus": 182,
"krishn": 183,
"leaders": 184,
"means": 185,
"name": 186,
"our": 187,
"python": 188,
"press": 189,
"power": 190,
"rust": 191,
"search": 192,
"talk": 193,
"ubuntu": 194,
"unity": 195,
"vision": 196,
"religion": 197,
"innovation": 198,
"intelligence": 199,
"spiritual": 200,
"coding": 201,
"comfort": 202,
"history": 203,
"krishna": 204,
"pressure": 205
},
"merges": [
[
"r",
"e"
],
[
"a",
"n"
],
[
"i",
"n"
],
[
"i",
"s"
],
[
"a",
"l"
],
[
"e",
"a"
],
[
"o",
"n"
],
[
"s",
"p"
],
[
"t",
"h"
],
[
"t",
"u"
],
[
"an",
"d"
],
[
"a",
"m"
],
[
"b",
"o"
],
[
"b",
"u"
],
[
"c",
"o"
],
[
"d",
"e"
],
[
"e",
"l"
],
[
"f",
"r"
],
[
"i",
"on"
],
[
"u",
"s"
],
[
"w",
"e"
],
[
"a",
"i"
],
[
"a",
"re"
],
[
"c",
"a"
],
[
"c",
"e"
],
[
"e",
"d"
],
[
"e",
"n"
],
[
"e",
"ed"
],
[
"f",
"u"
],
[
"i",
"g"
],
[
"i",
"l"
],
[
"l",
"o"
],
[
"l",
"ig"
],
[
"n",
"o"
],
[
"o",
"m"
],
[
"o",
"r"
],
[
"r",
"i"
],
[
"s",
"c"
],
[
"v",
"a"
],
[
"v",
"e"
],
[
"al",
"l"
],
[
"ea",
"r"
],
[
"sp",
"eed"
],
[
"th",
"e"
],
[
"tu",
"re"
],
[
"bu",
"il"
],
[
"fr",
"om"
],
[
"en",
"ce"
],
[
"fu",
"ture"
],
[
".",
"”"
],
[
"a",
"h"
],
[
"a",
"bo"
],
[
"a",
"de"
],
[
"a",
"fr"
],
[
"a",
"va"
],
[
"b",
"e"
],
[
"b",
"y"
],
[
"b",
"el"
],
[
"c",
"h"
],
[
"c",
"an"
],
[
"d",
"re"
],
[
"d",
"in"
],
[
"e",
"s"
],
[
"e",
"t"
],
[
"e",
"an"
],
[
"e",
"ve"
],
[
"f",
"or"
],
[
"h",
"n"
],
[
"h",
"is"
],
[
"h",
"el"
],
[
"h",
"ear"
],
[
"i",
"t"
],
[
"i",
"th"
],
[
"i",
"ca"
],
[
"i",
"ri"
],
[
"i",
"ence"
],
[
"i",
"eve"
],
[
"j",
"ava"
],
[
"j",
"es"
],
[
"k",
"r"
],
[
"k",
"s"
],
[
"l",
"ea"
],
[
"l",
"et"
],
[
"m",
"y"
],
[
"m",
"ade"
],
[
"m",
"ean"
],
[
"m",
"for"
],
[
"n",
"tu"
],
[
"n",
"am"
],
[
"n",
"it"
],
[
"o",
"u"
],
[
"o",
"we"
],
[
"o",
"ks"
],
[
"p",
"t"
],
[
"p",
"y"
],
[
"p",
"re"
],
[
"p",
"owe"
],
[
"r",
"n"
],
[
"r",
"s"
],
[
"r",
"us"
],
[
"s",
"s"
],
[
"s",
"ear"
],
[
"t",
"o"
],
[
"t",
"al"
],
[
"t",
"el"
],
[
"t",
"ion"
],
[
"t",
"or"
],
[
"u",
"t"
],
[
"u",
"re"
],
[
"u",
"bu"
],
[
"u",
"nit"
],
[
"v",
"is"
],
[
"w",
"ith"
],
[
"re",
"lig"
],
[
"in",
"no"
],
[
"in",
"tel"
],
[
"is",
"hn"
],
[
"ea",
"k"
],
[
"sp",
"iri"
],
[
"sp",
"eak"
],
[
"th",
"on"
],
[
"tu",
"al"
],
[
"bo",
"oks"
],
[
"bo",
"rn"
],
[
"co",
"de"
],
[
"co",
"din"
],
[
"co",
"mfor"
],
[
"de",
"rs"
],
[
"us",
"e"
],
[
"ca",
"use"
],
[
"lo",
"ve"
],
[
"lig",
"ence"
],
[
"no",
"t"
],
[
"ri",
"pt"
],
[
"sc",
"ience"
],
[
"sc",
"ript"
],
[
"va",
"tion"
],
[
"all",
"ah"
],
[
"buil",
"d"
],
[
"buil",
"t"
],
[
"abo",
"ut"
],
[
"afr",
"ica"
],
[
"be",
"cause"
],
[
"bel",
"ieve"
],
[
"dre",
"am"
],
[
"his",
"tor"
],
[
"hel",
"lo"
],
[
"hear",
"t"
],
[
"java",
"script"
],
[
"jes",
"us"
],
[
"kr",
"ishn"
],
[
"lea",
"ders"
],
[
"mean",
"s"
],
[
"nam",
"e"
],
[
"ou",
"r"
],
[
"py",
"thon"
],
[
"pre",
"ss"
],
[
"powe",
"r"
],
[
"rus",
"t"
],
[
"sear",
"ch"
],
[
"tal",
"k"
],
[
"ubu",
"ntu"
],
[
"unit",
"y"
],
[
"vis",
"ion"
],
[
"relig",
"ion"
],
[
"inno",
"vation"
],
[
"intel",
"ligence"
],
[
"spiri",
"tual"
],
[
"codin",
"g"
],
[
"comfor",
"t"
],
[
"histor",
"y"
],
[
"krishn",
"a"
],
[
"press",
"ure"
]
]
}
}