Base-mini / tokenizer.json
QuantaSparkLabs's picture
Push to HF
59f1228 verified
Raw
History Blame Contribute Delete
20.9 kB
{
"version": "1.0",
"truncation": null,
"padding": {
"strategy": "BatchLongest",
"direction": "Right",
"pad_to_multiple_of": null,
"pad_id": 0,
"pad_type_id": 0,
"pad_token": "[PAD]"
},
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "[BOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "[EOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "NFC"
},
{
"type": "Lowercase"
}
]
},
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[BOS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[EOS]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[BOS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[EOS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[EOS]",
"type_id": 1
}
}
],
"special_tokens": {
"[BOS]": {
"id": "[BOS]",
"ids": [
5
],
"tokens": [
"[BOS]"
]
},
"[EOS]": {
"id": "[EOS]",
"ids": [
6
],
"tokens": [
"[EOS]"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[SEP]": 2,
"[CLS]": 3,
"[MASK]": 4,
"[BOS]": 5,
"[EOS]": 6,
"!": 7,
"\"": 8,
"#": 9,
"$": 10,
"%": 11,
"&": 12,
"'": 13,
"(": 14,
")": 15,
"*": 16,
"+": 17,
",": 18,
"-": 19,
".": 20,
"/": 21,
"0": 22,
"1": 23,
"2": 24,
"3": 25,
"4": 26,
"5": 27,
"6": 28,
"7": 29,
"8": 30,
"9": 31,
":": 32,
";": 33,
"<": 34,
"=": 35,
">": 36,
"?": 37,
"@": 38,
"A": 39,
"B": 40,
"C": 41,
"D": 42,
"E": 43,
"F": 44,
"G": 45,
"H": 46,
"I": 47,
"J": 48,
"K": 49,
"L": 50,
"M": 51,
"N": 52,
"O": 53,
"P": 54,
"Q": 55,
"R": 56,
"S": 57,
"T": 58,
"U": 59,
"V": 60,
"W": 61,
"X": 62,
"Y": 63,
"Z": 64,
"[": 65,
"\\": 66,
"]": 67,
"^": 68,
"_": 69,
"`": 70,
"a": 71,
"b": 72,
"c": 73,
"d": 74,
"e": 75,
"f": 76,
"g": 77,
"h": 78,
"i": 79,
"j": 80,
"k": 81,
"l": 82,
"m": 83,
"n": 84,
"o": 85,
"p": 86,
"q": 87,
"r": 88,
"s": 89,
"t": 90,
"u": 91,
"v": 92,
"w": 93,
"x": 94,
"y": 95,
"z": 96,
"{": 97,
"|": 98,
"}": 99,
"~": 100,
"¡": 101,
"¢": 102,
"£": 103,
"¤": 104,
"¥": 105,
"¦": 106,
"§": 107,
"¨": 108,
"©": 109,
"ª": 110,
"«": 111,
"¬": 112,
"®": 113,
"¯": 114,
"°": 115,
"±": 116,
"²": 117,
"³": 118,
"´": 119,
"µ": 120,
"¶": 121,
"·": 122,
"¸": 123,
"¹": 124,
"º": 125,
"»": 126,
"¼": 127,
"½": 128,
"¾": 129,
"¿": 130,
"À": 131,
"Á": 132,
"Â": 133,
"Ã": 134,
"Ä": 135,
"Å": 136,
"Æ": 137,
"Ç": 138,
"È": 139,
"É": 140,
"Ê": 141,
"Ë": 142,
"Ì": 143,
"Í": 144,
"Î": 145,
"Ï": 146,
"Ð": 147,
"Ñ": 148,
"Ò": 149,
"Ó": 150,
"Ô": 151,
"Õ": 152,
"Ö": 153,
"×": 154,
"Ø": 155,
"Ù": 156,
"Ú": 157,
"Û": 158,
"Ü": 159,
"Ý": 160,
"Þ": 161,
"ß": 162,
"à": 163,
"á": 164,
"â": 165,
"ã": 166,
"ä": 167,
"å": 168,
"æ": 169,
"ç": 170,
"è": 171,
"é": 172,
"ê": 173,
"ë": 174,
"ì": 175,
"í": 176,
"î": 177,
"ï": 178,
"ð": 179,
"ñ": 180,
"ò": 181,
"ó": 182,
"ô": 183,
"õ": 184,
"ö": 185,
"÷": 186,
"ø": 187,
"ù": 188,
"ú": 189,
"û": 190,
"ü": 191,
"ý": 192,
"þ": 193,
"ÿ": 194,
"Ā": 195,
"ā": 196,
"Ă": 197,
"ă": 198,
"Ą": 199,
"ą": 200,
"Ć": 201,
"ć": 202,
"Ĉ": 203,
"ĉ": 204,
"Ċ": 205,
"ċ": 206,
"Č": 207,
"č": 208,
"Ď": 209,
"ď": 210,
"Đ": 211,
"đ": 212,
"Ē": 213,
"ē": 214,
"Ĕ": 215,
"ĕ": 216,
"Ė": 217,
"ė": 218,
"Ę": 219,
"ę": 220,
"Ě": 221,
"ě": 222,
"Ĝ": 223,
"ĝ": 224,
"Ğ": 225,
"ğ": 226,
"Ġ": 227,
"ġ": 228,
"Ģ": 229,
"ģ": 230,
"Ĥ": 231,
"ĥ": 232,
"Ħ": 233,
"ħ": 234,
"Ĩ": 235,
"ĩ": 236,
"Ī": 237,
"ī": 238,
"Ĭ": 239,
"ĭ": 240,
"Į": 241,
"į": 242,
"İ": 243,
"ı": 244,
"IJ": 245,
"ij": 246,
"Ĵ": 247,
"ĵ": 248,
"Ķ": 249,
"ķ": 250,
"ĸ": 251,
"Ĺ": 252,
"ĺ": 253,
"Ļ": 254,
"ļ": 255,
"Ľ": 256,
"ľ": 257,
"Ŀ": 258,
"ŀ": 259,
"Ł": 260,
"ł": 261,
"Ń": 262,
"es": 263,
"en": 264,
"th": 265,
"the": 266,
"at": 267,
"or": 268,
"de": 269,
"ates": 270,
"ut": 271,
"co": 272,
"ra": 273,
"di": 274,
"Ġp": 275,
"Ġt": 276,
"ns": 277,
"ent": 278,
"st": 279,
"Ġpr": 280,
"Ġa": 281,
"io": 282,
"ts": 283,
"Ġde": 284,
"Ġdeco": 285,
"li": 286,
"wor": 287,
"work": 288,
"Ġf": 289,
"ne": 290,
"Ġpro": 291,
"Ġg": 292,
"des": 293,
"Ġe": 294,
"er": 295,
"put": 296,
"al": 297,
"der": 298,
"ces": 299,
"for": 300,
"rans": 301,
"Ġtrans": 302,
"form": 303,
"Ġtransform": 304,
"Ġs": 305,
"Ġo": 306,
"zes": 307,
"ions": 308,
"Ġen": 309,
"Ġenco": 310,
"re": 311,
"Ġdecodes": 312,
"ab": 313,
"but": 314,
"bab": 315,
"ili": 316,
"ibut": 317,
"ribut": 318,
"ty": 319,
"Ġdi": 320,
"stribut": 321,
"Ġprobab": 322,
"ility": 323,
"Ġdistribut": 324,
"Ġprobability": 325,
"Ġdistributions": 326,
"dd": 327,
"hi": 328,
"Ġst": 329,
"Ġhi": 330,
"dden": 331,
"Ġstates": 332,
"Ġhidden": 333,
"ener": 334,
"Ġgener": 335,
"Ġgenerates": 336,
"ework": 337,
"mework": 338,
"ramework": 339,
"Ġframework": 340,
"as": 341,
"cl": 342,
"fi": 343,
"ifi": 344,
"sifi": 345,
"Ġcl": 346,
"assifi": 347,
"Ġclassifi": 348,
"Ġclassifies": 349,
"be": 350,
"ddi": 351,
"gs": 352,
"mbe": 353,
"ngs": 354,
"Ġembe": 355,
"ddings": 356,
"Ġembeddings": 357,
"twork": 358,
"Ġne": 359,
"Ġnetwork": 360,
"eli": 361,
"ip": 362,
"Ġpip": 363,
"eline": 364,
"Ġpipeline": 365,
"Ġdecoder": 366,
"ct": 367,
"eat": 368,
"ect": 369,
"ure": 370,
"vect": 371,
"Ġvect": 372,
"ors": 373,
"Ġfeat": 374,
"Ġvectors": 375,
"Ġfeature": 376,
"an": 377,
"gent": 378,
"Ġagent": 379,
"mo": 380,
"Ġmo": 381,
"del": 382,
"Ġmodel": 383,
"in": 384,
"ken": 385,
"oken": 386,
"Ġin": 387,
"Ġtoken": 388,
"Ġinput": 389,
"Ġtokens": 390,
"gi": 391,
"lo": 392,
"Ġlo": 393,
"utput": 394,
"Ġoutput": 395,
"gits": 396,
"Ġlogits": 397,
"uates": 398,
"val": 399,
"Ġeval": 400,
"Ġtransforms": 401,
"Ġevaluates": 402,
"ses": 403,
"Ġproces": 404,
"Ġprocesses": 405,
"dates": 406,
"pdates": 407,
"updates": 408,
"Ġupdates": 409,
"radi": 410,
"Ġgradi": 411,
"Ġgradient": 412,
"nal": 413,
"yzes": 414,
"Ġanal": 415,
"Ġanalyzes": 416,
"eq": 417,
"ex": 418,
"uen": 419,
"Ġtex": 420,
"Ġseq": 421,
"uences": 422,
"Ġtext": 423,
"Ġsequences": 424,
"em": 425,
"yst": 426,
"Ġsyst": 427,
"Ġsystem": 428,
"Ġtransformer": 429,
"Ġencodes": 430,
"im": 431,
"izes": 432,
"pt": 433,
"Ġopt": 434,
"imizes": 435,
"Ġoptimizes": 436,
"Ġencoder": 437,
"ei": 438,
"gh": 439,
"tent": 440,
"wei": 441,
"Ġat": 442,
"Ġwei": 443,
"ion": 444,
"ghts": 445,
"tention": 446,
"Ġattention": 447,
"Ġweights": 448,
"gor": 449,
"ith": 450,
"lgor": 451,
"Ġalgor": 452,
"ithm": 453,
"Ġalgorithm": 454,
"cts": 455,
"edi": 456,
"Ġpredi": 457,
"Ġpredicts": 458,
"pr": 459,
"Ġre": 460,
"esent": 461,
"ations": 462,
"present": 463,
"Ġrepresent": 464,
"Ġrepresentations": 465
},
"merges": [
[
"e",
"s"
],
[
"e",
"n"
],
[
"t",
"h"
],
[
"th",
"e"
],
[
"a",
"t"
],
[
"o",
"r"
],
[
"d",
"e"
],
[
"at",
"es"
],
[
"u",
"t"
],
[
"c",
"o"
],
[
"r",
"a"
],
[
"d",
"i"
],
[
"Ġ",
"p"
],
[
"Ġ",
"t"
],
[
"n",
"s"
],
[
"en",
"t"
],
[
"s",
"t"
],
[
"Ġp",
"r"
],
[
"Ġ",
"a"
],
[
"i",
"o"
],
[
"t",
"s"
],
[
"Ġ",
"de"
],
[
"Ġde",
"co"
],
[
"l",
"i"
],
[
"w",
"or"
],
[
"wor",
"k"
],
[
"Ġ",
"f"
],
[
"n",
"e"
],
[
"Ġpr",
"o"
],
[
"Ġ",
"g"
],
[
"d",
"es"
],
[
"Ġ",
"e"
],
[
"e",
"r"
],
[
"p",
"ut"
],
[
"a",
"l"
],
[
"de",
"r"
],
[
"c",
"es"
],
[
"f",
"or"
],
[
"ra",
"ns"
],
[
"Ġt",
"rans"
],
[
"for",
"m"
],
[
"Ġtrans",
"form"
],
[
"Ġ",
"s"
],
[
"Ġ",
"o"
],
[
"z",
"es"
],
[
"io",
"ns"
],
[
"Ġ",
"en"
],
[
"Ġen",
"co"
],
[
"r",
"e"
],
[
"Ġdeco",
"des"
],
[
"a",
"b"
],
[
"b",
"ut"
],
[
"b",
"ab"
],
[
"i",
"li"
],
[
"i",
"but"
],
[
"r",
"ibut"
],
[
"t",
"y"
],
[
"Ġ",
"di"
],
[
"st",
"ribut"
],
[
"Ġpro",
"bab"
],
[
"ili",
"ty"
],
[
"Ġdi",
"stribut"
],
[
"Ġprobab",
"ility"
],
[
"Ġdistribut",
"ions"
],
[
"d",
"d"
],
[
"h",
"i"
],
[
"Ġ",
"st"
],
[
"Ġ",
"hi"
],
[
"dd",
"en"
],
[
"Ġst",
"ates"
],
[
"Ġhi",
"dden"
],
[
"en",
"er"
],
[
"Ġg",
"ener"
],
[
"Ġgener",
"ates"
],
[
"e",
"work"
],
[
"m",
"ework"
],
[
"ra",
"mework"
],
[
"Ġf",
"ramework"
],
[
"a",
"s"
],
[
"c",
"l"
],
[
"f",
"i"
],
[
"i",
"fi"
],
[
"s",
"ifi"
],
[
"Ġ",
"cl"
],
[
"as",
"sifi"
],
[
"Ġcl",
"assifi"
],
[
"Ġclassifi",
"es"
],
[
"b",
"e"
],
[
"d",
"di"
],
[
"g",
"s"
],
[
"m",
"be"
],
[
"n",
"gs"
],
[
"Ġe",
"mbe"
],
[
"ddi",
"ngs"
],
[
"Ġembe",
"ddings"
],
[
"t",
"work"
],
[
"Ġ",
"ne"
],
[
"Ġne",
"twork"
],
[
"e",
"li"
],
[
"i",
"p"
],
[
"Ġp",
"ip"
],
[
"eli",
"ne"
],
[
"Ġpip",
"eline"
],
[
"Ġdeco",
"der"
],
[
"c",
"t"
],
[
"e",
"at"
],
[
"e",
"ct"
],
[
"u",
"re"
],
[
"v",
"ect"
],
[
"Ġ",
"vect"
],
[
"or",
"s"
],
[
"Ġf",
"eat"
],
[
"Ġvect",
"ors"
],
[
"Ġfeat",
"ure"
],
[
"a",
"n"
],
[
"g",
"ent"
],
[
"Ġa",
"gent"
],
[
"m",
"o"
],
[
"Ġ",
"mo"
],
[
"de",
"l"
],
[
"Ġmo",
"del"
],
[
"i",
"n"
],
[
"k",
"en"
],
[
"o",
"ken"
],
[
"Ġ",
"in"
],
[
"Ġt",
"oken"
],
[
"Ġin",
"put"
],
[
"Ġtoken",
"s"
],
[
"g",
"i"
],
[
"l",
"o"
],
[
"Ġ",
"lo"
],
[
"ut",
"put"
],
[
"Ġo",
"utput"
],
[
"gi",
"ts"
],
[
"Ġlo",
"gits"
],
[
"u",
"ates"
],
[
"v",
"al"
],
[
"Ġe",
"val"
],
[
"Ġtransform",
"s"
],
[
"Ġeval",
"uates"
],
[
"s",
"es"
],
[
"Ġpro",
"ces"
],
[
"Ġproces",
"ses"
],
[
"d",
"ates"
],
[
"p",
"dates"
],
[
"u",
"pdates"
],
[
"Ġ",
"updates"
],
[
"ra",
"di"
],
[
"Ġg",
"radi"
],
[
"Ġgradi",
"ent"
],
[
"n",
"al"
],
[
"y",
"zes"
],
[
"Ġa",
"nal"
],
[
"Ġanal",
"yzes"
],
[
"e",
"q"
],
[
"e",
"x"
],
[
"u",
"en"
],
[
"Ġt",
"ex"
],
[
"Ġs",
"eq"
],
[
"uen",
"ces"
],
[
"Ġtex",
"t"
],
[
"Ġseq",
"uences"
],
[
"e",
"m"
],
[
"y",
"st"
],
[
"Ġs",
"yst"
],
[
"Ġsyst",
"em"
],
[
"Ġtransform",
"er"
],
[
"Ġenco",
"des"
],
[
"i",
"m"
],
[
"i",
"zes"
],
[
"p",
"t"
],
[
"Ġo",
"pt"
],
[
"im",
"izes"
],
[
"Ġopt",
"imizes"
],
[
"Ġenco",
"der"
],
[
"e",
"i"
],
[
"g",
"h"
],
[
"t",
"ent"
],
[
"w",
"ei"
],
[
"Ġ",
"at"
],
[
"Ġ",
"wei"
],
[
"io",
"n"
],
[
"gh",
"ts"
],
[
"tent",
"ion"
],
[
"Ġat",
"tention"
],
[
"Ġwei",
"ghts"
],
[
"g",
"or"
],
[
"i",
"th"
],
[
"l",
"gor"
],
[
"Ġa",
"lgor"
],
[
"ith",
"m"
],
[
"Ġalgor",
"ithm"
],
[
"c",
"ts"
],
[
"e",
"di"
],
[
"Ġpr",
"edi"
],
[
"Ġpredi",
"cts"
],
[
"p",
"r"
],
[
"Ġ",
"re"
],
[
"es",
"ent"
],
[
"at",
"ions"
],
[
"pr",
"esent"
],
[
"Ġre",
"present"
],
[
"Ġrepresent",
"ations"
]
]
}
}