tiny_spe_tokenizer / tokenizer.json
kartikmosaicml's picture
Upload tokenizer
fe83f38
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
},
{
"id": 1,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
},
{
"id": 2,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Prepend",
"prepend": "▁"
},
{
"type": "Replace",
"pattern": {
"String": " "
},
"content": "▁"
}
]
},
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<s>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "<s>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 0
}
}
],
"special_tokens": {
"<s>": {
"id": "<s>",
"ids": [
1
],
"tokens": [
"<s>"
]
}
}
},
"decoder": {
"type": "Sequence",
"decoders": [
{
"type": "Replace",
"pattern": {
"String": "▁"
},
"content": " "
},
{
"type": "ByteFallback"
},
{
"type": "Fuse"
},
{
"type": "Strip",
"content": " ",
"start": 1,
"stop": 0
}
]
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": true,
"byte_fallback": true,
"vocab": {
"<unk>": 0,
"<s>": 1,
"</s>": 2,
"<pad>": 3,
"<n>": 4,
"<t>": 5,
"<0x00>": 6,
"<0x01>": 7,
"<0x02>": 8,
"<0x03>": 9,
"<0x04>": 10,
"<0x05>": 11,
"<0x06>": 12,
"<0x07>": 13,
"<0x08>": 14,
"<0x09>": 15,
"<0x0A>": 16,
"<0x0B>": 17,
"<0x0C>": 18,
"<0x0D>": 19,
"<0x0E>": 20,
"<0x0F>": 21,
"<0x10>": 22,
"<0x11>": 23,
"<0x12>": 24,
"<0x13>": 25,
"<0x14>": 26,
"<0x15>": 27,
"<0x16>": 28,
"<0x17>": 29,
"<0x18>": 30,
"<0x19>": 31,
"<0x1A>": 32,
"<0x1B>": 33,
"<0x1C>": 34,
"<0x1D>": 35,
"<0x1E>": 36,
"<0x1F>": 37,
"<0x20>": 38,
"<0x21>": 39,
"<0x22>": 40,
"<0x23>": 41,
"<0x24>": 42,
"<0x25>": 43,
"<0x26>": 44,
"<0x27>": 45,
"<0x28>": 46,
"<0x29>": 47,
"<0x2A>": 48,
"<0x2B>": 49,
"<0x2C>": 50,
"<0x2D>": 51,
"<0x2E>": 52,
"<0x2F>": 53,
"<0x30>": 54,
"<0x31>": 55,
"<0x32>": 56,
"<0x33>": 57,
"<0x34>": 58,
"<0x35>": 59,
"<0x36>": 60,
"<0x37>": 61,
"<0x38>": 62,
"<0x39>": 63,
"<0x3A>": 64,
"<0x3B>": 65,
"<0x3C>": 66,
"<0x3D>": 67,
"<0x3E>": 68,
"<0x3F>": 69,
"<0x40>": 70,
"<0x41>": 71,
"<0x42>": 72,
"<0x43>": 73,
"<0x44>": 74,
"<0x45>": 75,
"<0x46>": 76,
"<0x47>": 77,
"<0x48>": 78,
"<0x49>": 79,
"<0x4A>": 80,
"<0x4B>": 81,
"<0x4C>": 82,
"<0x4D>": 83,
"<0x4E>": 84,
"<0x4F>": 85,
"<0x50>": 86,
"<0x51>": 87,
"<0x52>": 88,
"<0x53>": 89,
"<0x54>": 90,
"<0x55>": 91,
"<0x56>": 92,
"<0x57>": 93,
"<0x58>": 94,
"<0x59>": 95,
"<0x5A>": 96,
"<0x5B>": 97,
"<0x5C>": 98,
"<0x5D>": 99,
"<0x5E>": 100,
"<0x5F>": 101,
"<0x60>": 102,
"<0x61>": 103,
"<0x62>": 104,
"<0x63>": 105,
"<0x64>": 106,
"<0x65>": 107,
"<0x66>": 108,
"<0x67>": 109,
"<0x68>": 110,
"<0x69>": 111,
"<0x6A>": 112,
"<0x6B>": 113,
"<0x6C>": 114,
"<0x6D>": 115,
"<0x6E>": 116,
"<0x6F>": 117,
"<0x70>": 118,
"<0x71>": 119,
"<0x72>": 120,
"<0x73>": 121,
"<0x74>": 122,
"<0x75>": 123,
"<0x76>": 124,
"<0x77>": 125,
"<0x78>": 126,
"<0x79>": 127,
"<0x7A>": 128,
"<0x7B>": 129,
"<0x7C>": 130,
"<0x7D>": 131,
"<0x7E>": 132,
"<0x7F>": 133,
"<0x80>": 134,
"<0x81>": 135,
"<0x82>": 136,
"<0x83>": 137,
"<0x84>": 138,
"<0x85>": 139,
"<0x86>": 140,
"<0x87>": 141,
"<0x88>": 142,
"<0x89>": 143,
"<0x8A>": 144,
"<0x8B>": 145,
"<0x8C>": 146,
"<0x8D>": 147,
"<0x8E>": 148,
"<0x8F>": 149,
"<0x90>": 150,
"<0x91>": 151,
"<0x92>": 152,
"<0x93>": 153,
"<0x94>": 154,
"<0x95>": 155,
"<0x96>": 156,
"<0x97>": 157,
"<0x98>": 158,
"<0x99>": 159,
"<0x9A>": 160,
"<0x9B>": 161,
"<0x9C>": 162,
"<0x9D>": 163,
"<0x9E>": 164,
"<0x9F>": 165,
"<0xA0>": 166,
"<0xA1>": 167,
"<0xA2>": 168,
"<0xA3>": 169,
"<0xA4>": 170,
"<0xA5>": 171,
"<0xA6>": 172,
"<0xA7>": 173,
"<0xA8>": 174,
"<0xA9>": 175,
"<0xAA>": 176,
"<0xAB>": 177,
"<0xAC>": 178,
"<0xAD>": 179,
"<0xAE>": 180,
"<0xAF>": 181,
"<0xB0>": 182,
"<0xB1>": 183,
"<0xB2>": 184,
"<0xB3>": 185,
"<0xB4>": 186,
"<0xB5>": 187,
"<0xB6>": 188,
"<0xB7>": 189,
"<0xB8>": 190,
"<0xB9>": 191,
"<0xBA>": 192,
"<0xBB>": 193,
"<0xBC>": 194,
"<0xBD>": 195,
"<0xBE>": 196,
"<0xBF>": 197,
"<0xC0>": 198,
"<0xC1>": 199,
"<0xC2>": 200,
"<0xC3>": 201,
"<0xC4>": 202,
"<0xC5>": 203,
"<0xC6>": 204,
"<0xC7>": 205,
"<0xC8>": 206,
"<0xC9>": 207,
"<0xCA>": 208,
"<0xCB>": 209,
"<0xCC>": 210,
"<0xCD>": 211,
"<0xCE>": 212,
"<0xCF>": 213,
"<0xD0>": 214,
"<0xD1>": 215,
"<0xD2>": 216,
"<0xD3>": 217,
"<0xD4>": 218,
"<0xD5>": 219,
"<0xD6>": 220,
"<0xD7>": 221,
"<0xD8>": 222,
"<0xD9>": 223,
"<0xDA>": 224,
"<0xDB>": 225,
"<0xDC>": 226,
"<0xDD>": 227,
"<0xDE>": 228,
"<0xDF>": 229,
"<0xE0>": 230,
"<0xE1>": 231,
"<0xE2>": 232,
"<0xE3>": 233,
"<0xE4>": 234,
"<0xE5>": 235,
"<0xE6>": 236,
"<0xE7>": 237,
"<0xE8>": 238,
"<0xE9>": 239,
"<0xEA>": 240,
"<0xEB>": 241,
"<0xEC>": 242,
"<0xED>": 243,
"<0xEE>": 244,
"<0xEF>": 245,
"<0xF0>": 246,
"<0xF1>": 247,
"<0xF2>": 248,
"<0xF3>": 249,
"<0xF4>": 250,
"<0xF5>": 251,
"<0xF6>": 252,
"<0xF7>": 253,
"<0xF8>": 254,
"<0xF9>": 255,
"<0xFA>": 256,
"<0xFB>": 257,
"<0xFC>": 258,
"<0xFD>": 259,
"<0xFE>": 260,
"<0xFF>": 261,
"▁": 262,
"e": 263,
"t": 264,
"a": 265,
"o": 266,
"i": 267,
"n": 268,
"r": 269,
"s": 270,
"h": 271,
"l": 272,
"d": 273,
"c": 274,
"u": 275,
"m": 276,
"p": 277,
"f": 278,
"g": 279,
"y": 280,
"w": 281,
"b": 282,
".": 283,
"v": 284,
",": 285,
"k": 286,
"T": 287,
"I": 288,
"S": 289,
"A": 290,
"C": 291,
"0": 292,
"-": 293,
"x": 294,
"1": 295,
"M": 296,
"P": 297,
"W": 298,
"B": 299,
"D": 300,
"2": 301,
"E": 302,
"’": 303,
"H": 304,
"R": 305,
"F": 306,
"L": 307,
"O": 308,
"N": 309,
"j": 310,
"'": 311,
"G": 312,
"z": 313,
"q": 314,
")": 315,
"5": 316,
"(": 317,
"3": 318,
"4": 319,
"U": 320,
"!": 321,
"\"": 322,
"9": 323,
"J": 324,
":": 325,
"?": 326,
"8": 327,
"V": 328,
"K": 329,
"Y": 330,
"6": 331,
"7": 332,
"/": 333,
"”": 334,
"β€œ": 335,
"–": 336,
";": 337,
"&": 338,
"%": 339,
"X": 340,
"Q": 341,
"$": 342,
"Z": 343,
"β€”": 344,
"…": 345,
"β€˜": 346,
"*": 347,
"]": 348,
"_": 349
},
"merges": []
}
}