SpecTUS_pretrained_only / tokenizer.json
hajekad's picture
Upload folder using huggingface_hub (#1)
b333ffd verified
raw
history blame
7.08 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "<neims>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "<nist>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "<rassp>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 7,
"content": "<trafo>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 8,
"content": "<source1>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 9,
"content": "<source2>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 10,
"content": "<source3>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "NFKC"
}
]
},
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"post_processor": null,
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<eos>": 0,
"<unk>": 1,
"<pad>": 2,
"<bos>": 3,
"<neims>": 4,
"<nist>": 5,
"<rassp>": 6,
"<trafo>": 7,
"<source1>": 8,
"<source2>": 9,
"<source3>": 10,
"!": 11,
"\"": 12,
"#": 13,
"$": 14,
"%": 15,
"&": 16,
"'": 17,
"(": 18,
")": 19,
"*": 20,
"+": 21,
",": 22,
"-": 23,
".": 24,
"/": 25,
"0": 26,
"1": 27,
"2": 28,
"3": 29,
"4": 30,
"5": 31,
"6": 32,
"7": 33,
"8": 34,
"9": 35,
":": 36,
";": 37,
"<": 38,
"=": 39,
">": 40,
"?": 41,
"@": 42,
"A": 43,
"B": 44,
"C": 45,
"D": 46,
"E": 47,
"F": 48,
"G": 49,
"H": 50,
"I": 51,
"J": 52,
"K": 53,
"L": 54,
"M": 55,
"N": 56,
"O": 57,
"P": 58,
"Q": 59,
"R": 60,
"S": 61,
"T": 62,
"U": 63,
"V": 64,
"W": 65,
"X": 66,
"Y": 67,
"Z": 68,
"[": 69,
"\\": 70,
"]": 71,
"^": 72,
"_": 73,
"`": 74,
"a": 75,
"b": 76,
"c": 77,
"d": 78,
"e": 79,
"f": 80,
"g": 81,
"h": 82,
"i": 83,
"j": 84,
"k": 85,
"l": 86,
"m": 87,
"n": 88,
"o": 89,
"p": 90,
"q": 91,
"r": 92,
"s": 93,
"t": 94,
"u": 95,
"v": 96,
"w": 97,
"x": 98,
"y": 99,
"z": 100,
"{": 101,
"|": 102,
"}": 103,
"~": 104,
"¡": 105,
"¢": 106,
"£": 107,
"¤": 108,
"¥": 109,
"¦": 110,
"§": 111,
"¨": 112,
"©": 113,
"ª": 114,
"«": 115,
"¬": 116,
"®": 117,
"¯": 118,
"°": 119,
"±": 120,
"²": 121,
"³": 122,
"´": 123,
"µ": 124,
"¶": 125,
"·": 126,
"¸": 127,
"¹": 128,
"º": 129,
"»": 130,
"¼": 131,
"½": 132,
"¾": 133,
"¿": 134,
"À": 135,
"Á": 136,
"Â": 137,
"Ã": 138,
"Ä": 139,
"Å": 140,
"Æ": 141,
"Ç": 142,
"È": 143,
"É": 144,
"Ê": 145,
"Ë": 146,
"Ì": 147,
"Í": 148,
"Î": 149,
"Ï": 150,
"Ð": 151,
"Ñ": 152,
"Ò": 153,
"Ó": 154,
"Ô": 155,
"Õ": 156,
"Ö": 157,
"×": 158,
"Ø": 159,
"Ù": 160,
"Ú": 161,
"Û": 162,
"Ü": 163,
"Ý": 164,
"Þ": 165,
"ß": 166,
"à": 167,
"á": 168,
"â": 169,
"ã": 170,
"ä": 171,
"å": 172,
"æ": 173,
"ç": 174,
"è": 175,
"é": 176,
"ê": 177,
"ë": 178,
"ì": 179,
"í": 180,
"î": 181,
"ï": 182,
"ð": 183,
"ñ": 184,
"ò": 185,
"ó": 186,
"ô": 187,
"õ": 188,
"ö": 189,
"÷": 190,
"ø": 191,
"ù": 192,
"ú": 193,
"û": 194,
"ü": 195,
"ý": 196,
"þ": 197,
"ÿ": 198,
"Ā": 199,
"ā": 200,
"Ă": 201,
"ă": 202,
"Ą": 203,
"ą": 204,
"Ć": 205,
"ć": 206,
"Ĉ": 207,
"ĉ": 208,
"Ċ": 209,
"ċ": 210,
"Č": 211,
"č": 212,
"Ď": 213,
"ď": 214,
"Đ": 215,
"đ": 216,
"Ē": 217,
"ē": 218,
"Ĕ": 219,
"ĕ": 220,
"Ė": 221,
"ė": 222,
"Ę": 223,
"ę": 224,
"Ě": 225,
"ě": 226,
"Ĝ": 227,
"ĝ": 228,
"Ğ": 229,
"ğ": 230,
"Ġ": 231,
"ġ": 232,
"Ģ": 233,
"ģ": 234,
"Ĥ": 235,
"ĥ": 236,
"Ħ": 237,
"ħ": 238,
"Ĩ": 239,
"ĩ": 240,
"Ī": 241,
"ī": 242,
"Ĭ": 243,
"ĭ": 244,
"Į": 245,
"į": 246,
"İ": 247,
"ı": 248,
"IJ": 249,
"ij": 250,
"Ĵ": 251,
"ĵ": 252,
"Ķ": 253,
"ķ": 254,
"ĸ": 255,
"Ĺ": 256,
"ĺ": 257,
"Ļ": 258,
"ļ": 259,
"Ľ": 260,
"ľ": 261,
"Ŀ": 262,
"ŀ": 263,
"Ł": 264,
"ł": 265,
"Ń": 266
},
"merges": []
}
}