diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,19516 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "!": 4, + "\"": 5, + "#": 6, + "$": 7, + "%": 8, + "&": 9, + "'": 10, + "(": 11, + ")": 12, + "*": 13, + "+": 14, + ",": 15, + "-": 16, + ".": 17, + "/": 18, + "0": 19, + "1": 20, + "2": 21, + "3": 22, + "4": 23, + "5": 24, + "6": 25, + "7": 26, + "8": 27, + "9": 28, + ":": 29, + ";": 30, + "<": 31, + "=": 32, + ">": 33, + "?": 34, + "@": 35, + "A": 36, + "B": 37, + "C": 38, + "D": 39, + "E": 40, + "F": 41, + "G": 42, + "H": 43, + "I": 44, + "J": 45, + "K": 46, + "L": 47, + "M": 48, + "N": 49, + "O": 50, + "P": 51, + "Q": 52, + "R": 53, + "S": 54, + "T": 55, + "U": 56, + "V": 57, + "W": 58, + "X": 59, + "Y": 60, + "Z": 61, + "[": 62, + "\\": 63, + "]": 64, + "^": 65, + "_": 66, + "`": 67, + "a": 68, + "b": 69, + "c": 70, + "d": 71, + "e": 72, + "f": 73, + "g": 74, + "h": 75, + "i": 76, + "j": 77, + "k": 78, + "l": 79, + "m": 80, + "n": 81, + "o": 82, + "p": 83, + "q": 84, + "r": 85, + "s": 86, + "t": 87, + "u": 88, + "v": 89, + "w": 90, + "x": 91, + "y": 92, + "z": 93, + "{": 94, + "|": 95, + "}": 96, + "~": 97, + "¡": 98, + "¢": 99, + "£": 100, + "¤": 101, + "¥": 102, + "¦": 103, + "§": 104, + "¨": 105, + "©": 106, + "ª": 107, + "«": 108, + "¬": 109, + "®": 110, + "¯": 111, + "°": 112, + "±": 113, + "²": 114, + "³": 115, + "´": 116, + "µ": 117, + "¶": 118, + "·": 119, + "¸": 120, + "¹": 121, + "º": 122, + "»": 123, + "¼": 124, + "½": 125, + "¾": 126, + "¿": 127, + "À": 128, + "Á": 129, + "Â": 130, + "Ã": 131, + "Ä": 132, + "Å": 133, + "Æ": 134, + "Ç": 135, + "È": 136, + "É": 137, + "Ê": 138, + "Ë": 139, + "Ì": 140, + "Í": 141, + "Î": 142, + "Ï": 143, + "Ð": 144, + "Ñ": 145, + "Ò": 146, + "Ó": 147, + "Ô": 148, + "Õ": 149, + "Ö": 150, + "×": 151, + "Ø": 152, + "Ù": 153, + "Ú": 154, + "Û": 155, + "Ü": 156, + "Ý": 157, + "Þ": 158, + "ß": 159, + "à": 160, + "á": 161, + "â": 162, + "ã": 163, + "ä": 164, + "å": 165, + "æ": 166, + "ç": 167, + "è": 168, + "é": 169, + "ê": 170, + "ë": 171, + "ì": 172, + "í": 173, + "î": 174, + "ï": 175, + "ð": 176, + "ñ": 177, + "ò": 178, + "ó": 179, + "ô": 180, + "õ": 181, + "ö": 182, + "÷": 183, + "ø": 184, + "ù": 185, + "ú": 186, + "û": 187, + "ü": 188, + "ý": 189, + "þ": 190, + "ÿ": 191, + "Ā": 192, + "ā": 193, + "Ă": 194, + "ă": 195, + "Ą": 196, + "ą": 197, + "Ć": 198, + "ć": 199, + "Ĉ": 200, + "ĉ": 201, + "Ċ": 202, + "ċ": 203, + "Č": 204, + "č": 205, + "Ď": 206, + "ď": 207, + "Đ": 208, + "đ": 209, + "Ē": 210, + "ē": 211, + "Ĕ": 212, + "ĕ": 213, + "Ė": 214, + "ė": 215, + "Ę": 216, + "ę": 217, + "Ě": 218, + "ě": 219, + "Ĝ": 220, + "ĝ": 221, + "Ğ": 222, + "ğ": 223, + "Ġ": 224, + "ġ": 225, + "Ģ": 226, + "ģ": 227, + "Ĥ": 228, + "ĥ": 229, + "Ħ": 230, + "ħ": 231, + "Ĩ": 232, + "ĩ": 233, + "Ī": 234, + "ī": 235, + "Ĭ": 236, + "ĭ": 237, + "Į": 238, + "į": 239, + "İ": 240, + "ı": 241, + "IJ": 242, + "ij": 243, + "Ĵ": 244, + "ĵ": 245, + "Ķ": 246, + "ķ": 247, + "ĸ": 248, + "Ĺ": 249, + "ĺ": 250, + "Ļ": 251, + "ļ": 252, + "Ľ": 253, + "ľ": 254, + "Ŀ": 255, + "ŀ": 256, + "Ł": 257, + "ł": 258, + "Ń": 259, + "Ġs": 260, + "Ġa": 261, + "me": 262, + "li": 263, + "ll": 264, + "It": 265, + "nd": 266, + "