diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,59451 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": null, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "": 6, + "": 7, + "!": 8, + "\"": 9, + "#": 10, + "$": 11, + "%": 12, + "&": 13, + "'": 14, + "(": 15, + ")": 16, + "*": 17, + "+": 18, + ",": 19, + "-": 20, + ".": 21, + "/": 22, + "0": 23, + "1": 24, + "2": 25, + "3": 26, + "4": 27, + "5": 28, + "6": 29, + "7": 30, + "8": 31, + "9": 32, + ":": 33, + ";": 34, + "<": 35, + "=": 36, + ">": 37, + "?": 38, + "A": 39, + "B": 40, + "C": 41, + "D": 42, + "E": 43, + "F": 44, + "G": 45, + "H": 46, + "I": 47, + "J": 48, + "K": 49, + "L": 50, + "M": 51, + "N": 52, + "O": 53, + "P": 54, + "Q": 55, + "R": 56, + "S": 57, + "T": 58, + "U": 59, + "V": 60, + "W": 61, + "X": 62, + "Y": 63, + "Z": 64, + "[": 65, + "\\": 66, + "]": 67, + "_": 68, + "a": 69, + "b": 70, + "c": 71, + "d": 72, + "e": 73, + "f": 74, + "g": 75, + "h": 76, + "i": 77, + "j": 78, + "k": 79, + "l": 80, + "m": 81, + "n": 82, + "o": 83, + "p": 84, + "q": 85, + "r": 86, + "s": 87, + "t": 88, + "u": 89, + "v": 90, + "w": 91, + "x": 92, + "y": 93, + "z": 94, + "{": 95, + "|": 96, + "}": 97, + "~": 98, + "ª": 99, + "°": 100, + "³": 101, + "º": 102, + "À": 103, + "Á": 104, + "Ã": 105, + "Ç": 106, + "É": 107, + "Í": 108, + "Ó": 109, + "×": 110, + "ß": 111, + "à": 112, + "á": 113, + "â": 114, + "ã": 115, + "ç": 116, + "è": 117, + "é": 118, + "ê": 119, + "ì": 120, + "í": 121, + "ñ": 122, + "ò": 123, + "ó": 124, + "ô": 125, + "õ": 126, + "ö": 127, + "ø": 128, + "ú": 129, + "ü": 130, + "ă": 131, + "ć": 132, + "Č": 133, + "č": 134, + "ě": 135, + "ī": 136, + "ō": 137, + "ś": 138, + "ş": 139, + "š": 140, + "ž": 141, + "Φ": 142, + "έ": 143, + "α": 144, + "θ": 145, + "ν": 146, + "ω": 147, + "​": 148, + "–": 149, + "—": 150, + "’": 151, + "“": 152, + "”": 153, + "∞": 154, + "≈": 155, + "✅": 156, + "❌": 157, + "岁": 158, + "星": 159, + "木": 160, + "歲": 161, + "te": 162, + "ER": 163, + "de": 164, + "ão": 165, + "is": 166, + "ra": 167, + "ca": 168, + "