{ "version": "1.0", "truncation": null, "padding": null, "added_tokens": [ { "id": 0, "content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": { "type": "NFC" }, "pre_tokenizer": { "type": "Sequence", "pretokenizers": [ { "type": "Split", "pattern": { "Regex": "[+=]|[^\\S\\r\\n]*[\\n\\r]+|[^\\S\\r\\n]+" }, "behavior": "Isolated", "invert": false }, { "type": "Split", "pattern": { "Regex": "\\p{N}{1,3}" }, "behavior": "Isolated", "invert": false }, { "type": "ByteLevel", "add_prefix_space": false, "trim_offsets": true, "use_regex": false } ] }, "post_processor": null, "decoder": { "type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true }, "model": { "type": "BPE", "dropout": null, "unk_token": null, "continuing_subword_prefix": null, "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, "ignore_merges": false, "vocab": { "": 0, "": 1, "": 2, "!": 3, "\"": 4, "#": 5, "$": 6, "%": 7, "&": 8, "'": 9, "(": 10, ")": 11, "*": 12, "+": 13, ",": 14, "-": 15, ".": 16, "/": 17, "0": 18, "1": 19, "2": 20, "3": 21, "4": 22, "5": 23, "6": 24, "7": 25, "8": 26, "9": 27, ":": 28, ";": 29, "<": 30, "=": 31, ">": 32, "?": 33, "@": 34, "A": 35, "B": 36, "C": 37, "D": 38, "E": 39, "F": 40, "G": 41, "H": 42, "I": 43, "J": 44, "K": 45, "L": 46, "M": 47, "N": 48, "O": 49, "P": 50, "Q": 51, "R": 52, "S": 53, "T": 54, "U": 55, "V": 56, "W": 57, "X": 58, "Y": 59, "Z": 60, "[": 61, "\\": 62, "]": 63, "^": 64, "_": 65, "`": 66, "a": 67, "b": 68, "c": 69, "d": 70, "e": 71, "f": 72, "g": 73, "h": 74, "i": 75, "j": 76, "k": 77, "l": 78, "m": 79, "n": 80, "o": 81, "p": 82, "q": 83, "r": 84, "s": 85, "t": 86, "u": 87, "v": 88, "w": 89, "x": 
90, "y": 91, "z": 92, "{": 93, "|": 94, "}": 95, "~": 96, "¡": 97, "¢": 98, "£": 99, "¤": 100, "¥": 101, "¦": 102, "§": 103, "¨": 104, "©": 105, "ª": 106, "«": 107, "¬": 108, "®": 109, "¯": 110, "°": 111, "±": 112, "²": 113, "³": 114, "´": 115, "µ": 116, "¶": 117, "·": 118, "¸": 119, "¹": 120, "º": 121, "»": 122, "¼": 123, "½": 124, "¾": 125, "¿": 126, "À": 127, "Á": 128, "Â": 129, "Ã": 130, "Ä": 131, "Å": 132, "Æ": 133, "Ç": 134, "È": 135, "É": 136, "Ê": 137, "Ë": 138, "Ì": 139, "Í": 140, "Î": 141, "Ï": 142, "Ð": 143, "Ñ": 144, "Ò": 145, "Ó": 146, "Ô": 147, "Õ": 148, "Ö": 149, "×": 150, "Ø": 151, "Ù": 152, "Ú": 153, "Û": 154, "Ü": 155, "Ý": 156, "Þ": 157, "ß": 158, "à": 159, "á": 160, "â": 161, "ã": 162, "ä": 163, "å": 164, "æ": 165, "ç": 166, "è": 167, "é": 168, "ê": 169, "ë": 170, "ì": 171, "í": 172, "î": 173, "ï": 174, "ð": 175, "ñ": 176, "ò": 177, "ó": 178, "ô": 179, "õ": 180, "ö": 181, "÷": 182, "ø": 183, "ù": 184, "ú": 185, "û": 186, "ü": 187, "ý": 188, "þ": 189, "ÿ": 190, "Ā": 191, "ā": 192, "Ă": 193, "ă": 194, "Ą": 195, "ą": 196, "Ć": 197, "ć": 198, "Ĉ": 199, "ĉ": 200, "Ċ": 201, "ċ": 202, "Č": 203, "č": 204, "Ď": 205, "ď": 206, "Đ": 207, "đ": 208, "Ē": 209, "ē": 210, "Ĕ": 211, "ĕ": 212, "Ė": 213, "ė": 214, "Ę": 215, "ę": 216, "Ě": 217, "ě": 218, "Ĝ": 219, "ĝ": 220, "Ğ": 221, "ğ": 222, "Ġ": 223, "ġ": 224, "Ģ": 225, "ģ": 226, "Ĥ": 227, "ĥ": 228, "Ħ": 229, "ħ": 230, "Ĩ": 231, "ĩ": 232, "Ī": 233, "ī": 234, "Ĭ": 235, "ĭ": 236, "Į": 237, "į": 238, "İ": 239, "ı": 240, "Ĳ": 241, "ĳ": 242, "Ĵ": 243, "ĵ": 244, "Ķ": 245, "ķ": 246, "ĸ": 247, "Ĺ": 248, "ĺ": 249, "Ļ": 250, "ļ": 251, "Ľ": 252, "ľ": 253, "Ŀ": 254, "ŀ": 255, "Ł": 256, "ł": 257, "Ń": 258, "nt": 259, "ent": 260, "cent": 261, "tr": 262, "qu": 263, "e-": 264, "in": 265, "qua": 266, "ix": 267, "eu": 268, "quatr": 269, "nte-": 270, "gt": 271, "vin": 272, "vingt": 273, "cin": 274, "vingt-": 275, "ante-": 276, "se": 277, "hu": 278, "it": 279, "neu": 280, "oix": 281, "pt": 282, "soix": 283, "quatre-": 284, 
"sept": 285, "huit": 286, "neuf": 287, "quatre-vingt-": 288, "ze": 289, "deu": 290, "is": 291, "ois": 292, "six": 293, "trois": 294, "quatre": 295, "cinq": 296, "soixante-": 297, "deux": 298, "il": 299, "le": 300, "mil": 301, "mille": 302, "dix": 303, "trent": 304, "quar": 305, "cinqua": 306, "dix-": 307, "trente-": 308, "quarante-": 309, "cinquante-": 310, "un": 311, "nte": 312, "et": 313, "ize": 314, "ante": 315, "do": 316, "eize": 317, "nze": 318, "or": 319, "onze": 320, "tor": 321, "uze": 322, "treize": 323, "quin": 324, "quator": 325, "seize": 326, "soixante": 327, "quatre-vingt-dix-": 328, "soixante-dix-": 329, "douze": 330, "quinze": 331, "quatorze": 332, "trente": 333, "quarante": 334, "cinquante": 335, "vingts": 336, "vingt-sept": 337, "vingt-huit": 338, "vingt-neuf": 339, "vingt-six": 340, "vingt-trois": 341, "vingt-quatre": 342, "vingt-cinq": 343, "vingt-deux": 344, "quatre-vingts": 345, "quatre-vingt-sept": 346, "quatre-vingt-huit": 347, "quatre-vingt-neuf": 348, "quatre-vingt-six": 349, "quatre-vingt-trois": 350, "quatre-vingt-quatre": 351, "quatre-vingt-cinq": 352, "quatre-vingt-deux": 353, "quatre-vingt-dix": 354, "quatre-vingt-un": 355, "quatre-vingt-onze": 356, "quatre-vingt-treize": 357, "quatre-vingt-seize": 358, "quatre-vingt-douze": 359, "quatre-vingt-quinze": 360, "quatre-vingt-quatorze": 361, "soixante-sept": 362, "soixante-huit": 363, "soixante-neuf": 364, "soixante-six": 365, "soixante-trois": 366, "soixante-quatre": 367, "soixante-cinq": 368, "soixante-deux": 369, "soixante-dix": 370, "soixante-treize": 371, "soixante-seize": 372, "soixante-douze": 373, "soixante-quinze": 374, "soixante-quatorze": 375, "dix-sept": 376, "dix-huit": 377, "dix-neuf": 378, "trente-sept": 379, "trente-huit": 380, "trente-neuf": 381, "trente-six": 382, "trente-trois": 383, "trente-quatre": 384, "trente-cinq": 385, "trente-deux": 386, "quarante-sept": 387, "quarante-huit": 388, "quarante-neuf": 389, "quarante-six": 390, "quarante-trois": 391, "quarante-quatre": 
392, "quarante-cinq": 393, "quarante-deux": 394, "cinquante-sept": 395, "cinquante-huit": 396, "cinquante-neuf": 397, "cinquante-six": 398, "cinquante-trois": 399, "cinquante-quatre": 400, "cinquante-cinq": 401, "cinquante-deux": 402, "quatre-vingt-dix-sept": 403, "quatre-vingt-dix-huit": 404, "quatre-vingt-dix-neuf": 405, "soixante-dix-sept": 406, "soixante-dix-huit": 407, "soixante-dix-neuf": 408, "cents": 409, "ro": 410, "zÃ": 411, "©ro": 412, "zÃ©ro": 413 }, "merges": [ [ "n", "t" ], [ "e", "nt" ], [ "c", "ent" ], [ "t", "r" ], [ "q", "u" ], [ "e", "-" ], [ "i", "n" ], [ "qu", "a" ], [ "i", "x" ], [ "e", "u" ], [ "qua", "tr" ], [ "nt", "e-" ], [ "g", "t" ], [ "v", "in" ], [ "vin", "gt" ], [ "c", "in" ], [ "vingt", "-" ], [ "a", "nte-" ], [ "s", "e" ], [ "h", "u" ], [ "i", "t" ], [ "n", "eu" ], [ "o", "ix" ], [ "p", "t" ], [ "s", "oix" ], [ "quatr", "e-" ], [ "se", "pt" ], [ "hu", "it" ], [ "neu", "f" ], [ "quatre-", "vingt-" ], [ "z", "e" ], [ "d", "eu" ], [ "i", "s" ], [ "o", "is" ], [ "s", "ix" ], [ "tr", "ois" ], [ "quatr", "e" ], [ "cin", "q" ], [ "soix", "ante-" ], [ "deu", "x" ], [ "i", "l" ], [ "l", "e" ], [ "m", "il" ], [ "mil", "le" ], [ "d", "ix" ], [ "tr", "ent" ], [ "qua", "r" ], [ "cin", "qua" ], [ "dix", "-" ], [ "trent", "e-" ], [ "quar", "ante-" ], [ "cinqua", "nte-" ], [ "u", "n" ], [ "nt", "e" ], [ "e", "t" ], [ "i", "ze" ], [ "a", "nte" ], [ "d", "o" ], [ "e", "ize" ], [ "n", "ze" ], [ "o", "r" ], [ "o", "nze" ], [ "t", "or" ], [ "u", "ze" ], [ "tr", "eize" ], [ "qu", "in" ], [ "qua", "tor" ], [ "se", "ize" ], [ "soix", "ante" ], [ "quatre-vingt-", "dix-" ], [ "soixante-", "dix-" ], [ "do", "uze" ], [ "quin", "ze" ], [ "quator", "ze" ], [ "trent", "e" ], [ "quar", "ante" ], [ "cinqua", "nte" ], [ "vingt", "s" ], [ "vingt-", "sept" ], [ "vingt-", "huit" ], [ "vingt-", "neuf" ], [ "vingt-", "six" ], [ "vingt-", "trois" ], [ "vingt-", "quatre" ], [ "vingt-", "cinq" ], [ "vingt-", "deux" ], [ "quatre-", "vingts" ], [ "quatre-vingt-", "sept" ], [ 
"quatre-vingt-", "huit" ], [ "quatre-vingt-", "neuf" ], [ "quatre-vingt-", "six" ], [ "quatre-vingt-", "trois" ], [ "quatre-vingt-", "quatre" ], [ "quatre-vingt-", "cinq" ], [ "quatre-vingt-", "deux" ], [ "quatre-vingt-", "dix" ], [ "quatre-vingt-", "un" ], [ "quatre-vingt-", "onze" ], [ "quatre-vingt-", "treize" ], [ "quatre-vingt-", "seize" ], [ "quatre-vingt-", "douze" ], [ "quatre-vingt-", "quinze" ], [ "quatre-vingt-", "quatorze" ], [ "soixante-", "sept" ], [ "soixante-", "huit" ], [ "soixante-", "neuf" ], [ "soixante-", "six" ], [ "soixante-", "trois" ], [ "soixante-", "quatre" ], [ "soixante-", "cinq" ], [ "soixante-", "deux" ], [ "soixante-", "dix" ], [ "soixante-", "treize" ], [ "soixante-", "seize" ], [ "soixante-", "douze" ], [ "soixante-", "quinze" ], [ "soixante-", "quatorze" ], [ "dix-", "sept" ], [ "dix-", "huit" ], [ "dix-", "neuf" ], [ "trente-", "sept" ], [ "trente-", "huit" ], [ "trente-", "neuf" ], [ "trente-", "six" ], [ "trente-", "trois" ], [ "trente-", "quatre" ], [ "trente-", "cinq" ], [ "trente-", "deux" ], [ "quarante-", "sept" ], [ "quarante-", "huit" ], [ "quarante-", "neuf" ], [ "quarante-", "six" ], [ "quarante-", "trois" ], [ "quarante-", "quatre" ], [ "quarante-", "cinq" ], [ "quarante-", "deux" ], [ "cinquante-", "sept" ], [ "cinquante-", "huit" ], [ "cinquante-", "neuf" ], [ "cinquante-", "six" ], [ "cinquante-", "trois" ], [ "cinquante-", "quatre" ], [ "cinquante-", "cinq" ], [ "cinquante-", "deux" ], [ "quatre-vingt-dix-", "sept" ], [ "quatre-vingt-dix-", "huit" ], [ "quatre-vingt-dix-", "neuf" ], [ "soixante-dix-", "sept" ], [ "soixante-dix-", "huit" ], [ "soixante-dix-", "neuf" ], [ "cent", "s" ], [ "r", "o" ], [ "z", "Ã" ], [ "©", "ro" ], [ "zÃ", "©ro" ] ] } }