{ "version": "1.0", "truncation": null, "padding": null, "added_tokens": [ { "id": 0, "content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 3, "content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": { "type": "Lowercase" }, "pre_tokenizer": { "type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true }, "post_processor": { "type": "TemplateProcessing", "single": [ { "SpecialToken": { "id": "<|endoftext|>", "type_id": 0 } }, { "Sequence": { "id": "A", "type_id": 0 } } ], "pair": [ { "Sequence": { "id": "A", "type_id": 0 } }, { "Sequence": { "id": "B", "type_id": 1 } } ], "special_tokens": { "<|endoftext|>": { "id": "<|endoftext|>", "ids": [ 1 ], "tokens": [ "<|endoftext|>" ] } } }, "decoder": { "type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true }, "model": { "type": "BPE", "dropout": null, "unk_token": "", "continuing_subword_prefix": null, "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, "ignore_merges": false, "vocab": { "": 0, "<|endoftext|>": 1, "": 2, "": 3, "!": 4, "\"": 5, "#": 6, "$": 7, "%": 8, "&": 9, "'": 10, "(": 11, ")": 12, "*": 13, "+": 14, ",": 15, "-": 16, ".": 17, "/": 18, "0": 19, "1": 20, "2": 21, "3": 22, "4": 23, "5": 24, "6": 25, "7": 26, "8": 27, "9": 28, ":": 29, ";": 30, "<": 31, "=": 32, ">": 33, "?": 34, "@": 35, "[": 36, "\\": 37, "]": 38, "^": 39, "_": 40, "`": 41, "a": 42, "b": 43, "c": 44, "d": 45, "e": 46, "f": 47, "g": 48, "h": 49, "i": 50, "j": 51, "k": 52, "l": 53, "m": 54, "n": 55, "o": 56, "p": 57, "q": 58, "r": 59, "s": 60, "t": 61, "u": 62, "v": 63, "w": 
64, "x": 65, "y": 66, "z": 67, "{": 68, "|": 69, "}": 70, "~": 71, "¡": 72, "¢": 73, "£": 74, "¤": 75, "¥": 76, "¦": 77, "§": 78, "¨": 79, "©": 80, "ª": 81, "«": 82, "¬": 83, "®": 84, "¯": 85, "°": 86, "±": 87, "²": 88, "³": 89, "´": 90, "µ": 91, "¶": 92, "·": 93, "¸": 94, "¹": 95, "º": 96, "»": 97, "¼": 98, "½": 99, "¾": 100, "¿": 101, "Â": 102, "Ã": 103, "Ä": 104, "Å": 105, "Æ": 106, "Ç": 107, "È": 108, "É": 109, "Ê": 110, "Ë": 111, "Ì": 112, "Î": 113, "Ï": 114, "Ð": 115, "Ñ": 116, "Ò": 117, "Ó": 118, "Õ": 119, "Ö": 120, "×": 121, "Ø": 122, "Ù": 123, "Û": 124, "Þ": 125, "à": 126, "á": 127, "â": 128, "ã": 129, "ä": 130, "å": 131, "æ": 132, "ç": 133, "è": 134, "é": 135, "ê": 136, "ë": 137, "ì": 138, "í": 139, "î": 140, "ï": 141, "ð": 142, "Ċ": 143, "Ġ": 144, "Ģ": 145, "ģ": 146, "Ĥ": 147, "ĥ": 148, "Ħ": 149, "ħ": 150, "Ĩ": 151, "ĩ": 152, "Ī": 153, "ī": 154, "Ĭ": 155, "ĭ": 156, "Į": 157, "į": 158, "İ": 159, "ı": 160, "Ĳ": 161, "ĳ": 162, "Ĵ": 163, "ĵ": 164, "Ķ": 165, "ķ": 166, "ĸ": 167, "Ĺ": 168, "ĺ": 169, "Ļ": 170, "ļ": 171, "Ľ": 172, "ľ": 173, "Ŀ": 174, "ŀ": 175, "Ł": 176, "ł": 177, "Ń": 178, "Ä±": 179, "Ġb": 180, "ar": 181, "Ã¼": 182, "er": 183, "ÅŁ": 184, "an": 185, "in": 186, "en": 187, "Ġk": 188, "Ã§": 189, "Ġs": 190, "Ġd": 191, "ir": 192, "ÄŁ": 193, "Ġy": 194, "Ä±n": 195, "Ġg": 196, "Ġo": 197, "il": 198, "ma": 199 }, "merges": [ [ "Ä", "±" ], [ "Ġ", "b" ], [ "a", "r" ], [ "Ã", "¼" ], [ "e", "r" ], [ "Å", "Ł" ], [ "a", "n" ], [ "i", "n" ], [ "e", "n" ], [ "Ġ", "k" ], [ "Ã", "§" ], [ "Ġ", "s" ], [ "Ġ", "d" ], [ "i", "r" ], [ "Ä", "Ł" ], [ "Ġ", "y" ], [ "Ä±", "n" ], [ "Ġ", "g" ], [ "Ġ", "o" ], [ "i", "l" ], [ "m", "a" ] ] } }