{ "version": "1.0", "truncation": null, "padding": null, "added_tokens": [ { "id": 0, "content": "[PAD]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 3, "content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 4, "content": "[MASK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": null, "pre_tokenizer": { "type": "ByteLevel", "add_prefix_space": false, "trim_offsets": true, "use_regex": true }, "post_processor": { "type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true }, "decoder": { "type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true }, "model": { "type": "BPE", "dropout": null, "unk_token": "[UNK]", "continuing_subword_prefix": null, "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, "ignore_merges": false, "vocab": { "[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4, ".": 5, "T": 6, "a": 7, "b": 8, "c": 9, "d": 10, "e": 11, "f": 12, "g": 13, "h": 14, "i": 15, "j": 16, "k": 17, "l": 18, "m": 19, "n": 20, "o": 21, "p": 22, "q": 23, "r": 24, "s": 25, "t": 26, "u": 27, "v": 28, "w": 29, "x": 30, "y": 31, "z": 32, "Ġ": 33, "Ġt": 34, "Th": 35, "er": 36, "fo": 37, "in": 38, "is": 39, "mp": 40, "Ġfo": 41, "ĠTh": 42, "az": 43, "ain": 44, "amp": 45, "br": 46, "ck": 47, "do": 48, "en": 49, "ex": 50, "he": 51, "iz": 52, "ick": 53, "ju": 54, "ken": 55, "le": 56, "laz": 57, "ov": 58, "ow": 59, "oken": 60, "qu": 61, "rain": 62, "samp": 63, "Ġa": 64, "Ġis": 65, "Ġbr": 66, "Ġdo": 67, "Ġju": 68, "Ġlaz": 69, "Ġov": 70, "Ġqu": 71, "Ġsamp": 72, "Ġtex": 73, "Ġthe": 74, "Ġtoken": 75, "Ġtrain": 76, "ing": 77, "mps": 78, "Ġfor": 79, "Ġfox": 80, "izer": 81, "own": 82, "Ġbrown": 83, "Ġdog": 84, "Ġjumps": 85, "Ġlazy": 86, "Ġover": 87, "Ġquick": 88, "Ġsample": 89, "Ġtext": 90, "Ġtokenizer": 91, "Ġtraining": 92, "ĠThe": 93, "ĠThis": 94 }, "merges": [ [ "Ġ", "t" ], [ "T", "h" ], [ "e", "r" ], [ "f", "o" ], [ "i", "n" ], [ "i", "s" ], [ "m", "p" ], [ "Ġ", "fo" ], [ "Ġ", "Th" ], [ "a", "z" ], [ "a", "in" ], [ "a", "mp" ], [ "b", "r" ], [ "c", "k" ], [ "d", "o" ], [ "e", "n" ], [ "e", "x" ], [ "h", "e" ], [ "i", "z" ], [ "i", "ck" ], [ "j", "u" ], [ "k", "en" ], [ "l", "e" ], [ "l", "az" ], [ "o", "v" ], [ "o", "w" ], [ "o", "ken" ], [ "q", "u" ], [ "r", "ain" ], [ "s", "amp" ], [ "Ġ", "a" ], [ "Ġ", "is" ], [ "Ġ", "br" ], [ "Ġ", "do" ], [ "Ġ", "ju" ], [ "Ġ", "laz" ], [ "Ġ", "ov" ], [ "Ġ", "qu" ], [ "Ġ", "samp" ], [ "Ġt", "ex" ], [ "Ġt", "he" ], [ "Ġt", "oken" ], [ "Ġt", "rain" ], [ "in", "g" ], [ "mp", "s" ], [ "Ġfo", "r" ], [ "Ġfo", "x" ], [ "iz", "er" ], [ "ow", "n" ], [ "Ġbr", "own" ], [ "Ġdo", "g" ], [ "Ġju", "mps" ], [ "Ġlaz", "y" ], [ "Ġov", "er" ], [ "Ġqu", "ick" ], [ "Ġsamp", "le" ], [ "Ġtex", "t" ], [ "Ġtoken", "izer" ], [ "Ġtrain", "ing" ], [ "ĠTh", "e" ], [ "ĠTh", "is" ] ] } }