{ "version": "1.0", "truncation": { "direction": "Right", "max_length": 1024, "strategy": "LongestFirst", "stride": 0 }, "padding": { "strategy": { "Fixed": 1024 }, "direction": "Right", "pad_to_multiple_of": null, "pad_id": 0, "pad_type_id": 0, "pad_token": "[PAD]" }, "added_tokens": [ { "id": 0, "content": "[PAD]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 3, "content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 4, "content": "[MASK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": { "type": "BertNormalizer", "clean_text": true, "handle_chinese_chars": true, "strip_accents": null, "lowercase": false }, "pre_tokenizer": { "type": "BertPreTokenizer" }, "post_processor": { "type": "TemplateProcessing", "single": [ { "SpecialToken": { "id": "[CLS]", "type_id": 0 } }, { "Sequence": { "id": "A", "type_id": 0 } }, { "SpecialToken": { "id": "[SEP]", "type_id": 0 } } ], "pair": [ { "SpecialToken": { "id": "[CLS]", "type_id": 0 } }, { "Sequence": { "id": "A", "type_id": 0 } }, { "SpecialToken": { "id": "[SEP]", "type_id": 0 } }, { "Sequence": { "id": "B", "type_id": 1 } }, { "SpecialToken": { "id": "[SEP]", "type_id": 1 } } ], "special_tokens": { "[CLS]": { "id": "[CLS]", "ids": [ 2 ], "tokens": [ "[CLS]" ] }, "[SEP]": { "id": "[SEP]", "ids": [ 3 ], "tokens": [ "[SEP]" ] } } }, "decoder": { "type": "WordPiece", "prefix": "##", "cleanup": true }, "model": { "type": "WordPiece", "unk_token": "[UNK]", "continuing_subword_prefix": "##", "max_input_chars_per_word": 100, "vocab": { "[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4, "A": 5, "T": 6, "C": 7, "G": 8, "N": 9, "AAA": 10, "AAT": 11, "AAC": 12, "AAG": 13, "ATA": 14, "ATT": 15, "ATC": 16, "ATG": 17, "ACA": 18, "ACT": 19, "ACC": 20, "ACG": 21, "AGA": 22, "AGT": 23, "AGC": 24, "AGG": 25, "TAA": 26, "TAT": 27, "TAC": 28, "TAG": 29, "TTA": 30, "TTT": 31, "TTC": 32, "TTG": 33, "TCA": 34, "TCT": 35, "TCC": 36, "TCG": 37, "TGA": 38, "TGT": 39, "TGC": 40, "TGG": 41, "CAA": 42, "CAT": 43, "CAC": 44, "CAG": 45, "CTA": 46, "CTT": 47, "CTC": 48, "CTG": 49, "CCA": 50, "CCT": 51, "CCC": 52, "CCG": 53, "CGA": 54, "CGT": 55, "CGC": 56, "CGG": 57, "GAA": 58, "GAT": 59, "GAC": 60, "GAG": 61, "GTA": 62, "GTT": 63, "GTC": 64, "GTG": 65, "GCA": 66, "GCT": 67, "GCC": 68, "GCG": 69, "GGA": 70, "GGT": 71, "GGC": 72, "GGG": 73 } } }