{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": false
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<|endoftext|>": 0,
"[PAD]": 1,
"A": 2,
"C": 3,
"G": 4,
"T": 5,
"Ċ": 6,
"TT": 7,
"AA": 8,
"TG": 9,
"AG": 10,
"CC": 11,
"TC": 12,
"AC": 13,
"GG": 14,
"ATT": 15,
"AT": 16,
"ATG": 17,
"GC": 18,
"TAA": 19,
"TCC": 20,
"ACC": 21,
"AAAA": 22,
"AGG": 23,
"AGC": 24,
"ATC": 25,
"TTC": 26,
"AAG": 27,
"TTTT": 28,
"TGC": 29,
"TGG": 30,
"AAC": 31,
"TTG": 32,
"TAG": 33,
"TAC": 34,
"CCC": 35,
"TATT": 36,
"TGGG": 37,
"AGAA": 38,
"TAT": 39,
"AGGG": 40,
"TTTC": 41,
"AGGC": 42,
"AGCC": 43,
"TGTG": 44,
"ATAA": 45,
"ATTC": 46,
"TTGG": 47,
"ACAC": 48,
"AAGG": 49,
"TCTC": 50,
"TCCC": 51,
"TATG": 52,
"TTTG": 53,
"TTCC": 54,
"AGAG": 55,
"AAAC": 56,
"ATGG": 57,
"AGTG": 58,
"ACCC": 59,
"AGAC": 60,
"TGCC": 61,
"ATTG": 62,
"ATCC": 63,
"ATGC": 64,
"ATAC": 65,
"TCAC": 66,
"TCTG": 67,
"TTAA": 68,
"TGAA": 69,
"TGGC": 70,
"TTGC": 71,
"TATC": 72,
"TAAG": 73,
"TAAC": 74,
"AAAG": 75,
"GGG": 76,
"AAGC": 77,
"GGC": 78,
"TTAC": 79,
"ATAT": 80,
"TAGC": 81,
"TACC": 82,
"AACC": 83,
"AATG": 84,
"TAGG": 85,
"ATATT": 86,
"GCC": 87,
"AGTC": 88,
"TTTTC": 89,
"TGAC": 90,
"AAAAC": 91,
"AATC": 92,
"TTTAA": 93,
"AAAAG": 94,
"ATAG": 95,
"TGTC": 96,
"TTATT": 97,
"TTTTG": 98,
"TGAG": 99,
"AATT": 100,
"AAATT": 101,
"ACAG": 102,
"TTTCC": 103,
"AATAA": 104,
"TCAG": 105,
"AGGCC": 106,
"AAATG": 107,
"TGGGC": 108,
"ACTC": 109,
"ACG": 110,
"ATTTC": 111,
"ACTG": 112,
"TTAG": 113,
"TGGCC": 114,
"ATATG": 115,
"ACAA": 116,
"ATCTC": 117,
"TATTC": 118,
"TGTAA": 119,
"ACTT": 120,
"ATTCC": 121,
"AAAT": 122,
"ATGCC": 123,
"AAAAAAAA": 124,
"TTCCC": 125,
"TTTGC": 126,
"TTTAG": 127,
"TCCCC": 128,
"TGGGG": 129,
"TTCTC": 130,
"TAAAA": 131,
"ACCCC": 132,
"AGAAG": 133,
"ACCTC": 134,
"AGGGC": 135,
"TTTTTTTT": 136,
"ACATT": 137,
"AGATG": 138,
"GGCC": 139,
"GTG": 140,
"AAGCC": 141,
"ATAAAA": 142,
"AGGAG": 143,
"ATGGC": 144,
"ATTAC": 145,
"TTTAC": 146,
"ATTGC": 147,
"TGGAG": 148,
"TCAA": 149,
"ACTGC": 150,
"TATTG": 151,
"AAGGC": 152,
"TTTGG": 153,
"TTGCC": 154,
"AAATC": 155,
"TTGGC": 156,
"GGGC": 157,
"ATATC": 158,
"AGAAC": 159,
"ACTCC": 160,
"ATCCC": 161,
"ATAAC": 162,
"AATTC": 163,
"AGGGG": 164,
"AAAGC": 165,
"AGAGC": 166,
"TCG": 167,
"ATTTT": 168,
"TGAGG": 169,
"ATGGG": 170,
"AAAGG": 171,
"TTGGG": 172,
"AGAAAA": 173,
"TCTCC": 174,
"ATAAG": 175,
"ATTTG": 176,
"TATCC": 177,
"ACCAC": 178,
"TCTTC": 179,
"ATCAC": 180,
"AGAGG": 181,
"TTTATT": 182,
"TGTAG": 183,
"AGTAA": 184,
"ATAGC": 185,
"AGCCC": 186,
"AGATT": 187,
"AGTT": 188,
"GGGG": 189,
"TGTGC": 190,
"TGTGG": 191,
"TATGC": 192,
"TGCCC": 193,
"TGAAG": 194,
"TGTT": 195,
"TCTT": 196,
"AGACC": 197,
"TCATT": 198,
"TCTGC": 199,
"AAGGG": 200,
"AGTTC": 201,
"AAACC": 202,
"ACATG": 203,
"ACTTC": 204,
"ACAGC": 205,
"AATAG": 206,
"AATAC": 207,
"TGATG": 208,
"AATGC": 209,
"AATGG": 210,
"TGATT": 211,
"TCAGC": 212,
"TGTTC": 213,
"TGAGC": 214,
"TCTTG": 215,
"TCAGG": 216,
"TTTTCC": 217,
"TTAAAA": 218,
"TATGG": 219,
"ATACC": 220,
"AGTGG": 221,
"TCCCAGC": 222,
"AGATC": 223,
"AACCC": 224,
"TACCC": 225,
"TGAAC": 226,
"AATCC": 227,
"TGTTG": 228,
"TTAAG": 229,
"TAGCC": 230,
"TTAAC": 231,
"TTATG": 232,
"ATAGG": 233,
"AATTG": 234,
"TTTAT": 235,
"AGTGC": 236,
"TGTCC": 237,
"TAGGC": 238,
"TCTAA": 239,
"AĊ": 240,
"TCATC": 241,
"TCAAG": 242,
"TCACC": 243,
"AATTTT": 244,
"TCTTTT": 245,
"TTATC": 246,
"AGAAGC": 247,
"AGTGGC": 248,
"ATTTTC": 249,
"CCCC": 250,
"AAGAC": 251,
"AGTAG": 252,
"ATTGG": 253,
"TCATG": 254,
"AAATAA": 255
},
"merges": [
[
"T",
"T"
],
[
"A",
"A"
],
[
"T",
"G"
],
[
"A",
"G"
],
[
"C",
"C"
],
[
"T",
"C"
],
[
"A",
"C"
],
[
"G",
"G"
],
[
"A",
"TT"
],
[
"A",
"T"
],
[
"A",
"TG"
],
[
"G",
"C"
],
[
"T",
"AA"
],
[
"T",
"CC"
],
[
"A",
"CC"
],
[
"AA",
"AA"
],
[
"AG",
"G"
],
[
"AG",
"C"
],
[
"A",
"TC"
],
[
"TT",
"C"
],
[
"AA",
"G"
],
[
"TT",
"TT"
],
[
"TG",
"C"
],
[
"TG",
"G"
],
[
"AA",
"C"
],
[
"TT",
"G"
],
[
"T",
"AG"
],
[
"T",
"AC"
],
[
"CC",
"C"
],
[
"T",
"ATT"
],
[
"TG",
"GG"
],
[
"AG",
"AA"
],
[
"T",
"AT"
],
[
"AG",
"GG"
],
[
"TT",
"TC"
],
[
"AG",
"GC"
],
[
"AG",
"CC"
],
[
"TG",
"TG"
],
[
"AT",
"AA"
],
[
"ATT",
"C"
],
[
"TT",
"GG"
],
[
"AC",
"AC"
],
[
"AA",
"GG"
],
[
"TC",
"TC"
],
[
"TCC",
"C"
],
[
"T",
"ATG"
],
[
"TT",
"TG"
],
[
"TT",
"CC"
],
[
"AG",
"AG"
],
[
"AA",
"AC"
],
[
"ATG",
"G"
],
[
"AG",
"TG"
],
[
"ACC",
"C"
],
[
"AG",
"AC"
],
[
"TG",
"CC"
],
[
"ATT",
"G"
],
[
"AT",
"CC"
],
[
"ATG",
"C"
],
[
"AT",
"AC"
],
[
"TC",
"AC"
],
[
"TC",
"TG"
],
[
"TT",
"AA"
],
[
"TG",
"AA"
],
[
"TG",
"GC"
],
[
"TT",
"GC"
],
[
"T",
"ATC"
],
[
"TAA",
"G"
],
[
"TAA",
"C"
],
[
"AA",
"AG"
],
[
"GG",
"G"
],
[
"AA",
"GC"
],
[
"GG",
"C"
],
[
"TT",
"AC"
],
[
"AT",
"AT"
],
[
"T",
"AGC"
],
[
"T",
"ACC"
],
[
"AA",
"CC"
],
[
"AA",
"TG"
],
[
"T",
"AGG"
],
[
"AT",
"ATT"
],
[
"G",
"CC"
],
[
"AG",
"TC"
],
[
"TT",
"TTC"
],
[
"TG",
"AC"
],
[
"AAAA",
"C"
],
[
"AA",
"TC"
],
[
"TT",
"TAA"
],
[
"AAAA",
"G"
],
[
"AT",
"AG"
],
[
"TG",
"TC"
],
[
"TT",
"ATT"
],
[
"TTTT",
"G"
],
[
"TG",
"AG"
],
[
"AA",
"TT"
],
[
"AA",
"ATT"
],
[
"AC",
"AG"
],
[
"TT",
"TCC"
],
[
"AA",
"TAA"
],
[
"TC",
"AG"
],
[
"AGG",
"CC"
],
[
"AA",
"ATG"
],
[
"TGGG",
"C"
],
[
"AC",
"TC"
],
[
"AC",
"G"
],
[
"ATT",
"TC"
],
[
"AC",
"TG"
],
[
"TT",
"AG"
],
[
"TGG",
"CC"
],
[
"AT",
"ATG"
],
[
"AC",
"AA"
],
[
"ATC",
"TC"
],
[
"TATT",
"C"
],
[
"TG",
"TAA"
],
[
"AC",
"TT"
],
[
"ATT",
"CC"
],
[
"AA",
"AT"
],
[
"ATG",
"CC"
],
[
"AAAA",
"AAAA"
],
[
"TT",
"CCC"
],
[
"TT",
"TGC"
],
[
"TT",
"TAG"
],
[
"TCC",
"CC"
],
[
"TGGG",
"G"
],
[
"TTC",
"TC"
],
[
"TAA",
"AA"
],
[
"ACC",
"CC"
],
[
"AG",
"AAG"
],
[
"ACC",
"TC"
],
[
"AGGG",
"C"
],
[
"TTTT",
"TTTT"
],
[
"AC",
"ATT"
],
[
"AG",
"ATG"
],
[
"GG",
"CC"
],
[
"G",
"TG"
],
[
"AAG",
"CC"
],
[
"AT",
"AAAA"
],
[
"AGG",
"AG"
],
[
"ATG",
"GC"
],
[
"ATT",
"AC"
],
[
"TT",
"TAC"
],
[
"ATT",
"GC"
],
[
"TGG",
"AG"
],
[
"TC",
"AA"
],
[
"AC",
"TGC"
],
[
"TATT",
"G"
],
[
"AAGG",
"C"
],
[
"TT",
"TGG"
],
[
"TTG",
"CC"
],
[
"AA",
"ATC"
],
[
"TTGG",
"C"
],
[
"GG",
"GC"
],
[
"AT",
"ATC"
],
[
"AG",
"AAC"
],
[
"AC",
"TCC"
],
[
"AT",
"CCC"
],
[
"AT",
"AAC"
],
[
"AA",
"TTC"
],
[
"AGGG",
"G"
],
[
"AA",
"AGC"
],
[
"AG",
"AGC"
],
[
"TC",
"G"
],
[
"ATT",
"TT"
],
[
"TG",
"AGG"
],
[
"ATG",
"GG"
],
[
"AA",
"AGG"
],
[
"TTGG",
"G"
],
[
"AG",
"AAAA"
],
[
"TC",
"TCC"
],
[
"AT",
"AAG"
],
[
"ATT",
"TG"
],
[
"TAT",
"CC"
],
[
"ACC",
"AC"
],
[
"TC",
"TTC"
],
[
"ATC",
"AC"
],
[
"AG",
"AGG"
],
[
"TT",
"TATT"
],
[
"TG",
"TAG"
],
[
"AG",
"TAA"
],
[
"AT",
"AGC"
],
[
"AG",
"CCC"
],
[
"AG",
"ATT"
],
[
"AG",
"TT"
],
[
"GG",
"GG"
],
[
"TG",
"TGC"
],
[
"TG",
"TGG"
],
[
"TATG",
"C"
],
[
"TG",
"CCC"
],
[
"TG",
"AAG"
],
[
"TG",
"TT"
],
[
"TC",
"TT"
],
[
"AG",
"ACC"
],
[
"TC",
"ATT"
],
[
"TC",
"TGC"
],
[
"AAGG",
"G"
],
[
"AG",
"TTC"
],
[
"AA",
"ACC"
],
[
"AC",
"ATG"
],
[
"AC",
"TTC"
],
[
"AC",
"AGC"
],
[
"AA",
"TAG"
],
[
"AA",
"TAC"
],
[
"TG",
"ATG"
],
[
"AA",
"TGC"
],
[
"AA",
"TGG"
],
[
"TG",
"ATT"
],
[
"TC",
"AGC"
],
[
"TG",
"TTC"
],
[
"TG",
"AGC"
],
[
"TC",
"TTG"
],
[
"TC",
"AGG"
],
[
"TTTT",
"CC"
],
[
"TT",
"AAAA"
],
[
"TATG",
"G"
],
[
"AT",
"ACC"
],
[
"AG",
"TGG"
],
[
"TCCC",
"AGC"
],
[
"AG",
"ATC"
],
[
"AA",
"CCC"
],
[
"T",
"ACCC"
],
[
"TG",
"AAC"
],
[
"AA",
"TCC"
],
[
"TG",
"TTG"
],
[
"TT",
"AAG"
],
[
"TAG",
"CC"
],
[
"TT",
"AAC"
],
[
"TT",
"ATG"
],
[
"AT",
"AGG"
],
[
"AA",
"TTG"
],
[
"TT",
"TAT"
],
[
"AG",
"TGC"
],
[
"TG",
"TCC"
],
[
"TAG",
"GC"
],
[
"TC",
"TAA"
],
[
"A",
"Ċ"
],
[
"TC",
"ATC"
],
[
"TC",
"AAG"
],
[
"TC",
"ACC"
],
[
"AA",
"TTTT"
],
[
"TC",
"TTTT"
],
[
"TT",
"ATC"
],
[
"AGAA",
"GC"
],
[
"AGTG",
"GC"
],
[
"ATT",
"TTC"
],
[
"CC",
"CC"
],
[
"AAG",
"AC"
],
[
"AG",
"TAG"
],
[
"ATT",
"GG"
],
[
"TC",
"ATG"
],
[
"AA",
"ATAA"
]
]
}
}