TRAFICA-4_mer / tokenizer.json
Allanxu's picture
Upload folder using huggingface_hub
3d02714 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": false
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"AAAA": 5,
"AAAT": 6,
"AAAC": 7,
"AAAG": 8,
"AATA": 9,
"AATT": 10,
"AATC": 11,
"AATG": 12,
"AACA": 13,
"AACT": 14,
"AACC": 15,
"AACG": 16,
"AAGA": 17,
"AAGT": 18,
"AAGC": 19,
"AAGG": 20,
"ATAA": 21,
"ATAT": 22,
"ATAC": 23,
"ATAG": 24,
"ATTA": 25,
"ATTT": 26,
"ATTC": 27,
"ATTG": 28,
"ATCA": 29,
"ATCT": 30,
"ATCC": 31,
"ATCG": 32,
"ATGA": 33,
"ATGT": 34,
"ATGC": 35,
"ATGG": 36,
"ACAA": 37,
"ACAT": 38,
"ACAC": 39,
"ACAG": 40,
"ACTA": 41,
"ACTT": 42,
"ACTC": 43,
"ACTG": 44,
"ACCA": 45,
"ACCT": 46,
"ACCC": 47,
"ACCG": 48,
"ACGA": 49,
"ACGT": 50,
"ACGC": 51,
"ACGG": 52,
"AGAA": 53,
"AGAT": 54,
"AGAC": 55,
"AGAG": 56,
"AGTA": 57,
"AGTT": 58,
"AGTC": 59,
"AGTG": 60,
"AGCA": 61,
"AGCT": 62,
"AGCC": 63,
"AGCG": 64,
"AGGA": 65,
"AGGT": 66,
"AGGC": 67,
"AGGG": 68,
"TAAA": 69,
"TAAT": 70,
"TAAC": 71,
"TAAG": 72,
"TATA": 73,
"TATT": 74,
"TATC": 75,
"TATG": 76,
"TACA": 77,
"TACT": 78,
"TACC": 79,
"TACG": 80,
"TAGA": 81,
"TAGT": 82,
"TAGC": 83,
"TAGG": 84,
"TTAA": 85,
"TTAT": 86,
"TTAC": 87,
"TTAG": 88,
"TTTA": 89,
"TTTT": 90,
"TTTC": 91,
"TTTG": 92,
"TTCA": 93,
"TTCT": 94,
"TTCC": 95,
"TTCG": 96,
"TTGA": 97,
"TTGT": 98,
"TTGC": 99,
"TTGG": 100,
"TCAA": 101,
"TCAT": 102,
"TCAC": 103,
"TCAG": 104,
"TCTA": 105,
"TCTT": 106,
"TCTC": 107,
"TCTG": 108,
"TCCA": 109,
"TCCT": 110,
"TCCC": 111,
"TCCG": 112,
"TCGA": 113,
"TCGT": 114,
"TCGC": 115,
"TCGG": 116,
"TGAA": 117,
"TGAT": 118,
"TGAC": 119,
"TGAG": 120,
"TGTA": 121,
"TGTT": 122,
"TGTC": 123,
"TGTG": 124,
"TGCA": 125,
"TGCT": 126,
"TGCC": 127,
"TGCG": 128,
"TGGA": 129,
"TGGT": 130,
"TGGC": 131,
"TGGG": 132,
"CAAA": 133,
"CAAT": 134,
"CAAC": 135,
"CAAG": 136,
"CATA": 137,
"CATT": 138,
"CATC": 139,
"CATG": 140,
"CACA": 141,
"CACT": 142,
"CACC": 143,
"CACG": 144,
"CAGA": 145,
"CAGT": 146,
"CAGC": 147,
"CAGG": 148,
"CTAA": 149,
"CTAT": 150,
"CTAC": 151,
"CTAG": 152,
"CTTA": 153,
"CTTT": 154,
"CTTC": 155,
"CTTG": 156,
"CTCA": 157,
"CTCT": 158,
"CTCC": 159,
"CTCG": 160,
"CTGA": 161,
"CTGT": 162,
"CTGC": 163,
"CTGG": 164,
"CCAA": 165,
"CCAT": 166,
"CCAC": 167,
"CCAG": 168,
"CCTA": 169,
"CCTT": 170,
"CCTC": 171,
"CCTG": 172,
"CCCA": 173,
"CCCT": 174,
"CCCC": 175,
"CCCG": 176,
"CCGA": 177,
"CCGT": 178,
"CCGC": 179,
"CCGG": 180,
"CGAA": 181,
"CGAT": 182,
"CGAC": 183,
"CGAG": 184,
"CGTA": 185,
"CGTT": 186,
"CGTC": 187,
"CGTG": 188,
"CGCA": 189,
"CGCT": 190,
"CGCC": 191,
"CGCG": 192,
"CGGA": 193,
"CGGT": 194,
"CGGC": 195,
"CGGG": 196,
"GAAA": 197,
"GAAT": 198,
"GAAC": 199,
"GAAG": 200,
"GATA": 201,
"GATT": 202,
"GATC": 203,
"GATG": 204,
"GACA": 205,
"GACT": 206,
"GACC": 207,
"GACG": 208,
"GAGA": 209,
"GAGT": 210,
"GAGC": 211,
"GAGG": 212,
"GTAA": 213,
"GTAT": 214,
"GTAC": 215,
"GTAG": 216,
"GTTA": 217,
"GTTT": 218,
"GTTC": 219,
"GTTG": 220,
"GTCA": 221,
"GTCT": 222,
"GTCC": 223,
"GTCG": 224,
"GTGA": 225,
"GTGT": 226,
"GTGC": 227,
"GTGG": 228,
"GCAA": 229,
"GCAT": 230,
"GCAC": 231,
"GCAG": 232,
"GCTA": 233,
"GCTT": 234,
"GCTC": 235,
"GCTG": 236,
"GCCA": 237,
"GCCT": 238,
"GCCC": 239,
"GCCG": 240,
"GCGA": 241,
"GCGT": 242,
"GCGC": 243,
"GCGG": 244,
"GGAA": 245,
"GGAT": 246,
"GGAC": 247,
"GGAG": 248,
"GGTA": 249,
"GGTT": 250,
"GGTC": 251,
"GGTG": 252,
"GGCA": 253,
"GGCT": 254,
"GGCC": 255,
"GGCG": 256,
"GGGA": 257,
"GGGT": 258,
"GGGC": 259,
"GGGG": 260
}
}
}