Commit ·
fb57838
1
Parent(s): 3852685
add tokenizer
Browse files- special_tokens_map.json +7 -0
- tokenizer.json +104 -0
- tokenizer_config.json +8 -0
special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[bos]",
|
| 3 |
+
"mask_token": "[mask]",
|
| 4 |
+
"pad_token": "[nop]",
|
| 5 |
+
"sep_token": "[eos]",
|
| 6 |
+
"unk_token": "[unk]"
|
| 7 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1.0",
|
| 3 |
+
"truncation": null,
|
| 4 |
+
"padding": null,
|
| 5 |
+
"added_tokens": [],
|
| 6 |
+
"normalizer": null,
|
| 7 |
+
"pre_tokenizer": {
|
| 8 |
+
"type": "WhitespaceSplit"
|
| 9 |
+
},
|
| 10 |
+
"post_processor": null,
|
| 11 |
+
"decoder": null,
|
| 12 |
+
"model": {
|
| 13 |
+
"type": "WordPiece",
|
| 14 |
+
"unk_token": "[unk]",
|
| 15 |
+
"continuing_subword_prefix": "##",
|
| 16 |
+
"max_input_chars_per_word": 100,
|
| 17 |
+
"vocab": {
|
| 18 |
+
"[As+1]": 0,
|
| 19 |
+
"[=SH0]": 1,
|
| 20 |
+
"[=SH1]": 2,
|
| 21 |
+
"[=Ring2]": 3,
|
| 22 |
+
"[=Ring1]": 4,
|
| 23 |
+
"[CH1]": 5,
|
| 24 |
+
"[S]": 6,
|
| 25 |
+
"[NH2+1]": 7,
|
| 26 |
+
"[B]": 8,
|
| 27 |
+
"[C-1]": 9,
|
| 28 |
+
"[#C]": 10,
|
| 29 |
+
"[=P]": 11,
|
| 30 |
+
"[As]": 12,
|
| 31 |
+
"[B-1]": 13,
|
| 32 |
+
"[bos]": 14,
|
| 33 |
+
"[O]": 15,
|
| 34 |
+
"[OH0]": 16,
|
| 35 |
+
"[I]": 17,
|
| 36 |
+
"[nop]": 18,
|
| 37 |
+
"[Cl]": 19,
|
| 38 |
+
"[SiH2]": 20,
|
| 39 |
+
"[Ring1]": 21,
|
| 40 |
+
"[Fe-4]": 22,
|
| 41 |
+
"[CH0]": 23,
|
| 42 |
+
"[Fe]": 24,
|
| 43 |
+
"[Fe+2]": 25,
|
| 44 |
+
"[CH1-1]": 26,
|
| 45 |
+
"[=Branch3]": 27,
|
| 46 |
+
"[#Branch1]": 28,
|
| 47 |
+
"[=Branch2]": 29,
|
| 48 |
+
"[NH0]": 30,
|
| 49 |
+
"[N-1]": 31,
|
| 50 |
+
"[C]": 32,
|
| 51 |
+
"[=NH2+1]": 33,
|
| 52 |
+
"[NH1-1]": 34,
|
| 53 |
+
"[#N+1]": 35,
|
| 54 |
+
"[SeH1]": 36,
|
| 55 |
+
"[Branch3]": 37,
|
| 56 |
+
"[SH1]": 38,
|
| 57 |
+
"[CH2-1]": 39,
|
| 58 |
+
"[SH0]": 40,
|
| 59 |
+
"[=Se]": 41,
|
| 60 |
+
"[NH1+1]": 42,
|
| 61 |
+
"[K]": 43,
|
| 62 |
+
"[Ring2]": 44,
|
| 63 |
+
"[#N]": 45,
|
| 64 |
+
"[O-1]": 46,
|
| 65 |
+
"[OH1+1]": 47,
|
| 66 |
+
"[#Branch2]": 48,
|
| 67 |
+
"[=C]": 49,
|
| 68 |
+
"[I+1]": 50,
|
| 69 |
+
"[Si]": 51,
|
| 70 |
+
"[F]": 52,
|
| 71 |
+
"[=N+1]": 53,
|
| 72 |
+
"[=OH1+1]": 54,
|
| 73 |
+
"[Branch2]": 55,
|
| 74 |
+
"[=O+1]": 56,
|
| 75 |
+
"[#S]": 57,
|
| 76 |
+
"[Na]": 58,
|
| 77 |
+
"[C+1]": 59,
|
| 78 |
+
"[=B]": 60,
|
| 79 |
+
"[S+1]": 61,
|
| 80 |
+
"[unk]": 62,
|
| 81 |
+
"[=Fe]": 63,
|
| 82 |
+
"[P]": 64,
|
| 83 |
+
"[=N]": 65,
|
| 84 |
+
"[SiH1]": 66,
|
| 85 |
+
"[NH3+1]": 67,
|
| 86 |
+
"[Fe-3]": 68,
|
| 87 |
+
"[CH1+1]": 69,
|
| 88 |
+
"[Branch1]": 70,
|
| 89 |
+
"[Fe+1]": 71,
|
| 90 |
+
"[=Branch1]": 72,
|
| 91 |
+
"[=S]": 73,
|
| 92 |
+
"[Se]": 74,
|
| 93 |
+
"[N]": 75,
|
| 94 |
+
"[=As]": 76,
|
| 95 |
+
"[#Ring2]": 77,
|
| 96 |
+
"[Br]": 78,
|
| 97 |
+
"[=O]": 79,
|
| 98 |
+
"[P+1]": 80,
|
| 99 |
+
"[N+1]": 81,
|
| 100 |
+
"[eos]": 82,
|
| 101 |
+
"[Se+1]": 83
|
| 102 |
+
}
|
| 103 |
+
}
|
| 104 |
+
}
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[bos]",
|
| 3 |
+
"mask_token": "[mask]",
|
| 4 |
+
"pad_token": "[nop]",
|
| 5 |
+
"sep_token": "[eos]",
|
| 6 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 7 |
+
"unk_token": "[unk]"
|
| 8 |
+
}
|