XuJP264
committed on
Commit
·
47922fc
1
Parent(s):
6ab833d
Update tokenizer source and config vocab_size to 4260
Browse files
- config.json +1 -1
- tokenizer.py +132 -0
config.json
CHANGED
|
@@ -24,5 +24,5 @@
|
|
| 24 |
"torch_dtype": "float32",
|
| 25 |
"transformers_version": "4.44.0",
|
| 26 |
"use_cache": true,
|
| 27 |
-
"vocab_size":
|
| 28 |
}
|
|
|
|
| 24 |
"torch_dtype": "float32",
|
| 25 |
"transformers_version": "4.44.0",
|
| 26 |
"use_cache": true,
|
| 27 |
+
"vocab_size": 4260
|
| 28 |
}
|
tokenizer.py
CHANGED
|
@@ -41,6 +41,138 @@ class DNAKmerTokenizer(PreTrainedTokenizer):
|
|
| 41 |
"<sp0>",
|
| 42 |
"<sp1>",
|
| 43 |
"<sp2>",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
]
|
| 45 |
self.kmers = [
|
| 46 |
"".join(kmer) for kmer in itertools.product("ATCG", repeat=self.k)
|
|
|
|
| 41 |
"<sp0>",
|
| 42 |
"<sp1>",
|
| 43 |
"<sp2>",
|
| 44 |
+
"<crispr_spacer>",
|
| 45 |
+
"<crispr_repeat>",
|
| 46 |
+
"<cas1>",
|
| 47 |
+
"<cas2>",
|
| 48 |
+
"<tracrrna>",
|
| 49 |
+
"<cas5>",
|
| 50 |
+
"<cas3>",
|
| 51 |
+
"<cas4>",
|
| 52 |
+
"<cas9>",
|
| 53 |
+
"<cas7>",
|
| 54 |
+
"<cas8c>",
|
| 55 |
+
"<cas6>",
|
| 56 |
+
"<csm3gr7>",
|
| 57 |
+
"<csn2>",
|
| 58 |
+
"<cas10>",
|
| 59 |
+
"<cas7b>",
|
| 60 |
+
"<cas6e>",
|
| 61 |
+
"<cas8e>",
|
| 62 |
+
"<cas12f>",
|
| 63 |
+
"<cse2gr11>",
|
| 64 |
+
"<csx1>",
|
| 65 |
+
"<csm2gr11>",
|
| 66 |
+
"<csm4>",
|
| 67 |
+
"<wyl>",
|
| 68 |
+
"<cas12a>",
|
| 69 |
+
"<csm6>",
|
| 70 |
+
"<deddh>",
|
| 71 |
+
"<csm5>",
|
| 72 |
+
"<casr>",
|
| 73 |
+
"<cas8b1>",
|
| 74 |
+
"<csx19>",
|
| 75 |
+
"<csx20>",
|
| 76 |
+
"<csm5gr7>",
|
| 77 |
+
"<cas6f>",
|
| 78 |
+
"<cas8b2>",
|
| 79 |
+
"<cas5f>",
|
| 80 |
+
"<rt>",
|
| 81 |
+
"<cas7f>",
|
| 82 |
+
"<cas3-cas2>",
|
| 83 |
+
"<primpol>",
|
| 84 |
+
"<cas8f>",
|
| 85 |
+
"<cysh>",
|
| 86 |
+
"<cas3hd>",
|
| 87 |
+
"<tnib>",
|
| 88 |
+
"<csx10gr5>",
|
| 89 |
+
"<cas8a1>",
|
| 90 |
+
"<csa3>",
|
| 91 |
+
"<recd>",
|
| 92 |
+
"<cmr1gr7>",
|
| 93 |
+
"<cmr4>",
|
| 94 |
+
"<cmr6gr7>",
|
| 95 |
+
"<cmr3gr5>",
|
| 96 |
+
"<cmr5gr11>",
|
| 97 |
+
"<cas8b6>",
|
| 98 |
+
"<csb2>",
|
| 99 |
+
"<cora>",
|
| 100 |
+
"<csm4gr5>",
|
| 101 |
+
"<abieii>",
|
| 102 |
+
"<can2>",
|
| 103 |
+
"<cas13d>",
|
| 104 |
+
"<csb1gr7>",
|
| 105 |
+
"<iscb-hnh>",
|
| 106 |
+
"<pd>",
|
| 107 |
+
"<tnpa>",
|
| 108 |
+
"<cse2>",
|
| 109 |
+
"<csb3>",
|
| 110 |
+
"<csm3>",
|
| 111 |
+
"<cas13b>",
|
| 112 |
+
"<unk>",
|
| 113 |
+
"<csx16>",
|
| 114 |
+
"<tpr>",
|
| 115 |
+
"<dhh>",
|
| 116 |
+
"<2og>",
|
| 117 |
+
"<cas12m>",
|
| 118 |
+
"<mem>",
|
| 119 |
+
"<csf4>",
|
| 120 |
+
"<hearo>",
|
| 121 |
+
"<tn7>",
|
| 122 |
+
"<tniq>",
|
| 123 |
+
"<csf2>",
|
| 124 |
+
"<csf3>",
|
| 125 |
+
"<csf1>",
|
| 126 |
+
"<cas8b4>",
|
| 127 |
+
"<tnsd>",
|
| 128 |
+
"<heat>",
|
| 129 |
+
"<csx17>",
|
| 130 |
+
"<cas8u1>",
|
| 131 |
+
"<csx3>",
|
| 132 |
+
"<htpx>",
|
| 133 |
+
"<cas12b>",
|
| 134 |
+
"<csm2>",
|
| 135 |
+
"<cas10d>",
|
| 136 |
+
"<csc2>",
|
| 137 |
+
"<cmr3>",
|
| 138 |
+
"<cmr5>",
|
| 139 |
+
"<csc1gr5>",
|
| 140 |
+
"<gramp>",
|
| 141 |
+
"<cmr6>",
|
| 142 |
+
"<cas8b12>",
|
| 143 |
+
"<cas11b>",
|
| 144 |
+
"<cas12c>",
|
| 145 |
+
"<cas8a4>",
|
| 146 |
+
"<tnsb>",
|
| 147 |
+
"<nyn>",
|
| 148 |
+
"<iscb-nterm>",
|
| 149 |
+
"<cas8b3>",
|
| 150 |
+
"<cas8a2>",
|
| 151 |
+
"<cas5u>",
|
| 152 |
+
"<csx27>",
|
| 153 |
+
"<csx21>",
|
| 154 |
+
"<csx23>",
|
| 155 |
+
"<tm>",
|
| 156 |
+
"<cas3d>",
|
| 157 |
+
"<cas12lambda>",
|
| 158 |
+
"<tnsc>",
|
| 159 |
+
"<cas8b5>",
|
| 160 |
+
"<stand>",
|
| 161 |
+
"<st>",
|
| 162 |
+
"<iscb-ruvciii-cterm>",
|
| 163 |
+
"<cas11>",
|
| 164 |
+
"<cas11d2>",
|
| 165 |
+
"<cas12j>",
|
| 166 |
+
"<cas12d>",
|
| 167 |
+
"<cas8b8>",
|
| 168 |
+
"<cmr1>",
|
| 169 |
+
"<cas12k>",
|
| 170 |
+
"<cas12g>",
|
| 171 |
+
"<cas13f>",
|
| 172 |
+
"<cas8b10>",
|
| 173 |
+
"<cas13i>",
|
| 174 |
+
"<toprim>",
|
| 175 |
+
"<cas12e>",
|
| 176 |
]
|
| 177 |
self.kmers = [
|
| 178 |
"".join(kmer) for kmer in itertools.product("ATCG", repeat=self.k)
|