XuJP264 committed on
Commit
47922fc
·
1 Parent(s): 6ab833d

Update tokenizer source and config vocab_size to 4260

Browse files
Files changed (2) hide show
  1. config.json +1 -1
  2. tokenizer.py +132 -0
config.json CHANGED
@@ -24,5 +24,5 @@
24
  "torch_dtype": "float32",
25
  "transformers_version": "4.44.0",
26
  "use_cache": true,
27
- "vocab_size": 4128
28
  }
 
24
  "torch_dtype": "float32",
25
  "transformers_version": "4.44.0",
26
  "use_cache": true,
27
+ "vocab_size": 4260
28
  }
tokenizer.py CHANGED
@@ -41,6 +41,138 @@ class DNAKmerTokenizer(PreTrainedTokenizer):
41
  "<sp0>",
42
  "<sp1>",
43
  "<sp2>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  ]
45
  self.kmers = [
46
  "".join(kmer) for kmer in itertools.product("ATCG", repeat=self.k)
 
41
  "<sp0>",
42
  "<sp1>",
43
  "<sp2>",
44
+ "<crispr_spacer>",
45
+ "<crispr_repeat>",
46
+ "<cas1>",
47
+ "<cas2>",
48
+ "<tracrrna>",
49
+ "<cas5>",
50
+ "<cas3>",
51
+ "<cas4>",
52
+ "<cas9>",
53
+ "<cas7>",
54
+ "<cas8c>",
55
+ "<cas6>",
56
+ "<csm3gr7>",
57
+ "<csn2>",
58
+ "<cas10>",
59
+ "<cas7b>",
60
+ "<cas6e>",
61
+ "<cas8e>",
62
+ "<cas12f>",
63
+ "<cse2gr11>",
64
+ "<csx1>",
65
+ "<csm2gr11>",
66
+ "<csm4>",
67
+ "<wyl>",
68
+ "<cas12a>",
69
+ "<csm6>",
70
+ "<deddh>",
71
+ "<csm5>",
72
+ "<casr>",
73
+ "<cas8b1>",
74
+ "<csx19>",
75
+ "<csx20>",
76
+ "<csm5gr7>",
77
+ "<cas6f>",
78
+ "<cas8b2>",
79
+ "<cas5f>",
80
+ "<rt>",
81
+ "<cas7f>",
82
+ "<cas3-cas2>",
83
+ "<primpol>",
84
+ "<cas8f>",
85
+ "<cysh>",
86
+ "<cas3hd>",
87
+ "<tnib>",
88
+ "<csx10gr5>",
89
+ "<cas8a1>",
90
+ "<csa3>",
91
+ "<recd>",
92
+ "<cmr1gr7>",
93
+ "<cmr4>",
94
+ "<cmr6gr7>",
95
+ "<cmr3gr5>",
96
+ "<cmr5gr11>",
97
+ "<cas8b6>",
98
+ "<csb2>",
99
+ "<cora>",
100
+ "<csm4gr5>",
101
+ "<abieii>",
102
+ "<can2>",
103
+ "<cas13d>",
104
+ "<csb1gr7>",
105
+ "<iscb-hnh>",
106
+ "<pd>",
107
+ "<tnpa>",
108
+ "<cse2>",
109
+ "<csb3>",
110
+ "<csm3>",
111
+ "<cas13b>",
112
+ "<unk>",
113
+ "<csx16>",
114
+ "<tpr>",
115
+ "<dhh>",
116
+ "<2og>",
117
+ "<cas12m>",
118
+ "<mem>",
119
+ "<csf4>",
120
+ "<hearo>",
121
+ "<tn7>",
122
+ "<tniq>",
123
+ "<csf2>",
124
+ "<csf3>",
125
+ "<csf1>",
126
+ "<cas8b4>",
127
+ "<tnsd>",
128
+ "<heat>",
129
+ "<csx17>",
130
+ "<cas8u1>",
131
+ "<csx3>",
132
+ "<htpx>",
133
+ "<cas12b>",
134
+ "<csm2>",
135
+ "<cas10d>",
136
+ "<csc2>",
137
+ "<cmr3>",
138
+ "<cmr5>",
139
+ "<csc1gr5>",
140
+ "<gramp>",
141
+ "<cmr6>",
142
+ "<cas8b12>",
143
+ "<cas11b>",
144
+ "<cas12c>",
145
+ "<cas8a4>",
146
+ "<tnsb>",
147
+ "<nyn>",
148
+ "<iscb-nterm>",
149
+ "<cas8b3>",
150
+ "<cas8a2>",
151
+ "<cas5u>",
152
+ "<csx27>",
153
+ "<csx21>",
154
+ "<csx23>",
155
+ "<tm>",
156
+ "<cas3d>",
157
+ "<cas12lambda>",
158
+ "<tnsc>",
159
+ "<cas8b5>",
160
+ "<stand>",
161
+ "<st>",
162
+ "<iscb-ruvciii-cterm>",
163
+ "<cas11>",
164
+ "<cas11d2>",
165
+ "<cas12j>",
166
+ "<cas12d>",
167
+ "<cas8b8>",
168
+ "<cmr1>",
169
+ "<cas12k>",
170
+ "<cas12g>",
171
+ "<cas13f>",
172
+ "<cas8b10>",
173
+ "<cas13i>",
174
+ "<toprim>",
175
+ "<cas12e>",
176
  ]
177
  self.kmers = [
178
  "".join(kmer) for kmer in itertools.product("ATCG", repeat=self.k)