zluvolyote committed on
Commit
d775864
·
1 Parent(s): 1bbd033

add tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +24 -4
  2. vocab.txt +21 -1
tokenizer.json CHANGED
@@ -85,9 +85,9 @@
85
  "G": 7,
86
  "T": 8,
87
  "##A": 9,
88
- "##T": 10,
89
- "##G": 11,
90
- "##C": 12,
91
  "AA": 13,
92
  "GA": 14,
93
  "TT": 15,
@@ -144,7 +144,27 @@
144
  "CTT": 66,
145
  "GTA": 67,
146
  "GTC": 68,
147
- "GGA": 69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  }
149
  }
150
  }
 
85
  "G": 7,
86
  "T": 8,
87
  "##A": 9,
88
+ "##C": 10,
89
+ "##T": 11,
90
+ "##G": 12,
91
  "AA": 13,
92
  "GA": 14,
93
  "TT": 15,
 
144
  "CTT": 66,
145
  "GTA": 67,
146
  "GTC": 68,
147
+ "GGA": 69,
148
+ "GTG": 70,
149
+ "CTG": 71,
150
+ "TGG": 72,
151
+ "AGC": 73,
152
+ "GGC": 74,
153
+ "AGG": 75,
154
+ "TCG": 76,
155
+ "ACG": 77,
156
+ "TGT": 78,
157
+ "CAC": 79,
158
+ "CCC": 80,
159
+ "CGT": 81,
160
+ "GCG": 82,
161
+ "GGG": 83,
162
+ "CCG": 84,
163
+ "CTC": 85,
164
+ "TGC": 86,
165
+ "CGA": 87,
166
+ "CGC": 88,
167
+ "CGG": 89
168
  }
169
  }
170
  }
vocab.txt CHANGED
@@ -8,9 +8,9 @@ C
8
  G
9
  T
10
  ##A
 
11
  ##T
12
  ##G
13
- ##C
14
  AA
15
  GA
16
  TT
@@ -68,3 +68,23 @@ CTT
68
  GTA
69
  GTC
70
  GGA
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  G
9
  T
10
  ##A
11
+ ##C
12
  ##T
13
  ##G
 
14
  AA
15
  GA
16
  TT
 
68
  GTA
69
  GTC
70
  GGA
71
+ GTG
72
+ CTG
73
+ TGG
74
+ AGC
75
+ GGC
76
+ AGG
77
+ TCG
78
+ ACG
79
+ TGT
80
+ CAC
81
+ CCC
82
+ CGT
83
+ GCG
84
+ GGG
85
+ CCG
86
+ CTC
87
+ TGC
88
+ CGA
89
+ CGC
90
+ CGG