Commit
·
892de74
1
Parent(s):
d775864
add tokenizer
Browse files- tokenizer.json +13 -10
- vocab.txt +6 -3
tokenizer.json
CHANGED
|
@@ -84,10 +84,10 @@
|
|
| 84 |
"C": 6,
|
| 85 |
"G": 7,
|
| 86 |
"T": 8,
|
| 87 |
-
"##
|
| 88 |
-
"##
|
| 89 |
"##T": 11,
|
| 90 |
-
"##
|
| 91 |
"AA": 13,
|
| 92 |
"GA": 14,
|
| 93 |
"TT": 15,
|
|
@@ -118,12 +118,12 @@
|
|
| 118 |
"TG": 40,
|
| 119 |
"GTT": 41,
|
| 120 |
"AGA": 42,
|
| 121 |
-
"
|
| 122 |
-
"
|
| 123 |
-
"
|
| 124 |
-
"
|
| 125 |
-
"
|
| 126 |
-
"
|
| 127 |
"TAT": 49,
|
| 128 |
"TTC": 50,
|
| 129 |
"ATA": 51,
|
|
@@ -164,7 +164,10 @@
|
|
| 164 |
"TGC": 86,
|
| 165 |
"CGA": 87,
|
| 166 |
"CGC": 88,
|
| 167 |
-
"CGG": 89
|
|
|
|
|
|
|
|
|
|
| 168 |
}
|
| 169 |
}
|
| 170 |
}
|
|
|
|
| 84 |
"C": 6,
|
| 85 |
"G": 7,
|
| 86 |
"T": 8,
|
| 87 |
+
"##C": 9,
|
| 88 |
+
"##G": 10,
|
| 89 |
"##T": 11,
|
| 90 |
+
"##A": 12,
|
| 91 |
"AA": 13,
|
| 92 |
"GA": 14,
|
| 93 |
"TT": 15,
|
|
|
|
| 118 |
"TG": 40,
|
| 119 |
"GTT": 41,
|
| 120 |
"AGA": 42,
|
| 121 |
+
"ATG": 43,
|
| 122 |
+
"GCT": 44,
|
| 123 |
+
"GAC": 45,
|
| 124 |
+
"ACT": 46,
|
| 125 |
+
"GAG": 47,
|
| 126 |
+
"TCA": 48,
|
| 127 |
"TAT": 49,
|
| 128 |
"TTC": 50,
|
| 129 |
"ATA": 51,
|
|
|
|
| 164 |
"TGC": 86,
|
| 165 |
"CGA": 87,
|
| 166 |
"CGC": 88,
|
| 167 |
+
"CGG": 89,
|
| 168 |
+
"TAA": 90,
|
| 169 |
+
"TGA": 91,
|
| 170 |
+
"TAG": 92
|
| 171 |
}
|
| 172 |
}
|
| 173 |
}
|
vocab.txt
CHANGED
|
@@ -7,10 +7,10 @@ A
|
|
| 7 |
C
|
| 8 |
G
|
| 9 |
T
|
| 10 |
-
##A
|
| 11 |
##C
|
| 12 |
-
##T
|
| 13 |
##G
|
|
|
|
|
|
|
| 14 |
AA
|
| 15 |
GA
|
| 16 |
TT
|
|
@@ -41,12 +41,12 @@ GGT
|
|
| 41 |
TG
|
| 42 |
GTT
|
| 43 |
AGA
|
|
|
|
| 44 |
GCT
|
| 45 |
GAC
|
| 46 |
ACT
|
| 47 |
GAG
|
| 48 |
TCA
|
| 49 |
-
ATG
|
| 50 |
TAT
|
| 51 |
TTC
|
| 52 |
ATA
|
|
@@ -88,3 +88,6 @@ TGC
|
|
| 88 |
CGA
|
| 89 |
CGC
|
| 90 |
CGG
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
C
|
| 8 |
G
|
| 9 |
T
|
|
|
|
| 10 |
##C
|
|
|
|
| 11 |
##G
|
| 12 |
+
##T
|
| 13 |
+
##A
|
| 14 |
AA
|
| 15 |
GA
|
| 16 |
TT
|
|
|
|
| 41 |
TG
|
| 42 |
GTT
|
| 43 |
AGA
|
| 44 |
+
ATG
|
| 45 |
GCT
|
| 46 |
GAC
|
| 47 |
ACT
|
| 48 |
GAG
|
| 49 |
TCA
|
|
|
|
| 50 |
TAT
|
| 51 |
TTC
|
| 52 |
ATA
|
|
|
|
| 88 |
CGA
|
| 89 |
CGC
|
| 90 |
CGG
|
| 91 |
+
TAA
|
| 92 |
+
TGA
|
| 93 |
+
TAG
|