Zeb
committed on
Commit
·
64b3309
1
Parent(s):
38928ba
Remove normalizers
Browse files
- .DS_Store +0 -0
- frequencymulti_128000/tokenizer.json +1 -8
- frequencymulti_16000/tokenizer.json +1 -8
- frequencymulti_256000/tokenizer.json +2 -2
- frequencymulti_32000/tokenizer.json +1 -8
- frequencymulti_64000/tokenizer.json +1 -8
- frequencymulti_8064/tokenizer.json +1 -8
- fw57Mmulti_Entropy_thresholdB_16000/tokenizer.json +1 -8
- fw57Mmulti_Entropy_thresholdB_32000/tokenizer.json +1 -8
- fw57Mmulti_Entropy_thresholdB_64000/tokenizer.json +1 -8
- fw57Mmulti_Entropy_thresholdM_16000/tokenizer.json +1 -8
- fw57Mmulti_Entropy_thresholdM_32000/tokenizer.json +1 -8
- fw57Mmulti_Entropy_thresholdM_64000/tokenizer.json +1 -8
- fw57Mmulti_Entropy_thresholdM_8064/tokenizer.json +1 -8
- fw57Mmulti_Surprisal_thresholdB_16000/tokenizer.json +1 -8
- fw57Mmulti_Surprisal_thresholdB_32000/tokenizer.json +1 -8
- fw57Mmulti_Surprisal_thresholdB_64000/tokenizer.json +1 -8
.DS_Store
DELETED
|
Binary file (12.3 kB)
|
|
|
frequencymulti_128000/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
frequencymulti_16000/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
frequencymulti_256000/tokenizer.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c1ac462fae29c81b13d1ee24e1eedfe812d9c0d374ff3c61ba636cebaf52fd7
|
| 3 |
+
size 20274158
|
frequencymulti_32000/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
frequencymulti_64000/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
frequencymulti_8064/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
fw57Mmulti_Entropy_thresholdB_16000/tokenizer.json
CHANGED
|
@@ -31,14 +31,7 @@
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
-
"normalizer": {
|
| 35 |
-
"type": "Sequence",
|
| 36 |
-
"normalizers": [
|
| 37 |
-
{
|
| 38 |
-
"type": "NFD"
|
| 39 |
-
}
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
"pre_tokenizer": {
|
| 43 |
"type": "ByteLevel",
|
| 44 |
"add_prefix_space": true,
|
|
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"pre_tokenizer": {
|
| 36 |
"type": "ByteLevel",
|
| 37 |
"add_prefix_space": true,
|
fw57Mmulti_Entropy_thresholdB_32000/tokenizer.json
CHANGED
|
@@ -31,14 +31,7 @@
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
-
"normalizer": {
|
| 35 |
-
"type": "Sequence",
|
| 36 |
-
"normalizers": [
|
| 37 |
-
{
|
| 38 |
-
"type": "NFD"
|
| 39 |
-
}
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
"pre_tokenizer": {
|
| 43 |
"type": "ByteLevel",
|
| 44 |
"add_prefix_space": true,
|
|
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"pre_tokenizer": {
|
| 36 |
"type": "ByteLevel",
|
| 37 |
"add_prefix_space": true,
|
fw57Mmulti_Entropy_thresholdB_64000/tokenizer.json
CHANGED
|
@@ -31,14 +31,7 @@
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
-
"normalizer": {
|
| 35 |
-
"type": "Sequence",
|
| 36 |
-
"normalizers": [
|
| 37 |
-
{
|
| 38 |
-
"type": "NFD"
|
| 39 |
-
}
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
"pre_tokenizer": {
|
| 43 |
"type": "ByteLevel",
|
| 44 |
"add_prefix_space": true,
|
|
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"pre_tokenizer": {
|
| 36 |
"type": "ByteLevel",
|
| 37 |
"add_prefix_space": true,
|
fw57Mmulti_Entropy_thresholdM_16000/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
fw57Mmulti_Entropy_thresholdM_32000/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
fw57Mmulti_Entropy_thresholdM_64000/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
fw57Mmulti_Entropy_thresholdM_8064/tokenizer.json
CHANGED
|
@@ -22,14 +22,7 @@
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"normalizer": {
|
| 26 |
-
"type": "Sequence",
|
| 27 |
-
"normalizers": [
|
| 28 |
-
{
|
| 29 |
-
"type": "NFD"
|
| 30 |
-
}
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
"pre_tokenizer": {
|
| 34 |
"type": "ByteLevel",
|
| 35 |
"add_prefix_space": true,
|
|
|
|
| 22 |
"special": true
|
| 23 |
}
|
| 24 |
],
|
| 25 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"pre_tokenizer": {
|
| 27 |
"type": "ByteLevel",
|
| 28 |
"add_prefix_space": true,
|
fw57Mmulti_Surprisal_thresholdB_16000/tokenizer.json
CHANGED
|
@@ -31,14 +31,7 @@
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
-
"normalizer": {
|
| 35 |
-
"type": "Sequence",
|
| 36 |
-
"normalizers": [
|
| 37 |
-
{
|
| 38 |
-
"type": "NFD"
|
| 39 |
-
}
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
"pre_tokenizer": {
|
| 43 |
"type": "ByteLevel",
|
| 44 |
"add_prefix_space": true,
|
|
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"pre_tokenizer": {
|
| 36 |
"type": "ByteLevel",
|
| 37 |
"add_prefix_space": true,
|
fw57Mmulti_Surprisal_thresholdB_32000/tokenizer.json
CHANGED
|
@@ -31,14 +31,7 @@
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
-
"normalizer": {
|
| 35 |
-
"type": "Sequence",
|
| 36 |
-
"normalizers": [
|
| 37 |
-
{
|
| 38 |
-
"type": "NFD"
|
| 39 |
-
}
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
"pre_tokenizer": {
|
| 43 |
"type": "ByteLevel",
|
| 44 |
"add_prefix_space": true,
|
|
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"pre_tokenizer": {
|
| 36 |
"type": "ByteLevel",
|
| 37 |
"add_prefix_space": true,
|
fw57Mmulti_Surprisal_thresholdB_64000/tokenizer.json
CHANGED
|
@@ -31,14 +31,7 @@
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
-
"normalizer": {
|
| 35 |
-
"type": "Sequence",
|
| 36 |
-
"normalizers": [
|
| 37 |
-
{
|
| 38 |
-
"type": "NFD"
|
| 39 |
-
}
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
"pre_tokenizer": {
|
| 43 |
"type": "ByteLevel",
|
| 44 |
"add_prefix_space": true,
|
|
|
|
| 31 |
"special": true
|
| 32 |
}
|
| 33 |
],
|
| 34 |
+
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"pre_tokenizer": {
|
| 36 |
"type": "ByteLevel",
|
| 37 |
"add_prefix_space": true,
|