Upload mgpt2 tokenizer
- added_tokens.json +1 -1
- evaluation.json +25 -25
- tokenizer.model +2 -2
- tokenizer.vocab +0 -0
- tokenizer_config.json +1 -1
added_tokens.json
CHANGED
@@ -1,3 +1,3 @@
 {
-  "<|endoftext|>":
+  "<|endoftext|>": 50256
 }
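The new mapping pins <|endoftext|> to ID 50256, which matches the end-of-text ID in OpenAI's GPT-2 vocabulary (the last entry of its 50257-token vocabulary); presumably the mgpt2 tokenizer follows that convention.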
evaluation.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "text": "tokenizer/artifacts/
+  "text": "tokenizer/artifacts/heldout_eval.txt",
   "limit": 10000,
   "overall": [
     {
@@ -32,14 +32,14 @@
       "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)",
       "total_chars": 43290048,
       "total_bytes": 43442607,
-      "total_tokens":
-      "tokens_per_1k_chars":
-      "tokens_per_1k_bytes": 206.
-      "bytes_per_token": 4.
-      "chars_per_token": 4.
-      "p50_tokens_per_line":
-      "p95_tokens_per_line":
-      "p95_tokens_per_1k_bytes_per_line":
+      "total_tokens": 8960195,
+      "tokens_per_1k_chars": 206.98048198052356,
+      "tokens_per_1k_bytes": 206.25362101312197,
+      "bytes_per_token": 4.848399727907707,
+      "chars_per_token": 4.831373424350698,
+      "p50_tokens_per_line": 509,
+      "p95_tokens_per_line": 2677,
+      "p95_tokens_per_1k_bytes_per_line": 260.9841827768014
     }
   ],
   "by_bucket": {
@@ -74,14 +74,14 @@
       "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)",
       "total_chars": 42393232,
       "total_bytes": 42542626,
-      "total_tokens":
-      "tokens_per_1k_chars": 207.
-      "tokens_per_1k_bytes": 206.
-      "bytes_per_token": 4.
-      "chars_per_token": 4.
-      "p50_tokens_per_line":
-      "p95_tokens_per_line":
-      "p95_tokens_per_1k_bytes_per_line": 261.
+      "total_tokens": 8786409,
+      "tokens_per_1k_chars": 207.25971070099115,
+      "tokens_per_1k_bytes": 206.5318910967085,
+      "bytes_per_token": 4.841867252025258,
+      "chars_per_token": 4.82486440137262,
+      "p50_tokens_per_line": 507,
+      "p95_tokens_per_line": 2644,
+      "p95_tokens_per_1k_bytes_per_line": 261.04640414124606
     }
   ],
   "mixed": [
@@ -115,14 +115,14 @@
       "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)",
       "total_chars": 896816,
       "total_bytes": 899981,
-      "total_tokens":
-      "tokens_per_1k_chars":
-      "tokens_per_1k_bytes":
-      "bytes_per_token": 5.
-      "chars_per_token": 5.
-      "p50_tokens_per_line":
-      "p95_tokens_per_line":
-      "p95_tokens_per_1k_bytes_per_line":
+      "total_tokens": 173786,
+      "tokens_per_1k_chars": 193.78111006048064,
+      "tokens_per_1k_bytes": 193.0996321033444,
+      "bytes_per_token": 5.178673771189854,
+      "chars_per_token": 5.160461717284476,
+      "p50_tokens_per_line": 924,
+      "p95_tokens_per_line": 6938,
+      "p95_tokens_per_1k_bytes_per_line": 248.08184143222508
     }
   ]
 }
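Each derived ratio in evaluation.json can be reproduced from the raw totals in the same block. A minimal check against the new overall numbers, assuming the obvious definitions (the formulas below are inferred from the key names, not taken from the evaluation script itself):

# Reproduce the derived ratios from the raw totals reported in the "overall" block.
total_chars = 43290048
total_bytes = 43442607
total_tokens = 8960195

tokens_per_1k_chars = total_tokens / total_chars * 1000  # ~206.98, matches the reported value
tokens_per_1k_bytes = total_tokens / total_bytes * 1000  # ~206.25
bytes_per_token = total_bytes / total_tokens             # ~4.848
chars_per_token = total_chars / total_tokens             # ~4.831
print(tokens_per_1k_chars, tokens_per_1k_bytes, bytes_per_token, chars_per_token)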
tokenizer.model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b33c900b86a6fd548544ce693be7800b55cfd81576f0b2c3a9b1a91836c069ec
+size 459544
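tokenizer.model is tracked with Git LFS, so the repository itself stores only this pointer (SHA-256 object ID plus size); the updated 459,544-byte binary lives in LFS storage and is fetched by Git LFS when the repo is cloned or pulled with LFS enabled.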
tokenizer.vocab
CHANGED
The diff for this file is too large to render.
tokenizer_config.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "added_tokens_decoder": {
-    "
+    "50256": {
       "content": "<|endoftext|>",
       "lstrip": false,
       "normalized": false,
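After this commit the special-token wiring agrees across the uploaded files: added_tokens.json maps <|endoftext|> to 50256, and tokenizer_config.json registers the same ID in added_tokens_decoder. A minimal cross-check sketch, assuming a local clone with both JSON files at the repository root:

import json

# Load both uploaded JSON files and confirm they agree on the special-token ID.
with open("added_tokens.json") as f:
    added_tokens = json.load(f)
with open("tokenizer_config.json") as f:
    config = json.load(f)

eos_id = added_tokens["<|endoftext|>"]               # 50256 after this commit
entry = config["added_tokens_decoder"][str(eos_id)]
assert entry["content"] == "<|endoftext|>"
assert entry["lstrip"] is False and entry["normalized"] is False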