ace-1 committed on
Commit 2986bf6 · verified · 1 Parent(s): 1f6db67

Upload mgpt2 tokenizer

added_tokens.json CHANGED
@@ -1,3 +1,3 @@
 {
-  "<|endoftext|>": 50252
+  "<|endoftext|>": 50256
 }
evaluation.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "text": "tokenizer/artifacts/heldout_eval_50k.txt",
+  "text": "tokenizer/artifacts/heldout_eval.txt",
   "limit": 10000,
   "overall": [
     {
@@ -32,14 +32,14 @@
       "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)",
       "total_chars": 43290048,
       "total_bytes": 43442607,
-      "total_tokens": 8971173,
-      "tokens_per_1k_chars": 207.23407375293277,
-      "tokens_per_1k_bytes": 206.50632223798172,
-      "bytes_per_token": 4.842466754347509,
-      "chars_per_token": 4.825461285831853,
-      "p50_tokens_per_line": 510,
-      "p95_tokens_per_line": 2678,
-      "p95_tokens_per_1k_bytes_per_line": 261.3065326633166
+      "total_tokens": 8960195,
+      "tokens_per_1k_chars": 206.98048198052356,
+      "tokens_per_1k_bytes": 206.25362101312197,
+      "bytes_per_token": 4.848399727907707,
+      "chars_per_token": 4.831373424350698,
+      "p50_tokens_per_line": 509,
+      "p95_tokens_per_line": 2677,
+      "p95_tokens_per_1k_bytes_per_line": 260.9841827768014
     }
   ],
   "by_bucket": {
@@ -74,14 +74,14 @@
       "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)",
       "total_chars": 42393232,
       "total_bytes": 42542626,
-      "total_tokens": 8795657,
-      "tokens_per_1k_chars": 207.47785872990292,
-      "tokens_per_1k_bytes": 206.74927307026135,
-      "bytes_per_token": 4.836776377250727,
-      "chars_per_token": 4.8197914038712515,
-      "p50_tokens_per_line": 508,
-      "p95_tokens_per_line": 2643,
-      "p95_tokens_per_1k_bytes_per_line": 261.53846153846155
+      "total_tokens": 8786409,
+      "tokens_per_1k_chars": 207.25971070099115,
+      "tokens_per_1k_bytes": 206.5318910967085,
+      "bytes_per_token": 4.841867252025258,
+      "chars_per_token": 4.82486440137262,
+      "p50_tokens_per_line": 507,
+      "p95_tokens_per_line": 2644,
+      "p95_tokens_per_1k_bytes_per_line": 261.04640414124606
     }
   ],
   "mixed": [
@@ -115,14 +115,14 @@
       "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)",
       "total_chars": 896816,
       "total_bytes": 899981,
-      "total_tokens": 175516,
-      "tokens_per_1k_chars": 195.71015682146617,
-      "tokens_per_1k_bytes": 195.02189490667024,
-      "bytes_per_token": 5.12762938991317,
-      "chars_per_token": 5.109596845871601,
-      "p50_tokens_per_line": 930,
-      "p95_tokens_per_line": 6963,
-      "p95_tokens_per_1k_bytes_per_line": 256.91514299109235
+      "total_tokens": 173786,
+      "tokens_per_1k_chars": 193.78111006048064,
+      "tokens_per_1k_bytes": 193.0996321033444,
+      "bytes_per_token": 5.178673771189854,
+      "chars_per_token": 5.160461717284476,
+      "p50_tokens_per_line": 924,
+      "p95_tokens_per_line": 6938,
+      "p95_tokens_per_1k_bytes_per_line": 248.08184143222508
     }
   ]
 }
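For reference, the derived ratio fields above follow from the raw totals as simple quotients. The short Python sketch below (not the repository's actual evaluation script) recomputes the updated "overall" figures from the candidate tokenizer's totals.

# Sketch (not the repo's eval script): recompute the derived ratios in
# evaluation.json from the raw totals of the updated "overall" entry.
total_chars = 43290048
total_bytes = 43442607
total_tokens = 8960195

print("tokens_per_1k_chars:", total_tokens / total_chars * 1000)  # ~206.98
print("tokens_per_1k_bytes:", total_tokens / total_bytes * 1000)  # ~206.25
print("bytes_per_token:", total_bytes / total_tokens)             # ~4.848
print("chars_per_token:", total_chars / total_tokens)             # ~4.831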
tokenizer.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1a54fe6d4082efbb4fdbcf662349226590b07ed6f2ff85dedf29d4585881d1e8
-size 460673
+oid sha256:b33c900b86a6fd548544ce693be7800b55cfd81576f0b2c3a9b1a91836c069ec
+size 459544
tokenizer.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "added_tokens_decoder": {
-    "50252": {
+    "50256": {
       "content": "<|endoftext|>",
       "lstrip": false,
       "normalized": false,