rootxhacker committed
Commit cf36257 · verified · 1 parent: a043cfb

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. checkpoint-1000/config.json +30 -0
  2. checkpoint-1000/generation_config.json +7 -0
  3. checkpoint-1000/model.safetensors +3 -0
  4. checkpoint-1000/optimizer.pt +3 -0
  5. checkpoint-1000/rng_state.pth +3 -0
  6. checkpoint-1000/scheduler.pt +3 -0
  7. checkpoint-1000/special_tokens_map.json +24 -0
  8. checkpoint-1000/tokenizer.json +0 -0
  9. checkpoint-1000/tokenizer.model +3 -0
  10. checkpoint-1000/tokenizer_config.json +42 -0
  11. checkpoint-1000/trainer_state.json +934 -0
  12. checkpoint-1000/training_args.bin +3 -0
  13. checkpoint-10000/config.json +30 -0
  14. checkpoint-10000/generation_config.json +7 -0
  15. checkpoint-10000/model.safetensors +3 -0
  16. checkpoint-10000/optimizer.pt +3 -0
  17. checkpoint-10000/rng_state.pth +3 -0
  18. checkpoint-10000/scheduler.pt +3 -0
  19. checkpoint-10000/special_tokens_map.json +24 -0
  20. checkpoint-10000/tokenizer.json +0 -0
  21. checkpoint-10000/tokenizer.model +3 -0
  22. checkpoint-10000/tokenizer_config.json +42 -0
  23. checkpoint-10000/trainer_state.json +0 -0
  24. checkpoint-10000/training_args.bin +3 -0
  25. checkpoint-10500/config.json +30 -0
  26. checkpoint-10500/generation_config.json +7 -0
  27. checkpoint-10500/model.safetensors +3 -0
  28. checkpoint-10500/optimizer.pt +3 -0
  29. checkpoint-10500/rng_state.pth +3 -0
  30. checkpoint-10500/scheduler.pt +3 -0
  31. checkpoint-10500/special_tokens_map.json +24 -0
  32. checkpoint-10500/tokenizer.json +0 -0
  33. checkpoint-10500/tokenizer.model +3 -0
  34. checkpoint-10500/tokenizer_config.json +42 -0
  35. checkpoint-10500/trainer_state.json +0 -0
  36. checkpoint-10500/training_args.bin +3 -0
  37. checkpoint-11000/config.json +30 -0
  38. checkpoint-11000/generation_config.json +7 -0
  39. checkpoint-11000/model.safetensors +3 -0
  40. checkpoint-11000/optimizer.pt +3 -0
  41. checkpoint-11000/rng_state.pth +3 -0
  42. checkpoint-11000/scheduler.pt +3 -0
  43. checkpoint-11000/special_tokens_map.json +24 -0
  44. checkpoint-11000/tokenizer.json +0 -0
  45. checkpoint-11000/tokenizer.model +3 -0
  46. checkpoint-11000/tokenizer_config.json +42 -0
  47. checkpoint-11000/trainer_state.json +0 -0
  48. checkpoint-11000/training_args.bin +3 -0
  49. checkpoint-11500/config.json +30 -0
  50. checkpoint-11500/generation_config.json +7 -0
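Each checkpoint-NNNN folder above is a standard Hugging Face Trainer checkpoint: model weights, optimizer/scheduler/RNG state, tokenizer files, and training metadata. A minimal sketch of pulling one such folder from the Hub with huggingface_hub follows; the repo_id is a placeholder, not taken from this commit.

# Minimal sketch, assuming huggingface_hub is installed; repo_id is hypothetical.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="rootxhacker/example-repo",        # placeholder: substitute the actual repo
    allow_patterns=["checkpoint-11000/*"],     # fetch a single checkpoint folder
)
print(local_dir)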
checkpoint-1000/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 512,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 6,
+ "num_hidden_layers": 6,
+ "num_key_value_heads": 6,
+ "pad_token_id": 0,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.51.3",
+ "use_cache": true,
+ "vocab_size": 32000
+ }
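The config above describes a small Llama-architecture model: 6 layers, 6 attention heads of head_dim 128, hidden size 768, a 1024-wide MLP, a 32,000-token vocabulary, and a 512-token context. A sketch of rebuilding that architecture from these values to sanity-check its size (assumes a transformers version that accepts head_dim, as 4.51.3 does):

from transformers import LlamaConfig, LlamaForCausalLM

config = LlamaConfig(
    hidden_size=768, intermediate_size=1024,
    num_hidden_layers=6, num_attention_heads=6, num_key_value_heads=6,
    head_dim=128, max_position_embeddings=512, vocab_size=32000,
    bos_token_id=1, eos_token_id=2, pad_token_id=0,
    tie_word_embeddings=False,
)
model = LlamaForCausalLM(config)
# Roughly 77M float32 parameters, consistent with the ~310 MB model.safetensors below.
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")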
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.51.3"
+ }
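generation_config.json only pins the special-token ids, so generation falls back to the library defaults (greedy decoding). A sketch of loading a checkpoint folder end to end and generating from it; the local path is illustrative:

from transformers import AutoModelForCausalLM, AutoTokenizer

path = "checkpoint-11000"                      # illustrative local checkpoint path
model = AutoModelForCausalLM.from_pretrained(path)
tok = AutoTokenizer.from_pretrained(path)
out = model.generate(**tok("Once upon a time", return_tensors="pt"), max_new_tokens=32)
print(tok.decode(out[0], skip_special_tokens=True))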
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e145ec4cb5b1c6fe568db7242a666c165cc4f4486b6c483180464bc77839d7f
+ size 309900448
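The three lines above are a Git LFS pointer, not the weights themselves: the blob is stored out of band and identified only by its sha256 and byte size. At float32, 309,900,448 bytes works out to roughly 77M parameters. A small sketch of reading such a pointer:

def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "key value"; oid and size identify the real blob.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {"oid": fields["oid"], "size_bytes": int(fields["size"])}

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:9e145ec4cb5b1c6fe568db7242a666c165cc4f4486b6c483180464bc77839d7f\n"
    "size 309900448"
)
info = parse_lfs_pointer(pointer)
print(info["size_bytes"] / 4 / 1e6, "M float32 parameters")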
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:120cb0400da1c194dd0c3d5e7d8d348a540146047d251fd82e0cf02175099d30
+ size 619836730
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+ size 14244
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83c694d65a347a9628fbbeee3ca54f077b8171008c69915415bd49f7d02ea9bc
+ size 1064
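optimizer.pt, scheduler.pt and rng_state.pth are saved so a run can resume exactly where it stopped, not just reload weights. A sketch of the resume call, assuming the same training script that produced these checkpoints; the path is illustrative:

from transformers import Trainer

def resume_training(trainer: Trainer, checkpoint_dir: str = "checkpoint-1000"):
    # Trainer restores model weights, optimizer, LR scheduler and RNG state from the folder.
    return trainer.train(resume_from_checkpoint=checkpoint_dir)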
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
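tokenizer.model is the underlying SentencePiece model used by LlamaTokenizer; tokenizer.json is the equivalent fast-tokenizer serialization. A sketch of inspecting it directly (path illustrative, assumes the sentencepiece package is installed):

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="checkpoint-1000/tokenizer.model")
print(sp.get_piece_size())                 # 32000, matching vocab_size in config.json
print(sp.encode_as_ids("hello world"))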
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
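The tokenizer config uses <s>, </s> and <unk> as the only special tokens and reuses </s> as the pad token; the enormous model_max_length is the Transformers sentinel for "no limit". A sketch of loading it from a checkpoint folder (path illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-1000")
print(tok.bos_token, tok.eos_token, tok.pad_token)   # <s> </s> </s>
print(tok("hello world").input_ids[:3])              # add_bos_token=True prepends <s> (id 1)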
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,934 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.08945143905002571,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0008945143905002571,
14
+ "grad_norm": 4.029027938842773,
15
+ "learning_rate": 1.9996779676178552e-05,
16
+ "loss": 6.931,
17
+ "mean_token_accuracy": 0.1403668148443103,
18
+ "num_tokens": 224900.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.0017890287810005143,
23
+ "grad_norm": 3.8816776275634766,
24
+ "learning_rate": 1.999320153859916e-05,
25
+ "loss": 6.6428,
26
+ "mean_token_accuracy": 0.15601451508700848,
27
+ "num_tokens": 449350.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.0026835431715007715,
32
+ "grad_norm": 3.696845531463623,
33
+ "learning_rate": 1.9989623401019772e-05,
34
+ "loss": 6.3909,
35
+ "mean_token_accuracy": 0.17340584620833396,
36
+ "num_tokens": 673110.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.0035780575620010286,
41
+ "grad_norm": 3.57285213470459,
42
+ "learning_rate": 1.998604526344038e-05,
43
+ "loss": 6.2,
44
+ "mean_token_accuracy": 0.20353572219610214,
45
+ "num_tokens": 897311.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.004472571952501286,
50
+ "grad_norm": 3.2977962493896484,
51
+ "learning_rate": 1.9982467125860992e-05,
52
+ "loss": 5.9805,
53
+ "mean_token_accuracy": 0.21226616874337195,
54
+ "num_tokens": 1121992.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.005367086343001543,
59
+ "grad_norm": 3.0260677337646484,
60
+ "learning_rate": 1.99788889882816e-05,
61
+ "loss": 5.7785,
62
+ "mean_token_accuracy": 0.22370390295982362,
63
+ "num_tokens": 1345684.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.0062616007335018,
68
+ "grad_norm": 2.671483039855957,
69
+ "learning_rate": 1.997531085070221e-05,
70
+ "loss": 5.5885,
71
+ "mean_token_accuracy": 0.23743247389793395,
72
+ "num_tokens": 1568546.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.007156115124002057,
77
+ "grad_norm": 2.5646092891693115,
78
+ "learning_rate": 1.9971732713122823e-05,
79
+ "loss": 5.4085,
80
+ "mean_token_accuracy": 0.24720929898321628,
81
+ "num_tokens": 1791562.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.008050629514502314,
86
+ "grad_norm": 4.9957194328308105,
87
+ "learning_rate": 1.996815457554343e-05,
88
+ "loss": 5.2511,
89
+ "mean_token_accuracy": 0.25344080217182635,
90
+ "num_tokens": 2016530.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.008945143905002572,
95
+ "grad_norm": 2.079987049102783,
96
+ "learning_rate": 1.996457643796404e-05,
97
+ "loss": 5.1385,
98
+ "mean_token_accuracy": 0.2677029874175787,
99
+ "num_tokens": 2240898.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.009839658295502828,
104
+ "grad_norm": 1.875989556312561,
105
+ "learning_rate": 1.996099830038465e-05,
106
+ "loss": 4.9731,
107
+ "mean_token_accuracy": 0.2723236083984375,
108
+ "num_tokens": 2464033.0,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 0.010734172686003086,
113
+ "grad_norm": 1.7837793827056885,
114
+ "learning_rate": 1.995742016280526e-05,
115
+ "loss": 4.8349,
116
+ "mean_token_accuracy": 0.27978694066405296,
117
+ "num_tokens": 2687104.0,
118
+ "step": 120
119
+ },
120
+ {
121
+ "epoch": 0.011628687076503344,
122
+ "grad_norm": 1.6277521848678589,
123
+ "learning_rate": 1.995384202522587e-05,
124
+ "loss": 4.6858,
125
+ "mean_token_accuracy": 0.29452711045742036,
126
+ "num_tokens": 2911815.0,
127
+ "step": 130
128
+ },
129
+ {
130
+ "epoch": 0.0125232014670036,
131
+ "grad_norm": 1.5478984117507935,
132
+ "learning_rate": 1.9950263887646483e-05,
133
+ "loss": 4.547,
134
+ "mean_token_accuracy": 0.3018424347043037,
135
+ "num_tokens": 3136290.0,
136
+ "step": 140
137
+ },
138
+ {
139
+ "epoch": 0.013417715857503858,
140
+ "grad_norm": 1.314953327178955,
141
+ "learning_rate": 1.994668575006709e-05,
142
+ "loss": 4.4302,
143
+ "mean_token_accuracy": 0.3075145646929741,
144
+ "num_tokens": 3361157.0,
145
+ "step": 150
146
+ },
147
+ {
148
+ "epoch": 0.014312230248004114,
149
+ "grad_norm": 1.1404365301132202,
150
+ "learning_rate": 1.9943107612487703e-05,
151
+ "loss": 4.329,
152
+ "mean_token_accuracy": 0.32004141956567767,
153
+ "num_tokens": 3585994.0,
154
+ "step": 160
155
+ },
156
+ {
157
+ "epoch": 0.015206744638504372,
158
+ "grad_norm": 1.1736400127410889,
159
+ "learning_rate": 1.993952947490831e-05,
160
+ "loss": 4.2576,
161
+ "mean_token_accuracy": 0.3258902974426746,
162
+ "num_tokens": 3810406.0,
163
+ "step": 170
164
+ },
165
+ {
166
+ "epoch": 0.01610125902900463,
167
+ "grad_norm": 0.945701003074646,
168
+ "learning_rate": 1.9935951337328923e-05,
169
+ "loss": 4.1854,
170
+ "mean_token_accuracy": 0.33087451085448266,
171
+ "num_tokens": 4033558.0,
172
+ "step": 180
173
+ },
174
+ {
175
+ "epoch": 0.016995773419504888,
176
+ "grad_norm": 0.8116940855979919,
177
+ "learning_rate": 1.9932373199749534e-05,
178
+ "loss": 4.1516,
179
+ "mean_token_accuracy": 0.33102416023612025,
180
+ "num_tokens": 4257791.0,
181
+ "step": 190
182
+ },
183
+ {
184
+ "epoch": 0.017890287810005144,
185
+ "grad_norm": 1.1380091905593872,
186
+ "learning_rate": 1.9928795062170142e-05,
187
+ "loss": 4.0638,
188
+ "mean_token_accuracy": 0.3386186122894287,
189
+ "num_tokens": 4481532.0,
190
+ "step": 200
191
+ },
192
+ {
193
+ "epoch": 0.0187848022005054,
194
+ "grad_norm": 0.690470814704895,
195
+ "learning_rate": 1.992521692459075e-05,
196
+ "loss": 4.0419,
197
+ "mean_token_accuracy": 0.33742879405617715,
198
+ "num_tokens": 4707304.0,
199
+ "step": 210
200
+ },
201
+ {
202
+ "epoch": 0.019679316591005656,
203
+ "grad_norm": 0.6643151044845581,
204
+ "learning_rate": 1.9921638787011362e-05,
205
+ "loss": 3.9833,
206
+ "mean_token_accuracy": 0.34320330172777175,
207
+ "num_tokens": 4933695.0,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 0.020573830981505916,
212
+ "grad_norm": 0.7354695200920105,
213
+ "learning_rate": 1.991806064943197e-05,
214
+ "loss": 3.9584,
215
+ "mean_token_accuracy": 0.34674171581864355,
216
+ "num_tokens": 5157252.0,
217
+ "step": 230
218
+ },
219
+ {
220
+ "epoch": 0.021468345372006172,
221
+ "grad_norm": 0.6649508476257324,
222
+ "learning_rate": 1.9914482511852582e-05,
223
+ "loss": 3.8969,
224
+ "mean_token_accuracy": 0.3512790575623512,
225
+ "num_tokens": 5382779.0,
226
+ "step": 240
227
+ },
228
+ {
229
+ "epoch": 0.02236285976250643,
230
+ "grad_norm": 0.68132483959198,
231
+ "learning_rate": 1.9910904374273194e-05,
232
+ "loss": 3.8822,
233
+ "mean_token_accuracy": 0.35188654661178587,
234
+ "num_tokens": 5606727.0,
235
+ "step": 250
236
+ },
237
+ {
238
+ "epoch": 0.023257374153006688,
239
+ "grad_norm": 0.5945841670036316,
240
+ "learning_rate": 1.9907326236693802e-05,
241
+ "loss": 3.845,
242
+ "mean_token_accuracy": 0.3524425096809864,
243
+ "num_tokens": 5830914.0,
244
+ "step": 260
245
+ },
246
+ {
247
+ "epoch": 0.024151888543506944,
248
+ "grad_norm": 0.5904633402824402,
249
+ "learning_rate": 1.9903748099114414e-05,
250
+ "loss": 3.8385,
251
+ "mean_token_accuracy": 0.3543414056301117,
252
+ "num_tokens": 6054926.0,
253
+ "step": 270
254
+ },
255
+ {
256
+ "epoch": 0.0250464029340072,
257
+ "grad_norm": 0.5699465870857239,
258
+ "learning_rate": 1.9900169961535022e-05,
259
+ "loss": 3.7648,
260
+ "mean_token_accuracy": 0.36298312023282053,
261
+ "num_tokens": 6279612.0,
262
+ "step": 280
263
+ },
264
+ {
265
+ "epoch": 0.025940917324507456,
266
+ "grad_norm": 0.55901038646698,
267
+ "learning_rate": 1.9896591823955633e-05,
268
+ "loss": 3.7746,
269
+ "mean_token_accuracy": 0.36002359017729757,
270
+ "num_tokens": 6505051.0,
271
+ "step": 290
272
+ },
273
+ {
274
+ "epoch": 0.026835431715007716,
275
+ "grad_norm": 0.6177819967269897,
276
+ "learning_rate": 1.9893013686376245e-05,
277
+ "loss": 3.726,
278
+ "mean_token_accuracy": 0.366513279825449,
279
+ "num_tokens": 6728046.0,
280
+ "step": 300
281
+ },
282
+ {
283
+ "epoch": 0.027729946105507972,
284
+ "grad_norm": 0.525486409664154,
285
+ "learning_rate": 1.9889435548796853e-05,
286
+ "loss": 3.7335,
287
+ "mean_token_accuracy": 0.3637064002454281,
288
+ "num_tokens": 6952250.0,
289
+ "step": 310
290
+ },
291
+ {
292
+ "epoch": 0.02862446049600823,
293
+ "grad_norm": 0.5619438886642456,
294
+ "learning_rate": 1.9885857411217465e-05,
295
+ "loss": 3.7086,
296
+ "mean_token_accuracy": 0.3649679072201252,
297
+ "num_tokens": 7177356.0,
298
+ "step": 320
299
+ },
300
+ {
301
+ "epoch": 0.029518974886508488,
302
+ "grad_norm": 0.5715098977088928,
303
+ "learning_rate": 1.9882279273638073e-05,
304
+ "loss": 3.7003,
305
+ "mean_token_accuracy": 0.36583819389343264,
306
+ "num_tokens": 7401855.0,
307
+ "step": 330
308
+ },
309
+ {
310
+ "epoch": 0.030413489277008744,
311
+ "grad_norm": 0.5622343420982361,
312
+ "learning_rate": 1.987870113605868e-05,
313
+ "loss": 3.6937,
314
+ "mean_token_accuracy": 0.36574283242225647,
315
+ "num_tokens": 7627253.0,
316
+ "step": 340
317
+ },
318
+ {
319
+ "epoch": 0.031308003667509,
320
+ "grad_norm": 0.4998467266559601,
321
+ "learning_rate": 1.9875122998479293e-05,
322
+ "loss": 3.6644,
323
+ "mean_token_accuracy": 0.3698362477123737,
324
+ "num_tokens": 7851897.0,
325
+ "step": 350
326
+ },
327
+ {
328
+ "epoch": 0.03220251805800926,
329
+ "grad_norm": 0.6738699078559875,
330
+ "learning_rate": 1.98715448608999e-05,
331
+ "loss": 3.643,
332
+ "mean_token_accuracy": 0.3715419560670853,
333
+ "num_tokens": 8076130.0,
334
+ "step": 360
335
+ },
336
+ {
337
+ "epoch": 0.03309703244850951,
338
+ "grad_norm": 0.584710955619812,
339
+ "learning_rate": 1.9867966723320513e-05,
340
+ "loss": 3.6288,
341
+ "mean_token_accuracy": 0.3715605862438679,
342
+ "num_tokens": 8300267.0,
343
+ "step": 370
344
+ },
345
+ {
346
+ "epoch": 0.033991546839009776,
347
+ "grad_norm": 0.6669703722000122,
348
+ "learning_rate": 1.9864388585741125e-05,
349
+ "loss": 3.608,
350
+ "mean_token_accuracy": 0.374962493032217,
351
+ "num_tokens": 8523697.0,
352
+ "step": 380
353
+ },
354
+ {
355
+ "epoch": 0.03488606122951003,
356
+ "grad_norm": 0.7256543040275574,
357
+ "learning_rate": 1.9860810448161733e-05,
358
+ "loss": 3.5934,
359
+ "mean_token_accuracy": 0.3755581140518188,
360
+ "num_tokens": 8747098.0,
361
+ "step": 390
362
+ },
363
+ {
364
+ "epoch": 0.03578057562001029,
365
+ "grad_norm": 0.6731703281402588,
366
+ "learning_rate": 1.9857232310582344e-05,
367
+ "loss": 3.5885,
368
+ "mean_token_accuracy": 0.37688973248004914,
369
+ "num_tokens": 8971344.0,
370
+ "step": 400
371
+ },
372
+ {
373
+ "epoch": 0.036675090010510544,
374
+ "grad_norm": 0.9010092616081238,
375
+ "learning_rate": 1.9853654173002953e-05,
376
+ "loss": 3.5777,
377
+ "mean_token_accuracy": 0.37828439101576805,
378
+ "num_tokens": 9197198.0,
379
+ "step": 410
380
+ },
381
+ {
382
+ "epoch": 0.0375696044010108,
383
+ "grad_norm": 0.4792615473270416,
384
+ "learning_rate": 1.9850076035423564e-05,
385
+ "loss": 3.5561,
386
+ "mean_token_accuracy": 0.3797303937375546,
387
+ "num_tokens": 9421420.0,
388
+ "step": 420
389
+ },
390
+ {
391
+ "epoch": 0.03846411879151106,
392
+ "grad_norm": 0.5398027896881104,
393
+ "learning_rate": 1.9846497897844176e-05,
394
+ "loss": 3.5485,
395
+ "mean_token_accuracy": 0.38109720274806025,
396
+ "num_tokens": 9646007.0,
397
+ "step": 430
398
+ },
399
+ {
400
+ "epoch": 0.03935863318201131,
401
+ "grad_norm": 3.100107431411743,
402
+ "learning_rate": 1.9842919760264784e-05,
403
+ "loss": 3.5545,
404
+ "mean_token_accuracy": 0.3794835329055786,
405
+ "num_tokens": 9869980.0,
406
+ "step": 440
407
+ },
408
+ {
409
+ "epoch": 0.040253147572511576,
410
+ "grad_norm": 0.646981418132782,
411
+ "learning_rate": 1.9839341622685392e-05,
412
+ "loss": 3.536,
413
+ "mean_token_accuracy": 0.382675875723362,
414
+ "num_tokens": 10093289.0,
415
+ "step": 450
416
+ },
417
+ {
418
+ "epoch": 0.04114766196301183,
419
+ "grad_norm": 0.6310556530952454,
420
+ "learning_rate": 1.9835763485106004e-05,
421
+ "loss": 3.5114,
422
+ "mean_token_accuracy": 0.3832168258726597,
423
+ "num_tokens": 10318254.0,
424
+ "step": 460
425
+ },
426
+ {
427
+ "epoch": 0.04204217635351209,
428
+ "grad_norm": 0.5271363258361816,
429
+ "learning_rate": 1.9832185347526612e-05,
430
+ "loss": 3.4967,
431
+ "mean_token_accuracy": 0.3867050640285015,
432
+ "num_tokens": 10541404.0,
433
+ "step": 470
434
+ },
435
+ {
436
+ "epoch": 0.042936690744012344,
437
+ "grad_norm": 0.5193490982055664,
438
+ "learning_rate": 1.9828607209947224e-05,
439
+ "loss": 3.5113,
440
+ "mean_token_accuracy": 0.3860153049230576,
441
+ "num_tokens": 10765992.0,
442
+ "step": 480
443
+ },
444
+ {
445
+ "epoch": 0.0438312051345126,
446
+ "grad_norm": 0.5134871006011963,
447
+ "learning_rate": 1.9825029072367835e-05,
448
+ "loss": 3.5039,
449
+ "mean_token_accuracy": 0.38554045259952546,
450
+ "num_tokens": 10992282.0,
451
+ "step": 490
452
+ },
453
+ {
454
+ "epoch": 0.04472571952501286,
455
+ "grad_norm": 0.4887460768222809,
456
+ "learning_rate": 1.9821450934788444e-05,
457
+ "loss": 3.4895,
458
+ "mean_token_accuracy": 0.3860923834145069,
459
+ "num_tokens": 11217294.0,
460
+ "step": 500
461
+ },
462
+ {
463
+ "epoch": 0.04562023391551311,
464
+ "grad_norm": 0.5546553730964661,
465
+ "learning_rate": 1.9817872797209055e-05,
466
+ "loss": 3.4938,
467
+ "mean_token_accuracy": 0.38622146248817446,
468
+ "num_tokens": 11442608.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 0.046514748306013376,
473
+ "grad_norm": 0.5706290602684021,
474
+ "learning_rate": 1.9814294659629664e-05,
475
+ "loss": 3.4489,
476
+ "mean_token_accuracy": 0.39343543276190757,
477
+ "num_tokens": 11665939.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 0.04740926269651363,
482
+ "grad_norm": 0.4757273197174072,
483
+ "learning_rate": 1.9810716522050275e-05,
484
+ "loss": 3.4634,
485
+ "mean_token_accuracy": 0.3893909424543381,
486
+ "num_tokens": 11891171.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 0.04830377708701389,
491
+ "grad_norm": 0.5140799283981323,
492
+ "learning_rate": 1.9807138384470887e-05,
493
+ "loss": 3.4538,
494
+ "mean_token_accuracy": 0.3925728119909763,
495
+ "num_tokens": 12115773.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 0.049198291477514144,
500
+ "grad_norm": 0.4599289894104004,
501
+ "learning_rate": 1.9803560246891495e-05,
502
+ "loss": 3.4482,
503
+ "mean_token_accuracy": 0.39118969812989235,
504
+ "num_tokens": 12339933.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 0.0500928058680144,
509
+ "grad_norm": 0.517117440700531,
510
+ "learning_rate": 1.9799982109312103e-05,
511
+ "loss": 3.4205,
512
+ "mean_token_accuracy": 0.3933353215456009,
513
+ "num_tokens": 12564950.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 0.05098732025851466,
518
+ "grad_norm": 0.7124619483947754,
519
+ "learning_rate": 1.9796403971732715e-05,
520
+ "loss": 3.4234,
521
+ "mean_token_accuracy": 0.39552380964159967,
522
+ "num_tokens": 12790863.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 0.05188183464901491,
527
+ "grad_norm": 10.448816299438477,
528
+ "learning_rate": 1.9792825834153323e-05,
529
+ "loss": 3.4277,
530
+ "mean_token_accuracy": 0.3924214608967304,
531
+ "num_tokens": 13017450.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 0.052776349039515176,
536
+ "grad_norm": 0.5431159734725952,
537
+ "learning_rate": 1.9789247696573935e-05,
538
+ "loss": 3.4328,
539
+ "mean_token_accuracy": 0.39353245720267294,
540
+ "num_tokens": 13241608.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 0.05367086343001543,
545
+ "grad_norm": 0.5687503814697266,
546
+ "learning_rate": 1.9785669558994546e-05,
547
+ "loss": 3.3989,
548
+ "mean_token_accuracy": 0.39768306240439416,
549
+ "num_tokens": 13465104.0,
550
+ "step": 600
551
+ },
552
+ {
553
+ "epoch": 0.05456537782051569,
554
+ "grad_norm": 0.5452563166618347,
555
+ "learning_rate": 1.9782091421415155e-05,
556
+ "loss": 3.3754,
557
+ "mean_token_accuracy": 0.39803339168429375,
558
+ "num_tokens": 13689303.0,
559
+ "step": 610
560
+ },
561
+ {
562
+ "epoch": 0.055459892211015945,
563
+ "grad_norm": 0.4787168800830841,
564
+ "learning_rate": 1.9778513283835766e-05,
565
+ "loss": 3.3908,
566
+ "mean_token_accuracy": 0.3983615793287754,
567
+ "num_tokens": 13913069.0,
568
+ "step": 620
569
+ },
570
+ {
571
+ "epoch": 0.0563544066015162,
572
+ "grad_norm": 0.533787190914154,
573
+ "learning_rate": 1.9774935146256374e-05,
574
+ "loss": 3.4083,
575
+ "mean_token_accuracy": 0.3976218432188034,
576
+ "num_tokens": 14136873.0,
577
+ "step": 630
578
+ },
579
+ {
580
+ "epoch": 0.05724892099201646,
581
+ "grad_norm": 0.6915440559387207,
582
+ "learning_rate": 1.9771357008676986e-05,
583
+ "loss": 3.3768,
584
+ "mean_token_accuracy": 0.40016965195536613,
585
+ "num_tokens": 14359757.0,
586
+ "step": 640
587
+ },
588
+ {
589
+ "epoch": 0.05814343538251671,
590
+ "grad_norm": 0.5388856530189514,
591
+ "learning_rate": 1.9767778871097598e-05,
592
+ "loss": 3.3652,
593
+ "mean_token_accuracy": 0.40155375823378564,
594
+ "num_tokens": 14583389.0,
595
+ "step": 650
596
+ },
597
+ {
598
+ "epoch": 0.059037949773016976,
599
+ "grad_norm": 0.5853003263473511,
600
+ "learning_rate": 1.9764200733518206e-05,
601
+ "loss": 3.401,
602
+ "mean_token_accuracy": 0.39647991508245467,
603
+ "num_tokens": 14806251.0,
604
+ "step": 660
605
+ },
606
+ {
607
+ "epoch": 0.05993246416351723,
608
+ "grad_norm": 0.6135736703872681,
609
+ "learning_rate": 1.9760622595938818e-05,
610
+ "loss": 3.4031,
611
+ "mean_token_accuracy": 0.3982516027987003,
612
+ "num_tokens": 15031471.0,
613
+ "step": 670
614
+ },
615
+ {
616
+ "epoch": 0.06082697855401749,
617
+ "grad_norm": 0.45374274253845215,
618
+ "learning_rate": 1.9757044458359426e-05,
619
+ "loss": 3.3841,
620
+ "mean_token_accuracy": 0.3975381299853325,
621
+ "num_tokens": 15256769.0,
622
+ "step": 680
623
+ },
624
+ {
625
+ "epoch": 0.061721492944517745,
626
+ "grad_norm": 0.5736910700798035,
627
+ "learning_rate": 1.9753466320780034e-05,
628
+ "loss": 3.383,
629
+ "mean_token_accuracy": 0.3996949538588524,
630
+ "num_tokens": 15479684.0,
631
+ "step": 690
632
+ },
633
+ {
634
+ "epoch": 0.062616007335018,
635
+ "grad_norm": 0.5454510450363159,
636
+ "learning_rate": 1.9749888183200646e-05,
637
+ "loss": 3.352,
638
+ "mean_token_accuracy": 0.40254419967532157,
639
+ "num_tokens": 15704356.0,
640
+ "step": 700
641
+ },
642
+ {
643
+ "epoch": 0.06351052172551826,
644
+ "grad_norm": 0.5370535850524902,
645
+ "learning_rate": 1.9746310045621254e-05,
646
+ "loss": 3.3559,
647
+ "mean_token_accuracy": 0.40261620208621024,
648
+ "num_tokens": 15928903.0,
649
+ "step": 710
650
+ },
651
+ {
652
+ "epoch": 0.06440503611601851,
653
+ "grad_norm": 0.6735969185829163,
654
+ "learning_rate": 1.9742731908041865e-05,
655
+ "loss": 3.3582,
656
+ "mean_token_accuracy": 0.40281880721449853,
657
+ "num_tokens": 16154120.0,
658
+ "step": 720
659
+ },
660
+ {
661
+ "epoch": 0.06529955050651877,
662
+ "grad_norm": 0.46152418851852417,
663
+ "learning_rate": 1.9739153770462477e-05,
664
+ "loss": 3.3388,
665
+ "mean_token_accuracy": 0.4018984198570251,
666
+ "num_tokens": 16378150.0,
667
+ "step": 730
668
+ },
669
+ {
670
+ "epoch": 0.06619406489701903,
671
+ "grad_norm": 0.5333797335624695,
672
+ "learning_rate": 1.9735575632883085e-05,
673
+ "loss": 3.358,
674
+ "mean_token_accuracy": 0.40131590217351915,
675
+ "num_tokens": 16602518.0,
676
+ "step": 740
677
+ },
678
+ {
679
+ "epoch": 0.06708857928751928,
680
+ "grad_norm": 0.6620674729347229,
681
+ "learning_rate": 1.9731997495303697e-05,
682
+ "loss": 3.3597,
683
+ "mean_token_accuracy": 0.40183877646923066,
684
+ "num_tokens": 16825308.0,
685
+ "step": 750
686
+ },
687
+ {
688
+ "epoch": 0.06798309367801955,
689
+ "grad_norm": 0.4112262427806854,
690
+ "learning_rate": 1.9728419357724305e-05,
691
+ "loss": 3.3498,
692
+ "mean_token_accuracy": 0.4032081626355648,
693
+ "num_tokens": 17048788.0,
694
+ "step": 760
695
+ },
696
+ {
697
+ "epoch": 0.06887760806851981,
698
+ "grad_norm": 0.49325069785118103,
699
+ "learning_rate": 1.9724841220144917e-05,
700
+ "loss": 3.3523,
701
+ "mean_token_accuracy": 0.4036438427865505,
702
+ "num_tokens": 17273104.0,
703
+ "step": 770
704
+ },
705
+ {
706
+ "epoch": 0.06977212245902006,
707
+ "grad_norm": 0.5840951204299927,
708
+ "learning_rate": 1.972126308256553e-05,
709
+ "loss": 3.3446,
710
+ "mean_token_accuracy": 0.4040109634399414,
711
+ "num_tokens": 17497961.0,
712
+ "step": 780
713
+ },
714
+ {
715
+ "epoch": 0.07066663684952032,
716
+ "grad_norm": 0.49413686990737915,
717
+ "learning_rate": 1.9717684944986137e-05,
718
+ "loss": 3.3309,
719
+ "mean_token_accuracy": 0.40450835302472116,
720
+ "num_tokens": 17722726.0,
721
+ "step": 790
722
+ },
723
+ {
724
+ "epoch": 0.07156115124002058,
725
+ "grad_norm": 0.6528025269508362,
726
+ "learning_rate": 1.9714106807406745e-05,
727
+ "loss": 3.3427,
728
+ "mean_token_accuracy": 0.4030210435390472,
729
+ "num_tokens": 17946378.0,
730
+ "step": 800
731
+ },
732
+ {
733
+ "epoch": 0.07245566563052083,
734
+ "grad_norm": 0.5769058465957642,
735
+ "learning_rate": 1.9710528669827357e-05,
736
+ "loss": 3.3423,
737
+ "mean_token_accuracy": 0.4021991953253746,
738
+ "num_tokens": 18170591.0,
739
+ "step": 810
740
+ },
741
+ {
742
+ "epoch": 0.07335018002102109,
743
+ "grad_norm": 0.6946350336074829,
744
+ "learning_rate": 1.9706950532247965e-05,
745
+ "loss": 3.322,
746
+ "mean_token_accuracy": 0.40520998015999793,
747
+ "num_tokens": 18395601.0,
748
+ "step": 820
749
+ },
750
+ {
751
+ "epoch": 0.07424469441152134,
752
+ "grad_norm": 0.5611916184425354,
753
+ "learning_rate": 1.9703372394668576e-05,
754
+ "loss": 3.3172,
755
+ "mean_token_accuracy": 0.4065264783799648,
756
+ "num_tokens": 18619829.0,
757
+ "step": 830
758
+ },
759
+ {
760
+ "epoch": 0.0751392088020216,
761
+ "grad_norm": 0.5267366170883179,
762
+ "learning_rate": 1.9699794257089188e-05,
763
+ "loss": 3.3504,
764
+ "mean_token_accuracy": 0.4045166805386543,
765
+ "num_tokens": 18844573.0,
766
+ "step": 840
767
+ },
768
+ {
769
+ "epoch": 0.07603372319252186,
770
+ "grad_norm": 0.5963064432144165,
771
+ "learning_rate": 1.9696216119509796e-05,
772
+ "loss": 3.3026,
773
+ "mean_token_accuracy": 0.4085647910833359,
774
+ "num_tokens": 19071174.0,
775
+ "step": 850
776
+ },
777
+ {
778
+ "epoch": 0.07692823758302211,
779
+ "grad_norm": 0.4585157632827759,
780
+ "learning_rate": 1.9692637981930408e-05,
781
+ "loss": 3.2948,
782
+ "mean_token_accuracy": 0.4082924917340279,
783
+ "num_tokens": 19296052.0,
784
+ "step": 860
785
+ },
786
+ {
787
+ "epoch": 0.07782275197352237,
788
+ "grad_norm": 0.5613287687301636,
789
+ "learning_rate": 1.9689059844351016e-05,
790
+ "loss": 3.3167,
791
+ "mean_token_accuracy": 0.40675563290715216,
792
+ "num_tokens": 19521709.0,
793
+ "step": 870
794
+ },
795
+ {
796
+ "epoch": 0.07871726636402263,
797
+ "grad_norm": 0.4587007761001587,
798
+ "learning_rate": 1.9685481706771628e-05,
799
+ "loss": 3.305,
800
+ "mean_token_accuracy": 0.4078836299479008,
801
+ "num_tokens": 19745334.0,
802
+ "step": 880
803
+ },
804
+ {
805
+ "epoch": 0.07961178075452288,
806
+ "grad_norm": 0.5072513818740845,
807
+ "learning_rate": 1.968190356919224e-05,
808
+ "loss": 3.3259,
809
+ "mean_token_accuracy": 0.40673111006617546,
810
+ "num_tokens": 19969825.0,
811
+ "step": 890
812
+ },
813
+ {
814
+ "epoch": 0.08050629514502315,
815
+ "grad_norm": 0.5777090787887573,
816
+ "learning_rate": 1.9678325431612848e-05,
817
+ "loss": 3.2855,
818
+ "mean_token_accuracy": 0.4096429578959942,
819
+ "num_tokens": 20193404.0,
820
+ "step": 900
821
+ },
822
+ {
823
+ "epoch": 0.08140080953552341,
824
+ "grad_norm": 0.5001935362815857,
825
+ "learning_rate": 1.967474729403346e-05,
826
+ "loss": 3.287,
827
+ "mean_token_accuracy": 0.41084871664643285,
828
+ "num_tokens": 20418412.0,
829
+ "step": 910
830
+ },
831
+ {
832
+ "epoch": 0.08229532392602366,
833
+ "grad_norm": 0.560683012008667,
834
+ "learning_rate": 1.9671169156454067e-05,
835
+ "loss": 3.3084,
836
+ "mean_token_accuracy": 0.4078595593571663,
837
+ "num_tokens": 20642807.0,
838
+ "step": 920
839
+ },
840
+ {
841
+ "epoch": 0.08318983831652392,
842
+ "grad_norm": 0.7433478832244873,
843
+ "learning_rate": 1.9667591018874676e-05,
844
+ "loss": 3.3168,
845
+ "mean_token_accuracy": 0.40866749435663224,
846
+ "num_tokens": 20866434.0,
847
+ "step": 930
848
+ },
849
+ {
850
+ "epoch": 0.08408435270702418,
851
+ "grad_norm": 0.47655490040779114,
852
+ "learning_rate": 1.9664012881295287e-05,
853
+ "loss": 3.2776,
854
+ "mean_token_accuracy": 0.40998933985829356,
855
+ "num_tokens": 21092326.0,
856
+ "step": 940
857
+ },
858
+ {
859
+ "epoch": 0.08497886709752443,
860
+ "grad_norm": 0.6603720784187317,
861
+ "learning_rate": 1.9660434743715895e-05,
862
+ "loss": 3.246,
863
+ "mean_token_accuracy": 0.41296741738915443,
864
+ "num_tokens": 21317008.0,
865
+ "step": 950
866
+ },
867
+ {
868
+ "epoch": 0.08587338148802469,
869
+ "grad_norm": 0.6937066912651062,
870
+ "learning_rate": 1.9656856606136507e-05,
871
+ "loss": 3.2978,
872
+ "mean_token_accuracy": 0.4103521354496479,
873
+ "num_tokens": 21541185.0,
874
+ "step": 960
875
+ },
876
+ {
877
+ "epoch": 0.08676789587852494,
878
+ "grad_norm": 0.5999243259429932,
879
+ "learning_rate": 1.965327846855712e-05,
880
+ "loss": 3.2726,
881
+ "mean_token_accuracy": 0.41270416751503947,
882
+ "num_tokens": 21765805.0,
883
+ "step": 970
884
+ },
885
+ {
886
+ "epoch": 0.0876624102690252,
887
+ "grad_norm": 0.4740482270717621,
888
+ "learning_rate": 1.9649700330977727e-05,
889
+ "loss": 3.2616,
890
+ "mean_token_accuracy": 0.41271830424666406,
891
+ "num_tokens": 21993399.0,
892
+ "step": 980
893
+ },
894
+ {
895
+ "epoch": 0.08855692465952546,
896
+ "grad_norm": 0.4714813530445099,
897
+ "learning_rate": 1.964612219339834e-05,
898
+ "loss": 3.2992,
899
+ "mean_token_accuracy": 0.4101860985159874,
900
+ "num_tokens": 22217604.0,
901
+ "step": 990
902
+ },
903
+ {
904
+ "epoch": 0.08945143905002571,
905
+ "grad_norm": 0.422974556684494,
906
+ "learning_rate": 1.9642544055818947e-05,
907
+ "loss": 3.2634,
908
+ "mean_token_accuracy": 0.41224386021494863,
909
+ "num_tokens": 22442384.0,
910
+ "step": 1000
911
+ }
912
+ ],
913
+ "logging_steps": 10,
914
+ "max_steps": 55895,
915
+ "num_input_tokens_seen": 0,
916
+ "num_train_epochs": 5,
917
+ "save_steps": 500,
918
+ "stateful_callbacks": {
919
+ "TrainerControl": {
920
+ "args": {
921
+ "should_epoch_stop": false,
922
+ "should_evaluate": false,
923
+ "should_log": false,
924
+ "should_save": true,
925
+ "should_training_stop": false
926
+ },
927
+ "attributes": {}
928
+ }
929
+ },
930
+ "total_flos": 1.1315610691043328e+16,
931
+ "train_batch_size": 64,
932
+ "trial_name": null,
933
+ "trial_params": null
934
+ }
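trainer_state.json is plain JSON; log_history holds one record per logging step (every 10 steps here), and this first checkpoint shows the loss falling from about 6.93 to 3.26 over 1,000 steps (~22.4M tokens, ~0.09 of an epoch, out of 55,895 max steps). A sketch of pulling the loss curve out of it (path illustrative):

import json

with open("checkpoint-1000/trainer_state.json") as f:
    state = json.load(f)

records = [r for r in state["log_history"] if "loss" in r]
steps = [r["step"] for r in records]
losses = [r["loss"] for r in records]
print(f"global_step={state['global_step']} of max_steps={state['max_steps']}")
print(f"loss {losses[0]:.2f} -> {losses[-1]:.2f} across {len(steps)} logged points")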
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eec90019e03b3308744cf7428c0bc743b8dd914d925c9059ff38ec49b74a159f
+ size 5688
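training_args.bin is a pickled TrainingArguments object saved by Trainer, so it should only be unpickled from a trusted source. A sketch of inspecting it (path illustrative; the attributes are standard TrainingArguments fields):

import torch

args = torch.load("checkpoint-1000/training_args.bin", weights_only=False)
print(args.per_device_train_batch_size, args.learning_rate, args.num_train_epochs)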
checkpoint-10000/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 512,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 6,
+ "num_hidden_layers": 6,
+ "num_key_value_heads": 6,
+ "pad_token_id": 0,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.51.3",
+ "use_cache": true,
+ "vocab_size": 32000
+ }
checkpoint-10000/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.51.3"
+ }
checkpoint-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:458c8eba82e517f5ba85b362bd54b06f115679c56b6b1c4071cb969b5c55915f
+ size 309900448
checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ed7277ef8481f73983e89138757736d72118e72dd5e978643a274ade68505a4
+ size 619836730
checkpoint-10000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+ size 14244
checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d5bb0c2380f80076bc24221e137f860d6cf7f973dc1aa82881bce26bcfde343
+ size 1064
checkpoint-10000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
checkpoint-10000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-10000/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
checkpoint-10000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eec90019e03b3308744cf7428c0bc743b8dd914d925c9059ff38ec49b74a159f
+ size 5688
checkpoint-10500/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 512,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 6,
+ "num_hidden_layers": 6,
+ "num_key_value_heads": 6,
+ "pad_token_id": 0,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.51.3",
+ "use_cache": true,
+ "vocab_size": 32000
+ }
checkpoint-10500/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.51.3"
+ }
checkpoint-10500/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90611e57d216eaabc14ef4dbed35b6d3636be548a9d21e0c7a3d9be4f566bb4e
+ size 309900448
checkpoint-10500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b59b975e194b7bb1e98b21294e78c9c86dc0b58cd3892b629c2479b022994f8
+ size 619836730
checkpoint-10500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+ size 14244
checkpoint-10500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a2d11ff118d0553d52a97bddfd207f05928501796b8701d3a53e420beb2f5b7
+ size 1064
checkpoint-10500/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
checkpoint-10500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10500/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-10500/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
checkpoint-10500/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10500/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eec90019e03b3308744cf7428c0bc743b8dd914d925c9059ff38ec49b74a159f
+ size 5688
checkpoint-11000/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 512,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 6,
+ "num_hidden_layers": 6,
+ "num_key_value_heads": 6,
+ "pad_token_id": 0,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.51.3",
+ "use_cache": true,
+ "vocab_size": 32000
+ }
checkpoint-11000/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.51.3"
+ }
checkpoint-11000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:933c3626401ae46c5786aecd6efab533e1656993686117b3be8aaf60bb034acb
+ size 309900448
checkpoint-11000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b75082e4ecdcf5ef7d34918fb783df1e626aaf7dd5e930474d55cb41da9ab754
+ size 619836730
checkpoint-11000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+ size 14244
checkpoint-11000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebabd05435cd42dd6dc311e938b4060596334d385dedeffb84634c6f408fa78b
+ size 1064
checkpoint-11000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
checkpoint-11000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-11000/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
checkpoint-11000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eec90019e03b3308744cf7428c0bc743b8dd914d925c9059ff38ec49b74a159f
+ size 5688
checkpoint-11500/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 512,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 6,
+ "num_hidden_layers": 6,
+ "num_key_value_heads": 6,
+ "pad_token_id": 0,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.51.3",
+ "use_cache": true,
+ "vocab_size": 32000
+ }
checkpoint-11500/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.51.3"
+ }