rootxhacker commited on
Commit
78725df
·
verified ·
1 Parent(s): e1f2317

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-1000/config.json +30 -0
  2. checkpoint-1000/generation_config.json +7 -0
  3. checkpoint-1000/model.safetensors +3 -0
  4. checkpoint-1000/optimizer.pt +3 -0
  5. checkpoint-1000/rng_state.pth +3 -0
  6. checkpoint-1000/scheduler.pt +3 -0
  7. checkpoint-1000/special_tokens_map.json +24 -0
  8. checkpoint-1000/tokenizer.json +0 -0
  9. checkpoint-1000/tokenizer.model +3 -0
  10. checkpoint-1000/tokenizer_config.json +42 -0
  11. checkpoint-1000/trainer_state.json +934 -0
  12. checkpoint-1000/training_args.bin +3 -0
  13. checkpoint-1500/config.json +30 -0
  14. checkpoint-1500/generation_config.json +7 -0
  15. checkpoint-1500/model.safetensors +3 -0
  16. checkpoint-1500/optimizer.pt +3 -0
  17. checkpoint-1500/rng_state.pth +3 -0
  18. checkpoint-1500/scheduler.pt +3 -0
  19. checkpoint-1500/special_tokens_map.json +24 -0
  20. checkpoint-1500/tokenizer.json +0 -0
  21. checkpoint-1500/tokenizer.model +3 -0
  22. checkpoint-1500/tokenizer_config.json +42 -0
  23. checkpoint-1500/trainer_state.json +1384 -0
  24. checkpoint-1500/training_args.bin +3 -0
  25. checkpoint-2000/config.json +30 -0
  26. checkpoint-2000/generation_config.json +7 -0
  27. checkpoint-2000/model.safetensors +3 -0
  28. checkpoint-2000/optimizer.pt +3 -0
  29. checkpoint-2000/rng_state.pth +3 -0
  30. checkpoint-2000/scheduler.pt +3 -0
  31. checkpoint-2000/special_tokens_map.json +24 -0
  32. checkpoint-2000/tokenizer.json +0 -0
  33. checkpoint-2000/tokenizer.model +3 -0
  34. checkpoint-2000/tokenizer_config.json +42 -0
  35. checkpoint-2000/trainer_state.json +1834 -0
  36. checkpoint-2000/training_args.bin +3 -0
  37. checkpoint-2500/config.json +30 -0
  38. checkpoint-2500/generation_config.json +7 -0
  39. checkpoint-2500/model.safetensors +3 -0
  40. checkpoint-2500/optimizer.pt +3 -0
  41. checkpoint-2500/rng_state.pth +3 -0
  42. checkpoint-2500/scheduler.pt +3 -0
  43. checkpoint-2500/special_tokens_map.json +24 -0
  44. checkpoint-2500/tokenizer.json +0 -0
  45. checkpoint-2500/tokenizer.model +3 -0
  46. checkpoint-2500/tokenizer_config.json +42 -0
  47. checkpoint-2500/trainer_state.json +2284 -0
  48. checkpoint-2500/training_args.bin +3 -0
  49. checkpoint-3000/config.json +30 -0
  50. checkpoint-3000/generation_config.json +7 -0
checkpoint-1000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 1024,
14
+ "max_position_embeddings": 256,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 6,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 6,
20
+ "pad_token_id": 0,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb46cfaae2320bf5206ac688a80cb7c7f7ef42b9a169fdc82532dafcb9c6f146
3
+ size 800819936
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30feba927f7a61d420c1a89f27c12332867a40b47a0644de5b4ac0edc72d0cba
3
+ size 1601820026
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b66e3cc7c452b707ddac5caf0aa17618afb9bc1a0333600a22c4afb353f3165
3
+ size 14244
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a940b2cced4848bf98a9115225fcd56fc54233d916d2e25d79408155785b6f78
3
+ size 1064
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9517308629936616,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019502681618722574,
14
+ "grad_norm": 3.0208523273468018,
15
+ "learning_rate": 1.9964843750000004e-05,
16
+ "loss": 7.8867,
17
+ "mean_token_accuracy": 0.0920736625790596,
18
+ "num_tokens": 920759.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.03900536323744515,
23
+ "grad_norm": 2.2770543098449707,
24
+ "learning_rate": 1.9925781250000002e-05,
25
+ "loss": 7.5013,
26
+ "mean_token_accuracy": 0.10232679340988397,
27
+ "num_tokens": 1848077.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.05850804485616772,
32
+ "grad_norm": 1.6555824279785156,
33
+ "learning_rate": 1.9886718750000004e-05,
34
+ "loss": 7.0436,
35
+ "mean_token_accuracy": 0.11740029789507389,
36
+ "num_tokens": 2781210.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.0780107264748903,
41
+ "grad_norm": 1.7775338888168335,
42
+ "learning_rate": 1.9847656250000002e-05,
43
+ "loss": 6.7631,
44
+ "mean_token_accuracy": 0.1280333673581481,
45
+ "num_tokens": 3689316.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.09751340809361288,
50
+ "grad_norm": 1.2532657384872437,
51
+ "learning_rate": 1.9808593750000003e-05,
52
+ "loss": 6.515,
53
+ "mean_token_accuracy": 0.13626975379884243,
54
+ "num_tokens": 4616761.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.11701608971233544,
59
+ "grad_norm": 1.1137648820877075,
60
+ "learning_rate": 1.976953125e-05,
61
+ "loss": 6.3421,
62
+ "mean_token_accuracy": 0.1418815266340971,
63
+ "num_tokens": 5544631.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.13651877133105803,
68
+ "grad_norm": 0.9245680570602417,
69
+ "learning_rate": 1.9730468750000003e-05,
70
+ "loss": 6.2092,
71
+ "mean_token_accuracy": 0.1463709220290184,
72
+ "num_tokens": 6483486.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.1560214529497806,
77
+ "grad_norm": 0.8324124813079834,
78
+ "learning_rate": 1.969140625e-05,
79
+ "loss": 6.0411,
80
+ "mean_token_accuracy": 0.15314992293715476,
81
+ "num_tokens": 7412558.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.17552413456850316,
86
+ "grad_norm": 0.920666515827179,
87
+ "learning_rate": 1.9652343750000003e-05,
88
+ "loss": 5.9319,
89
+ "mean_token_accuracy": 0.16162274666130544,
90
+ "num_tokens": 8332801.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.19502681618722575,
95
+ "grad_norm": 0.8294446468353271,
96
+ "learning_rate": 1.961328125e-05,
97
+ "loss": 5.8516,
98
+ "mean_token_accuracy": 0.16927699856460093,
99
+ "num_tokens": 9274826.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.21452949780594832,
104
+ "grad_norm": 0.7535350322723389,
105
+ "learning_rate": 1.9574218750000003e-05,
106
+ "loss": 5.7591,
107
+ "mean_token_accuracy": 0.17415257096290587,
108
+ "num_tokens": 10190661.0,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 0.2340321794246709,
113
+ "grad_norm": 0.9731621742248535,
114
+ "learning_rate": 1.953515625e-05,
115
+ "loss": 5.6905,
116
+ "mean_token_accuracy": 0.17712676227092744,
117
+ "num_tokens": 11113827.0,
118
+ "step": 120
119
+ },
120
+ {
121
+ "epoch": 0.25353486104339346,
122
+ "grad_norm": 1.5154342651367188,
123
+ "learning_rate": 1.9496093750000003e-05,
124
+ "loss": 5.6104,
125
+ "mean_token_accuracy": 0.1849387872964144,
126
+ "num_tokens": 12034156.0,
127
+ "step": 130
128
+ },
129
+ {
130
+ "epoch": 0.27303754266211605,
131
+ "grad_norm": 0.7294800877571106,
132
+ "learning_rate": 1.9457031250000004e-05,
133
+ "loss": 5.5413,
134
+ "mean_token_accuracy": 0.19220538288354874,
135
+ "num_tokens": 12950143.0,
136
+ "step": 140
137
+ },
138
+ {
139
+ "epoch": 0.2925402242808386,
140
+ "grad_norm": 0.6792197823524475,
141
+ "learning_rate": 1.9417968750000002e-05,
142
+ "loss": 5.5046,
143
+ "mean_token_accuracy": 0.1948456909507513,
144
+ "num_tokens": 13864130.0,
145
+ "step": 150
146
+ },
147
+ {
148
+ "epoch": 0.3120429058995612,
149
+ "grad_norm": 0.6913173794746399,
150
+ "learning_rate": 1.9378906250000004e-05,
151
+ "loss": 5.4398,
152
+ "mean_token_accuracy": 0.19714849777519702,
153
+ "num_tokens": 14795220.0,
154
+ "step": 160
155
+ },
156
+ {
157
+ "epoch": 0.3315455875182838,
158
+ "grad_norm": 0.6437414288520813,
159
+ "learning_rate": 1.9339843750000002e-05,
160
+ "loss": 5.4273,
161
+ "mean_token_accuracy": 0.19942218959331512,
162
+ "num_tokens": 15722954.0,
163
+ "step": 170
164
+ },
165
+ {
166
+ "epoch": 0.3510482691370063,
167
+ "grad_norm": 2.1863138675689697,
168
+ "learning_rate": 1.9300781250000004e-05,
169
+ "loss": 5.3666,
170
+ "mean_token_accuracy": 0.2016600638628006,
171
+ "num_tokens": 16637414.0,
172
+ "step": 180
173
+ },
174
+ {
175
+ "epoch": 0.3705509507557289,
176
+ "grad_norm": 0.8689864873886108,
177
+ "learning_rate": 1.9261718750000002e-05,
178
+ "loss": 5.3421,
179
+ "mean_token_accuracy": 0.20273192636668683,
180
+ "num_tokens": 17569476.0,
181
+ "step": 190
182
+ },
183
+ {
184
+ "epoch": 0.3900536323744515,
185
+ "grad_norm": 1.2784861326217651,
186
+ "learning_rate": 1.9222656250000003e-05,
187
+ "loss": 5.3323,
188
+ "mean_token_accuracy": 0.20496859662234784,
189
+ "num_tokens": 18498067.0,
190
+ "step": 200
191
+ },
192
+ {
193
+ "epoch": 0.40955631399317405,
194
+ "grad_norm": 0.6330307722091675,
195
+ "learning_rate": 1.9183593750000002e-05,
196
+ "loss": 5.2827,
197
+ "mean_token_accuracy": 0.2135307714343071,
198
+ "num_tokens": 19416964.0,
199
+ "step": 210
200
+ },
201
+ {
202
+ "epoch": 0.42905899561189664,
203
+ "grad_norm": 1.1162034273147583,
204
+ "learning_rate": 1.9144531250000003e-05,
205
+ "loss": 5.2121,
206
+ "mean_token_accuracy": 0.21816504523158073,
207
+ "num_tokens": 20341056.0,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 0.4485616772306192,
212
+ "grad_norm": 0.6339605450630188,
213
+ "learning_rate": 1.910546875e-05,
214
+ "loss": 5.2187,
215
+ "mean_token_accuracy": 0.21803640052676201,
216
+ "num_tokens": 21267082.0,
217
+ "step": 230
218
+ },
219
+ {
220
+ "epoch": 0.4680643588493418,
221
+ "grad_norm": 0.607659637928009,
222
+ "learning_rate": 1.9066406250000003e-05,
223
+ "loss": 5.1826,
224
+ "mean_token_accuracy": 0.22011552266776563,
225
+ "num_tokens": 22195442.0,
226
+ "step": 240
227
+ },
228
+ {
229
+ "epoch": 0.48756704046806437,
230
+ "grad_norm": 0.5029736757278442,
231
+ "learning_rate": 1.902734375e-05,
232
+ "loss": 5.1424,
233
+ "mean_token_accuracy": 0.22285537868738176,
234
+ "num_tokens": 23144077.0,
235
+ "step": 250
236
+ },
237
+ {
238
+ "epoch": 0.5070697220867869,
239
+ "grad_norm": 0.9591688513755798,
240
+ "learning_rate": 1.8988281250000003e-05,
241
+ "loss": 5.0913,
242
+ "mean_token_accuracy": 0.22736062072217464,
243
+ "num_tokens": 24068643.0,
244
+ "step": 260
245
+ },
246
+ {
247
+ "epoch": 0.5265724037055095,
248
+ "grad_norm": 0.5418295860290527,
249
+ "learning_rate": 1.894921875e-05,
250
+ "loss": 5.1015,
251
+ "mean_token_accuracy": 0.22693138755857944,
252
+ "num_tokens": 24996534.0,
253
+ "step": 270
254
+ },
255
+ {
256
+ "epoch": 0.5460750853242321,
257
+ "grad_norm": 0.5258099436759949,
258
+ "learning_rate": 1.8910156250000003e-05,
259
+ "loss": 5.0746,
260
+ "mean_token_accuracy": 0.23026154786348343,
261
+ "num_tokens": 25920902.0,
262
+ "step": 280
263
+ },
264
+ {
265
+ "epoch": 0.5655777669429547,
266
+ "grad_norm": 0.5592005252838135,
267
+ "learning_rate": 1.887109375e-05,
268
+ "loss": 5.0566,
269
+ "mean_token_accuracy": 0.23160071447491645,
270
+ "num_tokens": 26845104.0,
271
+ "step": 290
272
+ },
273
+ {
274
+ "epoch": 0.5850804485616772,
275
+ "grad_norm": 0.5427853465080261,
276
+ "learning_rate": 1.8832031250000002e-05,
277
+ "loss": 5.0565,
278
+ "mean_token_accuracy": 0.2316149313002825,
279
+ "num_tokens": 27782058.0,
280
+ "step": 300
281
+ },
282
+ {
283
+ "epoch": 0.6045831301803998,
284
+ "grad_norm": 0.9386640191078186,
285
+ "learning_rate": 1.8792968750000004e-05,
286
+ "loss": 5.0003,
287
+ "mean_token_accuracy": 0.2349798556417227,
288
+ "num_tokens": 28707140.0,
289
+ "step": 310
290
+ },
291
+ {
292
+ "epoch": 0.6240858117991224,
293
+ "grad_norm": 0.8473969101905823,
294
+ "learning_rate": 1.8753906250000002e-05,
295
+ "loss": 4.9954,
296
+ "mean_token_accuracy": 0.23647231683135034,
297
+ "num_tokens": 29637908.0,
298
+ "step": 320
299
+ },
300
+ {
301
+ "epoch": 0.643588493417845,
302
+ "grad_norm": 0.5143874883651733,
303
+ "learning_rate": 1.8714843750000004e-05,
304
+ "loss": 4.9881,
305
+ "mean_token_accuracy": 0.23599626012146474,
306
+ "num_tokens": 30559254.0,
307
+ "step": 330
308
+ },
309
+ {
310
+ "epoch": 0.6630911750365676,
311
+ "grad_norm": 0.4523729979991913,
312
+ "learning_rate": 1.8675781250000002e-05,
313
+ "loss": 4.9695,
314
+ "mean_token_accuracy": 0.23746853992342948,
315
+ "num_tokens": 31487152.0,
316
+ "step": 340
317
+ },
318
+ {
319
+ "epoch": 0.6825938566552902,
320
+ "grad_norm": 0.5573343634605408,
321
+ "learning_rate": 1.8636718750000004e-05,
322
+ "loss": 4.9525,
323
+ "mean_token_accuracy": 0.23925678990781307,
324
+ "num_tokens": 32396911.0,
325
+ "step": 350
326
+ },
327
+ {
328
+ "epoch": 0.7020965382740126,
329
+ "grad_norm": 1.2790151834487915,
330
+ "learning_rate": 1.8597656250000002e-05,
331
+ "loss": 4.9338,
332
+ "mean_token_accuracy": 0.24103106185793877,
333
+ "num_tokens": 33325126.0,
334
+ "step": 360
335
+ },
336
+ {
337
+ "epoch": 0.7215992198927352,
338
+ "grad_norm": 0.8714343309402466,
339
+ "learning_rate": 1.8558593750000003e-05,
340
+ "loss": 4.9252,
341
+ "mean_token_accuracy": 0.24138498678803444,
342
+ "num_tokens": 34262332.0,
343
+ "step": 370
344
+ },
345
+ {
346
+ "epoch": 0.7411019015114578,
347
+ "grad_norm": 0.5251726508140564,
348
+ "learning_rate": 1.851953125e-05,
349
+ "loss": 4.8883,
350
+ "mean_token_accuracy": 0.24456401653587817,
351
+ "num_tokens": 35181080.0,
352
+ "step": 380
353
+ },
354
+ {
355
+ "epoch": 0.7606045831301804,
356
+ "grad_norm": 0.46523743867874146,
357
+ "learning_rate": 1.8480468750000003e-05,
358
+ "loss": 4.8887,
359
+ "mean_token_accuracy": 0.24552332125604154,
360
+ "num_tokens": 36105038.0,
361
+ "step": 390
362
+ },
363
+ {
364
+ "epoch": 0.780107264748903,
365
+ "grad_norm": 0.5080934166908264,
366
+ "learning_rate": 1.844140625e-05,
367
+ "loss": 4.8659,
368
+ "mean_token_accuracy": 0.245796899497509,
369
+ "num_tokens": 37036998.0,
370
+ "step": 400
371
+ },
372
+ {
373
+ "epoch": 0.7996099463676255,
374
+ "grad_norm": 1.117693543434143,
375
+ "learning_rate": 1.8402343750000003e-05,
376
+ "loss": 4.8681,
377
+ "mean_token_accuracy": 0.2463846940547228,
378
+ "num_tokens": 37968683.0,
379
+ "step": 410
380
+ },
381
+ {
382
+ "epoch": 0.8191126279863481,
383
+ "grad_norm": 0.5299689769744873,
384
+ "learning_rate": 1.836328125e-05,
385
+ "loss": 4.8305,
386
+ "mean_token_accuracy": 0.24968006946146487,
387
+ "num_tokens": 38888752.0,
388
+ "step": 420
389
+ },
390
+ {
391
+ "epoch": 0.8386153096050707,
392
+ "grad_norm": 0.8458639979362488,
393
+ "learning_rate": 1.8324218750000003e-05,
394
+ "loss": 4.8279,
395
+ "mean_token_accuracy": 0.25057865455746653,
396
+ "num_tokens": 39823608.0,
397
+ "step": 430
398
+ },
399
+ {
400
+ "epoch": 0.8581179912237933,
401
+ "grad_norm": 0.5429758429527283,
402
+ "learning_rate": 1.828515625e-05,
403
+ "loss": 4.8243,
404
+ "mean_token_accuracy": 0.2514403607696295,
405
+ "num_tokens": 40748246.0,
406
+ "step": 440
407
+ },
408
+ {
409
+ "epoch": 0.8776206728425159,
410
+ "grad_norm": 0.47386595606803894,
411
+ "learning_rate": 1.8246093750000003e-05,
412
+ "loss": 4.7881,
413
+ "mean_token_accuracy": 0.25412631034851074,
414
+ "num_tokens": 41662563.0,
415
+ "step": 450
416
+ },
417
+ {
418
+ "epoch": 0.8971233544612384,
419
+ "grad_norm": 0.41789767146110535,
420
+ "learning_rate": 1.820703125e-05,
421
+ "loss": 4.7944,
422
+ "mean_token_accuracy": 0.25327568165957925,
423
+ "num_tokens": 42588923.0,
424
+ "step": 460
425
+ },
426
+ {
427
+ "epoch": 0.916626036079961,
428
+ "grad_norm": 0.43711453676223755,
429
+ "learning_rate": 1.8167968750000002e-05,
430
+ "loss": 4.7758,
431
+ "mean_token_accuracy": 0.2548953540623188,
432
+ "num_tokens": 43515886.0,
433
+ "step": 470
434
+ },
435
+ {
436
+ "epoch": 0.9361287176986836,
437
+ "grad_norm": 0.6433466076850891,
438
+ "learning_rate": 1.8128906250000004e-05,
439
+ "loss": 4.7632,
440
+ "mean_token_accuracy": 0.2562540594488382,
441
+ "num_tokens": 44446196.0,
442
+ "step": 480
443
+ },
444
+ {
445
+ "epoch": 0.9556313993174061,
446
+ "grad_norm": 0.7580122351646423,
447
+ "learning_rate": 1.8089843750000002e-05,
448
+ "loss": 4.7559,
449
+ "mean_token_accuracy": 0.25687045492231847,
450
+ "num_tokens": 45389436.0,
451
+ "step": 490
452
+ },
453
+ {
454
+ "epoch": 0.9751340809361287,
455
+ "grad_norm": 0.9800453782081604,
456
+ "learning_rate": 1.8050781250000004e-05,
457
+ "loss": 4.7711,
458
+ "mean_token_accuracy": 0.2573121260851622,
459
+ "num_tokens": 46313035.0,
460
+ "step": 500
461
+ },
462
+ {
463
+ "epoch": 0.9946367625548513,
464
+ "grad_norm": 0.636842668056488,
465
+ "learning_rate": 1.8011718750000002e-05,
466
+ "loss": 4.7693,
467
+ "mean_token_accuracy": 0.2563398856669664,
468
+ "num_tokens": 47231018.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 1.0156021452949782,
473
+ "grad_norm": 0.9280142188072205,
474
+ "learning_rate": 1.7972656250000004e-05,
475
+ "loss": 5.2112,
476
+ "mean_token_accuracy": 0.25861204106633257,
477
+ "num_tokens": 48181298.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 1.0351048269137006,
482
+ "grad_norm": 0.5360897183418274,
483
+ "learning_rate": 1.7933593750000002e-05,
484
+ "loss": 4.7274,
485
+ "mean_token_accuracy": 0.26002744026482105,
486
+ "num_tokens": 49105264.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 1.0546075085324231,
491
+ "grad_norm": 0.5811850428581238,
492
+ "learning_rate": 1.7894531250000003e-05,
493
+ "loss": 4.7052,
494
+ "mean_token_accuracy": 0.2608158510178328,
495
+ "num_tokens": 50033001.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 1.0741101901511458,
500
+ "grad_norm": 0.9147205352783203,
501
+ "learning_rate": 1.785546875e-05,
502
+ "loss": 4.6854,
503
+ "mean_token_accuracy": 0.26275911666452884,
504
+ "num_tokens": 50954465.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 1.0936128717698683,
509
+ "grad_norm": 0.46637555956840515,
510
+ "learning_rate": 1.7816406250000003e-05,
511
+ "loss": 4.6969,
512
+ "mean_token_accuracy": 0.26292436122894286,
513
+ "num_tokens": 51876402.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 1.113115553388591,
518
+ "grad_norm": 1.1488078832626343,
519
+ "learning_rate": 1.777734375e-05,
520
+ "loss": 4.6934,
521
+ "mean_token_accuracy": 0.26502432897686956,
522
+ "num_tokens": 52789774.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 1.1326182350073135,
527
+ "grad_norm": 0.42444393038749695,
528
+ "learning_rate": 1.7738281250000003e-05,
529
+ "loss": 4.6616,
530
+ "mean_token_accuracy": 0.2661720596253872,
531
+ "num_tokens": 53700514.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 1.152120916626036,
536
+ "grad_norm": 0.46090322732925415,
537
+ "learning_rate": 1.769921875e-05,
538
+ "loss": 4.6677,
539
+ "mean_token_accuracy": 0.2653431937098503,
540
+ "num_tokens": 54637906.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 1.1716235982447587,
545
+ "grad_norm": 1.0310624837875366,
546
+ "learning_rate": 1.7660156250000003e-05,
547
+ "loss": 4.6441,
548
+ "mean_token_accuracy": 0.26668640449643133,
549
+ "num_tokens": 55566104.0,
550
+ "step": 600
551
+ },
552
+ {
553
+ "epoch": 1.1911262798634812,
554
+ "grad_norm": 0.5190272927284241,
555
+ "learning_rate": 1.762109375e-05,
556
+ "loss": 4.6388,
557
+ "mean_token_accuracy": 0.26863499656319617,
558
+ "num_tokens": 56475233.0,
559
+ "step": 610
560
+ },
561
+ {
562
+ "epoch": 1.210628961482204,
563
+ "grad_norm": 0.4435961842536926,
564
+ "learning_rate": 1.7582031250000003e-05,
565
+ "loss": 4.6344,
566
+ "mean_token_accuracy": 0.27009780779480935,
567
+ "num_tokens": 57393208.0,
568
+ "step": 620
569
+ },
570
+ {
571
+ "epoch": 1.2301316431009264,
572
+ "grad_norm": 1.5555769205093384,
573
+ "learning_rate": 1.754296875e-05,
574
+ "loss": 4.6239,
575
+ "mean_token_accuracy": 0.26839635893702507,
576
+ "num_tokens": 58320792.0,
577
+ "step": 630
578
+ },
579
+ {
580
+ "epoch": 1.2496343247196489,
581
+ "grad_norm": 0.45114317536354065,
582
+ "learning_rate": 1.7503906250000002e-05,
583
+ "loss": 4.6231,
584
+ "mean_token_accuracy": 0.26895947232842443,
585
+ "num_tokens": 59247413.0,
586
+ "step": 640
587
+ },
588
+ {
589
+ "epoch": 1.2691370063383716,
590
+ "grad_norm": 0.5050057768821716,
591
+ "learning_rate": 1.7464843750000004e-05,
592
+ "loss": 4.6231,
593
+ "mean_token_accuracy": 0.2689752779901028,
594
+ "num_tokens": 60180529.0,
595
+ "step": 650
596
+ },
597
+ {
598
+ "epoch": 1.288639687957094,
599
+ "grad_norm": 0.4494447708129883,
600
+ "learning_rate": 1.7425781250000002e-05,
601
+ "loss": 4.5939,
602
+ "mean_token_accuracy": 0.2701444610953331,
603
+ "num_tokens": 61116823.0,
604
+ "step": 660
605
+ },
606
+ {
607
+ "epoch": 1.3081423695758168,
608
+ "grad_norm": 0.5126622915267944,
609
+ "learning_rate": 1.7386718750000004e-05,
610
+ "loss": 4.5905,
611
+ "mean_token_accuracy": 0.27250412106513977,
612
+ "num_tokens": 62036873.0,
613
+ "step": 670
614
+ },
615
+ {
616
+ "epoch": 1.3276450511945392,
617
+ "grad_norm": 0.4027528166770935,
618
+ "learning_rate": 1.7347656250000002e-05,
619
+ "loss": 4.5702,
620
+ "mean_token_accuracy": 0.2741738684475422,
621
+ "num_tokens": 62955213.0,
622
+ "step": 680
623
+ },
624
+ {
625
+ "epoch": 1.3471477328132617,
626
+ "grad_norm": 0.42226913571357727,
627
+ "learning_rate": 1.7308593750000004e-05,
628
+ "loss": 4.6027,
629
+ "mean_token_accuracy": 0.2714010961353779,
630
+ "num_tokens": 63879282.0,
631
+ "step": 690
632
+ },
633
+ {
634
+ "epoch": 1.3666504144319844,
635
+ "grad_norm": 0.7456600069999695,
636
+ "learning_rate": 1.7269531250000002e-05,
637
+ "loss": 4.6076,
638
+ "mean_token_accuracy": 0.27038322016596794,
639
+ "num_tokens": 64813842.0,
640
+ "step": 700
641
+ },
642
+ {
643
+ "epoch": 1.386153096050707,
644
+ "grad_norm": 0.4936697483062744,
645
+ "learning_rate": 1.7230468750000003e-05,
646
+ "loss": 4.5808,
647
+ "mean_token_accuracy": 0.27259208634495735,
648
+ "num_tokens": 65750164.0,
649
+ "step": 710
650
+ },
651
+ {
652
+ "epoch": 1.4056557776694296,
653
+ "grad_norm": 0.40352705121040344,
654
+ "learning_rate": 1.719140625e-05,
655
+ "loss": 4.5687,
656
+ "mean_token_accuracy": 0.2735755704343319,
657
+ "num_tokens": 66671414.0,
658
+ "step": 720
659
+ },
660
+ {
661
+ "epoch": 1.425158459288152,
662
+ "grad_norm": 0.5929037928581238,
663
+ "learning_rate": 1.7152343750000003e-05,
664
+ "loss": 4.5549,
665
+ "mean_token_accuracy": 0.27455407530069353,
666
+ "num_tokens": 67601565.0,
667
+ "step": 730
668
+ },
669
+ {
670
+ "epoch": 1.4446611409068746,
671
+ "grad_norm": 2.1730380058288574,
672
+ "learning_rate": 1.711328125e-05,
673
+ "loss": 4.5471,
674
+ "mean_token_accuracy": 0.2747652716934681,
675
+ "num_tokens": 68543369.0,
676
+ "step": 740
677
+ },
678
+ {
679
+ "epoch": 1.4641638225255973,
680
+ "grad_norm": 0.45310553908348083,
681
+ "learning_rate": 1.7074218750000003e-05,
682
+ "loss": 4.5379,
683
+ "mean_token_accuracy": 0.2759984292089939,
684
+ "num_tokens": 69475220.0,
685
+ "step": 750
686
+ },
687
+ {
688
+ "epoch": 1.4836665041443198,
689
+ "grad_norm": 0.44060948491096497,
690
+ "learning_rate": 1.703515625e-05,
691
+ "loss": 4.5207,
692
+ "mean_token_accuracy": 0.2781739257276058,
693
+ "num_tokens": 70397859.0,
694
+ "step": 760
695
+ },
696
+ {
697
+ "epoch": 1.5031691857630425,
698
+ "grad_norm": 0.9775242209434509,
699
+ "learning_rate": 1.6996093750000003e-05,
700
+ "loss": 4.5283,
701
+ "mean_token_accuracy": 0.27766570150852204,
702
+ "num_tokens": 71329289.0,
703
+ "step": 770
704
+ },
705
+ {
706
+ "epoch": 1.522671867381765,
707
+ "grad_norm": 0.476166695356369,
708
+ "learning_rate": 1.695703125e-05,
709
+ "loss": 4.5227,
710
+ "mean_token_accuracy": 0.2781515374779701,
711
+ "num_tokens": 72251787.0,
712
+ "step": 780
713
+ },
714
+ {
715
+ "epoch": 1.5421745490004874,
716
+ "grad_norm": 0.7697413563728333,
717
+ "learning_rate": 1.6917968750000003e-05,
718
+ "loss": 4.5089,
719
+ "mean_token_accuracy": 0.2777483291924,
720
+ "num_tokens": 73184759.0,
721
+ "step": 790
722
+ },
723
+ {
724
+ "epoch": 1.5616772306192102,
725
+ "grad_norm": 0.39053142070770264,
726
+ "learning_rate": 1.687890625e-05,
727
+ "loss": 4.5224,
728
+ "mean_token_accuracy": 0.2780651919543743,
729
+ "num_tokens": 74104519.0,
730
+ "step": 800
731
+ },
732
+ {
733
+ "epoch": 1.5811799122379329,
734
+ "grad_norm": 0.5136573910713196,
735
+ "learning_rate": 1.6839843750000002e-05,
736
+ "loss": 4.5168,
737
+ "mean_token_accuracy": 0.27778707146644593,
738
+ "num_tokens": 75041132.0,
739
+ "step": 810
740
+ },
741
+ {
742
+ "epoch": 1.6006825938566553,
743
+ "grad_norm": 0.4006953239440918,
744
+ "learning_rate": 1.6800781250000004e-05,
745
+ "loss": 4.4969,
746
+ "mean_token_accuracy": 0.2798936806619167,
747
+ "num_tokens": 75957080.0,
748
+ "step": 820
749
+ },
750
+ {
751
+ "epoch": 1.6201852754753778,
752
+ "grad_norm": 0.8261349201202393,
753
+ "learning_rate": 1.6761718750000002e-05,
754
+ "loss": 4.503,
755
+ "mean_token_accuracy": 0.27909068912267687,
756
+ "num_tokens": 76885767.0,
757
+ "step": 830
758
+ },
759
+ {
760
+ "epoch": 1.6396879570941003,
761
+ "grad_norm": 0.6244539022445679,
762
+ "learning_rate": 1.6722656250000004e-05,
763
+ "loss": 4.4941,
764
+ "mean_token_accuracy": 0.27921902686357497,
765
+ "num_tokens": 77828819.0,
766
+ "step": 840
767
+ },
768
+ {
769
+ "epoch": 1.659190638712823,
770
+ "grad_norm": 0.3824380040168762,
771
+ "learning_rate": 1.6683593750000002e-05,
772
+ "loss": 4.5036,
773
+ "mean_token_accuracy": 0.2791468746960163,
774
+ "num_tokens": 78767094.0,
775
+ "step": 850
776
+ },
777
+ {
778
+ "epoch": 1.6786933203315457,
779
+ "grad_norm": 0.41238024830818176,
780
+ "learning_rate": 1.6644531250000004e-05,
781
+ "loss": 4.498,
782
+ "mean_token_accuracy": 0.28029546365141866,
783
+ "num_tokens": 79699908.0,
784
+ "step": 860
785
+ },
786
+ {
787
+ "epoch": 1.6981960019502682,
788
+ "grad_norm": 0.4335004389286041,
789
+ "learning_rate": 1.6605468750000002e-05,
790
+ "loss": 4.4684,
791
+ "mean_token_accuracy": 0.2824758395552635,
792
+ "num_tokens": 80618966.0,
793
+ "step": 870
794
+ },
795
+ {
796
+ "epoch": 1.7176986835689907,
797
+ "grad_norm": 0.48968249559402466,
798
+ "learning_rate": 1.6566406250000003e-05,
799
+ "loss": 4.4526,
800
+ "mean_token_accuracy": 0.28368064761161804,
801
+ "num_tokens": 81545205.0,
802
+ "step": 880
803
+ },
804
+ {
805
+ "epoch": 1.7372013651877132,
806
+ "grad_norm": 0.41890937089920044,
807
+ "learning_rate": 1.652734375e-05,
808
+ "loss": 4.4734,
809
+ "mean_token_accuracy": 0.2822652608156204,
810
+ "num_tokens": 82463087.0,
811
+ "step": 890
812
+ },
813
+ {
814
+ "epoch": 1.7567040468064359,
815
+ "grad_norm": 0.40375128388404846,
816
+ "learning_rate": 1.6488281250000003e-05,
817
+ "loss": 4.4329,
818
+ "mean_token_accuracy": 0.2864795848727226,
819
+ "num_tokens": 83384315.0,
820
+ "step": 900
821
+ },
822
+ {
823
+ "epoch": 1.7762067284251586,
824
+ "grad_norm": 1.6861543655395508,
825
+ "learning_rate": 1.644921875e-05,
826
+ "loss": 4.4701,
827
+ "mean_token_accuracy": 0.2822112552821636,
828
+ "num_tokens": 84314435.0,
829
+ "step": 910
830
+ },
831
+ {
832
+ "epoch": 1.795709410043881,
833
+ "grad_norm": 0.42918047308921814,
834
+ "learning_rate": 1.6410156250000003e-05,
835
+ "loss": 4.4543,
836
+ "mean_token_accuracy": 0.282807744294405,
837
+ "num_tokens": 85248307.0,
838
+ "step": 920
839
+ },
840
+ {
841
+ "epoch": 1.8152120916626036,
842
+ "grad_norm": 0.38594943284988403,
843
+ "learning_rate": 1.637109375e-05,
844
+ "loss": 4.4509,
845
+ "mean_token_accuracy": 0.28484038934111594,
846
+ "num_tokens": 86168104.0,
847
+ "step": 930
848
+ },
849
+ {
850
+ "epoch": 1.834714773281326,
851
+ "grad_norm": 0.37766233086586,
852
+ "learning_rate": 1.6332031250000003e-05,
853
+ "loss": 4.4378,
854
+ "mean_token_accuracy": 0.28509455919265747,
855
+ "num_tokens": 87085294.0,
856
+ "step": 940
857
+ },
858
+ {
859
+ "epoch": 1.8542174549000487,
860
+ "grad_norm": 0.5309925079345703,
861
+ "learning_rate": 1.629296875e-05,
862
+ "loss": 4.45,
863
+ "mean_token_accuracy": 0.283291470259428,
864
+ "num_tokens": 88022698.0,
865
+ "step": 950
866
+ },
867
+ {
868
+ "epoch": 1.8737201365187715,
869
+ "grad_norm": 0.5312850475311279,
870
+ "learning_rate": 1.6253906250000002e-05,
871
+ "loss": 4.3849,
872
+ "mean_token_accuracy": 0.28923906683921813,
873
+ "num_tokens": 88947429.0,
874
+ "step": 960
875
+ },
876
+ {
877
+ "epoch": 1.893222818137494,
878
+ "grad_norm": 0.5679749250411987,
879
+ "learning_rate": 1.621484375e-05,
880
+ "loss": 4.3953,
881
+ "mean_token_accuracy": 0.28862822949886324,
882
+ "num_tokens": 89878787.0,
883
+ "step": 970
884
+ },
885
+ {
886
+ "epoch": 1.9127254997562164,
887
+ "grad_norm": 0.6506769061088562,
888
+ "learning_rate": 1.6175781250000002e-05,
889
+ "loss": 4.4121,
890
+ "mean_token_accuracy": 0.287344753742218,
891
+ "num_tokens": 90794282.0,
892
+ "step": 980
893
+ },
894
+ {
895
+ "epoch": 1.932228181374939,
896
+ "grad_norm": 0.5218345522880554,
897
+ "learning_rate": 1.6136718750000004e-05,
898
+ "loss": 4.422,
899
+ "mean_token_accuracy": 0.28664510771632196,
900
+ "num_tokens": 91720204.0,
901
+ "step": 990
902
+ },
903
+ {
904
+ "epoch": 1.9517308629936616,
905
+ "grad_norm": 0.38746026158332825,
906
+ "learning_rate": 1.6097656250000002e-05,
907
+ "loss": 4.3909,
908
+ "mean_token_accuracy": 0.2888357400894165,
909
+ "num_tokens": 92640826.0,
910
+ "step": 1000
911
+ }
912
+ ],
913
+ "logging_steps": 10,
914
+ "max_steps": 5120,
915
+ "num_input_tokens_seen": 0,
916
+ "num_train_epochs": 10,
917
+ "save_steps": 500,
918
+ "stateful_callbacks": {
919
+ "TrainerControl": {
920
+ "args": {
921
+ "should_epoch_stop": false,
922
+ "should_evaluate": false,
923
+ "should_log": false,
924
+ "should_save": true,
925
+ "should_training_stop": false
926
+ },
927
+ "attributes": {}
928
+ }
929
+ },
930
+ "total_flos": 1.3814839578407731e+17,
931
+ "train_batch_size": 64,
932
+ "trial_name": null,
933
+ "trial_params": null
934
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:213ef3ed16d9dd20d51f6355dc64c9dc5ebcaf8490efb503d6a15061df366d53
3
+ size 5624
checkpoint-1500/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 1024,
14
+ "max_position_embeddings": 256,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 6,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 6,
20
+ "pad_token_id": 0,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-1500/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-1500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44602a20dc7488c84e2de4c1f9eced6eb97a945d9096a31cd58a8c5fb5b9fa48
3
+ size 800819936
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3217e84ec9de6af187b9736f67c269257d35adde746ae7a9121adbffa31cbb7
3
+ size 1601820026
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cf9097d4513154245c48236b6ec5137b7ee2a21c9f58f2cba798ea275c6026f
3
+ size 14244
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58d10611d1af9e5196efca726466d068b69833ca0bd6c31c0b381d1b497a9690
3
+ size 1064
checkpoint-1500/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,1384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.9283276450511946,
6
+ "eval_steps": 500,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019502681618722574,
14
+ "grad_norm": 3.0208523273468018,
15
+ "learning_rate": 1.9964843750000004e-05,
16
+ "loss": 7.8867,
17
+ "mean_token_accuracy": 0.0920736625790596,
18
+ "num_tokens": 920759.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.03900536323744515,
23
+ "grad_norm": 2.2770543098449707,
24
+ "learning_rate": 1.9925781250000002e-05,
25
+ "loss": 7.5013,
26
+ "mean_token_accuracy": 0.10232679340988397,
27
+ "num_tokens": 1848077.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.05850804485616772,
32
+ "grad_norm": 1.6555824279785156,
33
+ "learning_rate": 1.9886718750000004e-05,
34
+ "loss": 7.0436,
35
+ "mean_token_accuracy": 0.11740029789507389,
36
+ "num_tokens": 2781210.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.0780107264748903,
41
+ "grad_norm": 1.7775338888168335,
42
+ "learning_rate": 1.9847656250000002e-05,
43
+ "loss": 6.7631,
44
+ "mean_token_accuracy": 0.1280333673581481,
45
+ "num_tokens": 3689316.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.09751340809361288,
50
+ "grad_norm": 1.2532657384872437,
51
+ "learning_rate": 1.9808593750000003e-05,
52
+ "loss": 6.515,
53
+ "mean_token_accuracy": 0.13626975379884243,
54
+ "num_tokens": 4616761.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.11701608971233544,
59
+ "grad_norm": 1.1137648820877075,
60
+ "learning_rate": 1.976953125e-05,
61
+ "loss": 6.3421,
62
+ "mean_token_accuracy": 0.1418815266340971,
63
+ "num_tokens": 5544631.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.13651877133105803,
68
+ "grad_norm": 0.9245680570602417,
69
+ "learning_rate": 1.9730468750000003e-05,
70
+ "loss": 6.2092,
71
+ "mean_token_accuracy": 0.1463709220290184,
72
+ "num_tokens": 6483486.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.1560214529497806,
77
+ "grad_norm": 0.8324124813079834,
78
+ "learning_rate": 1.969140625e-05,
79
+ "loss": 6.0411,
80
+ "mean_token_accuracy": 0.15314992293715476,
81
+ "num_tokens": 7412558.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.17552413456850316,
86
+ "grad_norm": 0.920666515827179,
87
+ "learning_rate": 1.9652343750000003e-05,
88
+ "loss": 5.9319,
89
+ "mean_token_accuracy": 0.16162274666130544,
90
+ "num_tokens": 8332801.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.19502681618722575,
95
+ "grad_norm": 0.8294446468353271,
96
+ "learning_rate": 1.961328125e-05,
97
+ "loss": 5.8516,
98
+ "mean_token_accuracy": 0.16927699856460093,
99
+ "num_tokens": 9274826.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.21452949780594832,
104
+ "grad_norm": 0.7535350322723389,
105
+ "learning_rate": 1.9574218750000003e-05,
106
+ "loss": 5.7591,
107
+ "mean_token_accuracy": 0.17415257096290587,
108
+ "num_tokens": 10190661.0,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 0.2340321794246709,
113
+ "grad_norm": 0.9731621742248535,
114
+ "learning_rate": 1.953515625e-05,
115
+ "loss": 5.6905,
116
+ "mean_token_accuracy": 0.17712676227092744,
117
+ "num_tokens": 11113827.0,
118
+ "step": 120
119
+ },
120
+ {
121
+ "epoch": 0.25353486104339346,
122
+ "grad_norm": 1.5154342651367188,
123
+ "learning_rate": 1.9496093750000003e-05,
124
+ "loss": 5.6104,
125
+ "mean_token_accuracy": 0.1849387872964144,
126
+ "num_tokens": 12034156.0,
127
+ "step": 130
128
+ },
129
+ {
130
+ "epoch": 0.27303754266211605,
131
+ "grad_norm": 0.7294800877571106,
132
+ "learning_rate": 1.9457031250000004e-05,
133
+ "loss": 5.5413,
134
+ "mean_token_accuracy": 0.19220538288354874,
135
+ "num_tokens": 12950143.0,
136
+ "step": 140
137
+ },
138
+ {
139
+ "epoch": 0.2925402242808386,
140
+ "grad_norm": 0.6792197823524475,
141
+ "learning_rate": 1.9417968750000002e-05,
142
+ "loss": 5.5046,
143
+ "mean_token_accuracy": 0.1948456909507513,
144
+ "num_tokens": 13864130.0,
145
+ "step": 150
146
+ },
147
+ {
148
+ "epoch": 0.3120429058995612,
149
+ "grad_norm": 0.6913173794746399,
150
+ "learning_rate": 1.9378906250000004e-05,
151
+ "loss": 5.4398,
152
+ "mean_token_accuracy": 0.19714849777519702,
153
+ "num_tokens": 14795220.0,
154
+ "step": 160
155
+ },
156
+ {
157
+ "epoch": 0.3315455875182838,
158
+ "grad_norm": 0.6437414288520813,
159
+ "learning_rate": 1.9339843750000002e-05,
160
+ "loss": 5.4273,
161
+ "mean_token_accuracy": 0.19942218959331512,
162
+ "num_tokens": 15722954.0,
163
+ "step": 170
164
+ },
165
+ {
166
+ "epoch": 0.3510482691370063,
167
+ "grad_norm": 2.1863138675689697,
168
+ "learning_rate": 1.9300781250000004e-05,
169
+ "loss": 5.3666,
170
+ "mean_token_accuracy": 0.2016600638628006,
171
+ "num_tokens": 16637414.0,
172
+ "step": 180
173
+ },
174
+ {
175
+ "epoch": 0.3705509507557289,
176
+ "grad_norm": 0.8689864873886108,
177
+ "learning_rate": 1.9261718750000002e-05,
178
+ "loss": 5.3421,
179
+ "mean_token_accuracy": 0.20273192636668683,
180
+ "num_tokens": 17569476.0,
181
+ "step": 190
182
+ },
183
+ {
184
+ "epoch": 0.3900536323744515,
185
+ "grad_norm": 1.2784861326217651,
186
+ "learning_rate": 1.9222656250000003e-05,
187
+ "loss": 5.3323,
188
+ "mean_token_accuracy": 0.20496859662234784,
189
+ "num_tokens": 18498067.0,
190
+ "step": 200
191
+ },
192
+ {
193
+ "epoch": 0.40955631399317405,
194
+ "grad_norm": 0.6330307722091675,
195
+ "learning_rate": 1.9183593750000002e-05,
196
+ "loss": 5.2827,
197
+ "mean_token_accuracy": 0.2135307714343071,
198
+ "num_tokens": 19416964.0,
199
+ "step": 210
200
+ },
201
+ {
202
+ "epoch": 0.42905899561189664,
203
+ "grad_norm": 1.1162034273147583,
204
+ "learning_rate": 1.9144531250000003e-05,
205
+ "loss": 5.2121,
206
+ "mean_token_accuracy": 0.21816504523158073,
207
+ "num_tokens": 20341056.0,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 0.4485616772306192,
212
+ "grad_norm": 0.6339605450630188,
213
+ "learning_rate": 1.910546875e-05,
214
+ "loss": 5.2187,
215
+ "mean_token_accuracy": 0.21803640052676201,
216
+ "num_tokens": 21267082.0,
217
+ "step": 230
218
+ },
219
+ {
220
+ "epoch": 0.4680643588493418,
221
+ "grad_norm": 0.607659637928009,
222
+ "learning_rate": 1.9066406250000003e-05,
223
+ "loss": 5.1826,
224
+ "mean_token_accuracy": 0.22011552266776563,
225
+ "num_tokens": 22195442.0,
226
+ "step": 240
227
+ },
228
+ {
229
+ "epoch": 0.48756704046806437,
230
+ "grad_norm": 0.5029736757278442,
231
+ "learning_rate": 1.902734375e-05,
232
+ "loss": 5.1424,
233
+ "mean_token_accuracy": 0.22285537868738176,
234
+ "num_tokens": 23144077.0,
235
+ "step": 250
236
+ },
237
+ {
238
+ "epoch": 0.5070697220867869,
239
+ "grad_norm": 0.9591688513755798,
240
+ "learning_rate": 1.8988281250000003e-05,
241
+ "loss": 5.0913,
242
+ "mean_token_accuracy": 0.22736062072217464,
243
+ "num_tokens": 24068643.0,
244
+ "step": 260
245
+ },
246
+ {
247
+ "epoch": 0.5265724037055095,
248
+ "grad_norm": 0.5418295860290527,
249
+ "learning_rate": 1.894921875e-05,
250
+ "loss": 5.1015,
251
+ "mean_token_accuracy": 0.22693138755857944,
252
+ "num_tokens": 24996534.0,
253
+ "step": 270
254
+ },
255
+ {
256
+ "epoch": 0.5460750853242321,
257
+ "grad_norm": 0.5258099436759949,
258
+ "learning_rate": 1.8910156250000003e-05,
259
+ "loss": 5.0746,
260
+ "mean_token_accuracy": 0.23026154786348343,
261
+ "num_tokens": 25920902.0,
262
+ "step": 280
263
+ },
264
+ {
265
+ "epoch": 0.5655777669429547,
266
+ "grad_norm": 0.5592005252838135,
267
+ "learning_rate": 1.887109375e-05,
268
+ "loss": 5.0566,
269
+ "mean_token_accuracy": 0.23160071447491645,
270
+ "num_tokens": 26845104.0,
271
+ "step": 290
272
+ },
273
+ {
274
+ "epoch": 0.5850804485616772,
275
+ "grad_norm": 0.5427853465080261,
276
+ "learning_rate": 1.8832031250000002e-05,
277
+ "loss": 5.0565,
278
+ "mean_token_accuracy": 0.2316149313002825,
279
+ "num_tokens": 27782058.0,
280
+ "step": 300
281
+ },
282
+ {
283
+ "epoch": 0.6045831301803998,
284
+ "grad_norm": 0.9386640191078186,
285
+ "learning_rate": 1.8792968750000004e-05,
286
+ "loss": 5.0003,
287
+ "mean_token_accuracy": 0.2349798556417227,
288
+ "num_tokens": 28707140.0,
289
+ "step": 310
290
+ },
291
+ {
292
+ "epoch": 0.6240858117991224,
293
+ "grad_norm": 0.8473969101905823,
294
+ "learning_rate": 1.8753906250000002e-05,
295
+ "loss": 4.9954,
296
+ "mean_token_accuracy": 0.23647231683135034,
297
+ "num_tokens": 29637908.0,
298
+ "step": 320
299
+ },
300
+ {
301
+ "epoch": 0.643588493417845,
302
+ "grad_norm": 0.5143874883651733,
303
+ "learning_rate": 1.8714843750000004e-05,
304
+ "loss": 4.9881,
305
+ "mean_token_accuracy": 0.23599626012146474,
306
+ "num_tokens": 30559254.0,
307
+ "step": 330
308
+ },
309
+ {
310
+ "epoch": 0.6630911750365676,
311
+ "grad_norm": 0.4523729979991913,
312
+ "learning_rate": 1.8675781250000002e-05,
313
+ "loss": 4.9695,
314
+ "mean_token_accuracy": 0.23746853992342948,
315
+ "num_tokens": 31487152.0,
316
+ "step": 340
317
+ },
318
+ {
319
+ "epoch": 0.6825938566552902,
320
+ "grad_norm": 0.5573343634605408,
321
+ "learning_rate": 1.8636718750000004e-05,
322
+ "loss": 4.9525,
323
+ "mean_token_accuracy": 0.23925678990781307,
324
+ "num_tokens": 32396911.0,
325
+ "step": 350
326
+ },
327
+ {
328
+ "epoch": 0.7020965382740126,
329
+ "grad_norm": 1.2790151834487915,
330
+ "learning_rate": 1.8597656250000002e-05,
331
+ "loss": 4.9338,
332
+ "mean_token_accuracy": 0.24103106185793877,
333
+ "num_tokens": 33325126.0,
334
+ "step": 360
335
+ },
336
+ {
337
+ "epoch": 0.7215992198927352,
338
+ "grad_norm": 0.8714343309402466,
339
+ "learning_rate": 1.8558593750000003e-05,
340
+ "loss": 4.9252,
341
+ "mean_token_accuracy": 0.24138498678803444,
342
+ "num_tokens": 34262332.0,
343
+ "step": 370
344
+ },
345
+ {
346
+ "epoch": 0.7411019015114578,
347
+ "grad_norm": 0.5251726508140564,
348
+ "learning_rate": 1.851953125e-05,
349
+ "loss": 4.8883,
350
+ "mean_token_accuracy": 0.24456401653587817,
351
+ "num_tokens": 35181080.0,
352
+ "step": 380
353
+ },
354
+ {
355
+ "epoch": 0.7606045831301804,
356
+ "grad_norm": 0.46523743867874146,
357
+ "learning_rate": 1.8480468750000003e-05,
358
+ "loss": 4.8887,
359
+ "mean_token_accuracy": 0.24552332125604154,
360
+ "num_tokens": 36105038.0,
361
+ "step": 390
362
+ },
363
+ {
364
+ "epoch": 0.780107264748903,
365
+ "grad_norm": 0.5080934166908264,
366
+ "learning_rate": 1.844140625e-05,
367
+ "loss": 4.8659,
368
+ "mean_token_accuracy": 0.245796899497509,
369
+ "num_tokens": 37036998.0,
370
+ "step": 400
371
+ },
372
+ {
373
+ "epoch": 0.7996099463676255,
374
+ "grad_norm": 1.117693543434143,
375
+ "learning_rate": 1.8402343750000003e-05,
376
+ "loss": 4.8681,
377
+ "mean_token_accuracy": 0.2463846940547228,
378
+ "num_tokens": 37968683.0,
379
+ "step": 410
380
+ },
381
+ {
382
+ "epoch": 0.8191126279863481,
383
+ "grad_norm": 0.5299689769744873,
384
+ "learning_rate": 1.836328125e-05,
385
+ "loss": 4.8305,
386
+ "mean_token_accuracy": 0.24968006946146487,
387
+ "num_tokens": 38888752.0,
388
+ "step": 420
389
+ },
390
+ {
391
+ "epoch": 0.8386153096050707,
392
+ "grad_norm": 0.8458639979362488,
393
+ "learning_rate": 1.8324218750000003e-05,
394
+ "loss": 4.8279,
395
+ "mean_token_accuracy": 0.25057865455746653,
396
+ "num_tokens": 39823608.0,
397
+ "step": 430
398
+ },
399
+ {
400
+ "epoch": 0.8581179912237933,
401
+ "grad_norm": 0.5429758429527283,
402
+ "learning_rate": 1.828515625e-05,
403
+ "loss": 4.8243,
404
+ "mean_token_accuracy": 0.2514403607696295,
405
+ "num_tokens": 40748246.0,
406
+ "step": 440
407
+ },
408
+ {
409
+ "epoch": 0.8776206728425159,
410
+ "grad_norm": 0.47386595606803894,
411
+ "learning_rate": 1.8246093750000003e-05,
412
+ "loss": 4.7881,
413
+ "mean_token_accuracy": 0.25412631034851074,
414
+ "num_tokens": 41662563.0,
415
+ "step": 450
416
+ },
417
+ {
418
+ "epoch": 0.8971233544612384,
419
+ "grad_norm": 0.41789767146110535,
420
+ "learning_rate": 1.820703125e-05,
421
+ "loss": 4.7944,
422
+ "mean_token_accuracy": 0.25327568165957925,
423
+ "num_tokens": 42588923.0,
424
+ "step": 460
425
+ },
426
+ {
427
+ "epoch": 0.916626036079961,
428
+ "grad_norm": 0.43711453676223755,
429
+ "learning_rate": 1.8167968750000002e-05,
430
+ "loss": 4.7758,
431
+ "mean_token_accuracy": 0.2548953540623188,
432
+ "num_tokens": 43515886.0,
433
+ "step": 470
434
+ },
435
+ {
436
+ "epoch": 0.9361287176986836,
437
+ "grad_norm": 0.6433466076850891,
438
+ "learning_rate": 1.8128906250000004e-05,
439
+ "loss": 4.7632,
440
+ "mean_token_accuracy": 0.2562540594488382,
441
+ "num_tokens": 44446196.0,
442
+ "step": 480
443
+ },
444
+ {
445
+ "epoch": 0.9556313993174061,
446
+ "grad_norm": 0.7580122351646423,
447
+ "learning_rate": 1.8089843750000002e-05,
448
+ "loss": 4.7559,
449
+ "mean_token_accuracy": 0.25687045492231847,
450
+ "num_tokens": 45389436.0,
451
+ "step": 490
452
+ },
453
+ {
454
+ "epoch": 0.9751340809361287,
455
+ "grad_norm": 0.9800453782081604,
456
+ "learning_rate": 1.8050781250000004e-05,
457
+ "loss": 4.7711,
458
+ "mean_token_accuracy": 0.2573121260851622,
459
+ "num_tokens": 46313035.0,
460
+ "step": 500
461
+ },
462
+ {
463
+ "epoch": 0.9946367625548513,
464
+ "grad_norm": 0.636842668056488,
465
+ "learning_rate": 1.8011718750000002e-05,
466
+ "loss": 4.7693,
467
+ "mean_token_accuracy": 0.2563398856669664,
468
+ "num_tokens": 47231018.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 1.0156021452949782,
473
+ "grad_norm": 0.9280142188072205,
474
+ "learning_rate": 1.7972656250000004e-05,
475
+ "loss": 5.2112,
476
+ "mean_token_accuracy": 0.25861204106633257,
477
+ "num_tokens": 48181298.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 1.0351048269137006,
482
+ "grad_norm": 0.5360897183418274,
483
+ "learning_rate": 1.7933593750000002e-05,
484
+ "loss": 4.7274,
485
+ "mean_token_accuracy": 0.26002744026482105,
486
+ "num_tokens": 49105264.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 1.0546075085324231,
491
+ "grad_norm": 0.5811850428581238,
492
+ "learning_rate": 1.7894531250000003e-05,
493
+ "loss": 4.7052,
494
+ "mean_token_accuracy": 0.2608158510178328,
495
+ "num_tokens": 50033001.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 1.0741101901511458,
500
+ "grad_norm": 0.9147205352783203,
501
+ "learning_rate": 1.785546875e-05,
502
+ "loss": 4.6854,
503
+ "mean_token_accuracy": 0.26275911666452884,
504
+ "num_tokens": 50954465.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 1.0936128717698683,
509
+ "grad_norm": 0.46637555956840515,
510
+ "learning_rate": 1.7816406250000003e-05,
511
+ "loss": 4.6969,
512
+ "mean_token_accuracy": 0.26292436122894286,
513
+ "num_tokens": 51876402.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 1.113115553388591,
518
+ "grad_norm": 1.1488078832626343,
519
+ "learning_rate": 1.777734375e-05,
520
+ "loss": 4.6934,
521
+ "mean_token_accuracy": 0.26502432897686956,
522
+ "num_tokens": 52789774.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 1.1326182350073135,
527
+ "grad_norm": 0.42444393038749695,
528
+ "learning_rate": 1.7738281250000003e-05,
529
+ "loss": 4.6616,
530
+ "mean_token_accuracy": 0.2661720596253872,
531
+ "num_tokens": 53700514.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 1.152120916626036,
536
+ "grad_norm": 0.46090322732925415,
537
+ "learning_rate": 1.769921875e-05,
538
+ "loss": 4.6677,
539
+ "mean_token_accuracy": 0.2653431937098503,
540
+ "num_tokens": 54637906.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 1.1716235982447587,
545
+ "grad_norm": 1.0310624837875366,
546
+ "learning_rate": 1.7660156250000003e-05,
547
+ "loss": 4.6441,
548
+ "mean_token_accuracy": 0.26668640449643133,
549
+ "num_tokens": 55566104.0,
550
+ "step": 600
551
+ },
552
+ {
553
+ "epoch": 1.1911262798634812,
554
+ "grad_norm": 0.5190272927284241,
555
+ "learning_rate": 1.762109375e-05,
556
+ "loss": 4.6388,
557
+ "mean_token_accuracy": 0.26863499656319617,
558
+ "num_tokens": 56475233.0,
559
+ "step": 610
560
+ },
561
+ {
562
+ "epoch": 1.210628961482204,
563
+ "grad_norm": 0.4435961842536926,
564
+ "learning_rate": 1.7582031250000003e-05,
565
+ "loss": 4.6344,
566
+ "mean_token_accuracy": 0.27009780779480935,
567
+ "num_tokens": 57393208.0,
568
+ "step": 620
569
+ },
570
+ {
571
+ "epoch": 1.2301316431009264,
572
+ "grad_norm": 1.5555769205093384,
573
+ "learning_rate": 1.754296875e-05,
574
+ "loss": 4.6239,
575
+ "mean_token_accuracy": 0.26839635893702507,
576
+ "num_tokens": 58320792.0,
577
+ "step": 630
578
+ },
579
+ {
580
+ "epoch": 1.2496343247196489,
581
+ "grad_norm": 0.45114317536354065,
582
+ "learning_rate": 1.7503906250000002e-05,
583
+ "loss": 4.6231,
584
+ "mean_token_accuracy": 0.26895947232842443,
585
+ "num_tokens": 59247413.0,
586
+ "step": 640
587
+ },
588
+ {
589
+ "epoch": 1.2691370063383716,
590
+ "grad_norm": 0.5050057768821716,
591
+ "learning_rate": 1.7464843750000004e-05,
592
+ "loss": 4.6231,
593
+ "mean_token_accuracy": 0.2689752779901028,
594
+ "num_tokens": 60180529.0,
595
+ "step": 650
596
+ },
597
+ {
598
+ "epoch": 1.288639687957094,
599
+ "grad_norm": 0.4494447708129883,
600
+ "learning_rate": 1.7425781250000002e-05,
601
+ "loss": 4.5939,
602
+ "mean_token_accuracy": 0.2701444610953331,
603
+ "num_tokens": 61116823.0,
604
+ "step": 660
605
+ },
606
+ {
607
+ "epoch": 1.3081423695758168,
608
+ "grad_norm": 0.5126622915267944,
609
+ "learning_rate": 1.7386718750000004e-05,
610
+ "loss": 4.5905,
611
+ "mean_token_accuracy": 0.27250412106513977,
612
+ "num_tokens": 62036873.0,
613
+ "step": 670
614
+ },
615
+ {
616
+ "epoch": 1.3276450511945392,
617
+ "grad_norm": 0.4027528166770935,
618
+ "learning_rate": 1.7347656250000002e-05,
619
+ "loss": 4.5702,
620
+ "mean_token_accuracy": 0.2741738684475422,
621
+ "num_tokens": 62955213.0,
622
+ "step": 680
623
+ },
624
+ {
625
+ "epoch": 1.3471477328132617,
626
+ "grad_norm": 0.42226913571357727,
627
+ "learning_rate": 1.7308593750000004e-05,
628
+ "loss": 4.6027,
629
+ "mean_token_accuracy": 0.2714010961353779,
630
+ "num_tokens": 63879282.0,
631
+ "step": 690
632
+ },
633
+ {
634
+ "epoch": 1.3666504144319844,
635
+ "grad_norm": 0.7456600069999695,
636
+ "learning_rate": 1.7269531250000002e-05,
637
+ "loss": 4.6076,
638
+ "mean_token_accuracy": 0.27038322016596794,
639
+ "num_tokens": 64813842.0,
640
+ "step": 700
641
+ },
642
+ {
643
+ "epoch": 1.386153096050707,
644
+ "grad_norm": 0.4936697483062744,
645
+ "learning_rate": 1.7230468750000003e-05,
646
+ "loss": 4.5808,
647
+ "mean_token_accuracy": 0.27259208634495735,
648
+ "num_tokens": 65750164.0,
649
+ "step": 710
650
+ },
651
+ {
652
+ "epoch": 1.4056557776694296,
653
+ "grad_norm": 0.40352705121040344,
654
+ "learning_rate": 1.719140625e-05,
655
+ "loss": 4.5687,
656
+ "mean_token_accuracy": 0.2735755704343319,
657
+ "num_tokens": 66671414.0,
658
+ "step": 720
659
+ },
660
+ {
661
+ "epoch": 1.425158459288152,
662
+ "grad_norm": 0.5929037928581238,
663
+ "learning_rate": 1.7152343750000003e-05,
664
+ "loss": 4.5549,
665
+ "mean_token_accuracy": 0.27455407530069353,
666
+ "num_tokens": 67601565.0,
667
+ "step": 730
668
+ },
669
+ {
670
+ "epoch": 1.4446611409068746,
671
+ "grad_norm": 2.1730380058288574,
672
+ "learning_rate": 1.711328125e-05,
673
+ "loss": 4.5471,
674
+ "mean_token_accuracy": 0.2747652716934681,
675
+ "num_tokens": 68543369.0,
676
+ "step": 740
677
+ },
678
+ {
679
+ "epoch": 1.4641638225255973,
680
+ "grad_norm": 0.45310553908348083,
681
+ "learning_rate": 1.7074218750000003e-05,
682
+ "loss": 4.5379,
683
+ "mean_token_accuracy": 0.2759984292089939,
684
+ "num_tokens": 69475220.0,
685
+ "step": 750
686
+ },
687
+ {
688
+ "epoch": 1.4836665041443198,
689
+ "grad_norm": 0.44060948491096497,
690
+ "learning_rate": 1.703515625e-05,
691
+ "loss": 4.5207,
692
+ "mean_token_accuracy": 0.2781739257276058,
693
+ "num_tokens": 70397859.0,
694
+ "step": 760
695
+ },
696
+ {
697
+ "epoch": 1.5031691857630425,
698
+ "grad_norm": 0.9775242209434509,
699
+ "learning_rate": 1.6996093750000003e-05,
700
+ "loss": 4.5283,
701
+ "mean_token_accuracy": 0.27766570150852204,
702
+ "num_tokens": 71329289.0,
703
+ "step": 770
704
+ },
705
+ {
706
+ "epoch": 1.522671867381765,
707
+ "grad_norm": 0.476166695356369,
708
+ "learning_rate": 1.695703125e-05,
709
+ "loss": 4.5227,
710
+ "mean_token_accuracy": 0.2781515374779701,
711
+ "num_tokens": 72251787.0,
712
+ "step": 780
713
+ },
714
+ {
715
+ "epoch": 1.5421745490004874,
716
+ "grad_norm": 0.7697413563728333,
717
+ "learning_rate": 1.6917968750000003e-05,
718
+ "loss": 4.5089,
719
+ "mean_token_accuracy": 0.2777483291924,
720
+ "num_tokens": 73184759.0,
721
+ "step": 790
722
+ },
723
+ {
724
+ "epoch": 1.5616772306192102,
725
+ "grad_norm": 0.39053142070770264,
726
+ "learning_rate": 1.687890625e-05,
727
+ "loss": 4.5224,
728
+ "mean_token_accuracy": 0.2780651919543743,
729
+ "num_tokens": 74104519.0,
730
+ "step": 800
731
+ },
732
+ {
733
+ "epoch": 1.5811799122379329,
734
+ "grad_norm": 0.5136573910713196,
735
+ "learning_rate": 1.6839843750000002e-05,
736
+ "loss": 4.5168,
737
+ "mean_token_accuracy": 0.27778707146644593,
738
+ "num_tokens": 75041132.0,
739
+ "step": 810
740
+ },
741
+ {
742
+ "epoch": 1.6006825938566553,
743
+ "grad_norm": 0.4006953239440918,
744
+ "learning_rate": 1.6800781250000004e-05,
745
+ "loss": 4.4969,
746
+ "mean_token_accuracy": 0.2798936806619167,
747
+ "num_tokens": 75957080.0,
748
+ "step": 820
749
+ },
750
+ {
751
+ "epoch": 1.6201852754753778,
752
+ "grad_norm": 0.8261349201202393,
753
+ "learning_rate": 1.6761718750000002e-05,
754
+ "loss": 4.503,
755
+ "mean_token_accuracy": 0.27909068912267687,
756
+ "num_tokens": 76885767.0,
757
+ "step": 830
758
+ },
759
+ {
760
+ "epoch": 1.6396879570941003,
761
+ "grad_norm": 0.6244539022445679,
762
+ "learning_rate": 1.6722656250000004e-05,
763
+ "loss": 4.4941,
764
+ "mean_token_accuracy": 0.27921902686357497,
765
+ "num_tokens": 77828819.0,
766
+ "step": 840
767
+ },
768
+ {
769
+ "epoch": 1.659190638712823,
770
+ "grad_norm": 0.3824380040168762,
771
+ "learning_rate": 1.6683593750000002e-05,
772
+ "loss": 4.5036,
773
+ "mean_token_accuracy": 0.2791468746960163,
774
+ "num_tokens": 78767094.0,
775
+ "step": 850
776
+ },
777
+ {
778
+ "epoch": 1.6786933203315457,
779
+ "grad_norm": 0.41238024830818176,
780
+ "learning_rate": 1.6644531250000004e-05,
781
+ "loss": 4.498,
782
+ "mean_token_accuracy": 0.28029546365141866,
783
+ "num_tokens": 79699908.0,
784
+ "step": 860
785
+ },
786
+ {
787
+ "epoch": 1.6981960019502682,
788
+ "grad_norm": 0.4335004389286041,
789
+ "learning_rate": 1.6605468750000002e-05,
790
+ "loss": 4.4684,
791
+ "mean_token_accuracy": 0.2824758395552635,
792
+ "num_tokens": 80618966.0,
793
+ "step": 870
794
+ },
795
+ {
796
+ "epoch": 1.7176986835689907,
797
+ "grad_norm": 0.48968249559402466,
798
+ "learning_rate": 1.6566406250000003e-05,
799
+ "loss": 4.4526,
800
+ "mean_token_accuracy": 0.28368064761161804,
801
+ "num_tokens": 81545205.0,
802
+ "step": 880
803
+ },
804
+ {
805
+ "epoch": 1.7372013651877132,
806
+ "grad_norm": 0.41890937089920044,
807
+ "learning_rate": 1.652734375e-05,
808
+ "loss": 4.4734,
809
+ "mean_token_accuracy": 0.2822652608156204,
810
+ "num_tokens": 82463087.0,
811
+ "step": 890
812
+ },
813
+ {
814
+ "epoch": 1.7567040468064359,
815
+ "grad_norm": 0.40375128388404846,
816
+ "learning_rate": 1.6488281250000003e-05,
817
+ "loss": 4.4329,
818
+ "mean_token_accuracy": 0.2864795848727226,
819
+ "num_tokens": 83384315.0,
820
+ "step": 900
821
+ },
822
+ {
823
+ "epoch": 1.7762067284251586,
824
+ "grad_norm": 1.6861543655395508,
825
+ "learning_rate": 1.644921875e-05,
826
+ "loss": 4.4701,
827
+ "mean_token_accuracy": 0.2822112552821636,
828
+ "num_tokens": 84314435.0,
829
+ "step": 910
830
+ },
831
+ {
832
+ "epoch": 1.795709410043881,
833
+ "grad_norm": 0.42918047308921814,
834
+ "learning_rate": 1.6410156250000003e-05,
835
+ "loss": 4.4543,
836
+ "mean_token_accuracy": 0.282807744294405,
837
+ "num_tokens": 85248307.0,
838
+ "step": 920
839
+ },
840
+ {
841
+ "epoch": 1.8152120916626036,
842
+ "grad_norm": 0.38594943284988403,
843
+ "learning_rate": 1.637109375e-05,
844
+ "loss": 4.4509,
845
+ "mean_token_accuracy": 0.28484038934111594,
846
+ "num_tokens": 86168104.0,
847
+ "step": 930
848
+ },
849
+ {
850
+ "epoch": 1.834714773281326,
851
+ "grad_norm": 0.37766233086586,
852
+ "learning_rate": 1.6332031250000003e-05,
853
+ "loss": 4.4378,
854
+ "mean_token_accuracy": 0.28509455919265747,
855
+ "num_tokens": 87085294.0,
856
+ "step": 940
857
+ },
858
+ {
859
+ "epoch": 1.8542174549000487,
860
+ "grad_norm": 0.5309925079345703,
861
+ "learning_rate": 1.629296875e-05,
862
+ "loss": 4.45,
863
+ "mean_token_accuracy": 0.283291470259428,
864
+ "num_tokens": 88022698.0,
865
+ "step": 950
866
+ },
867
+ {
868
+ "epoch": 1.8737201365187715,
869
+ "grad_norm": 0.5312850475311279,
870
+ "learning_rate": 1.6253906250000002e-05,
871
+ "loss": 4.3849,
872
+ "mean_token_accuracy": 0.28923906683921813,
873
+ "num_tokens": 88947429.0,
874
+ "step": 960
875
+ },
876
+ {
877
+ "epoch": 1.893222818137494,
878
+ "grad_norm": 0.5679749250411987,
879
+ "learning_rate": 1.621484375e-05,
880
+ "loss": 4.3953,
881
+ "mean_token_accuracy": 0.28862822949886324,
882
+ "num_tokens": 89878787.0,
883
+ "step": 970
884
+ },
885
+ {
886
+ "epoch": 1.9127254997562164,
887
+ "grad_norm": 0.6506769061088562,
888
+ "learning_rate": 1.6175781250000002e-05,
889
+ "loss": 4.4121,
890
+ "mean_token_accuracy": 0.287344753742218,
891
+ "num_tokens": 90794282.0,
892
+ "step": 980
893
+ },
894
+ {
895
+ "epoch": 1.932228181374939,
896
+ "grad_norm": 0.5218345522880554,
897
+ "learning_rate": 1.6136718750000004e-05,
898
+ "loss": 4.422,
899
+ "mean_token_accuracy": 0.28664510771632196,
900
+ "num_tokens": 91720204.0,
901
+ "step": 990
902
+ },
903
+ {
904
+ "epoch": 1.9517308629936616,
905
+ "grad_norm": 0.38746026158332825,
906
+ "learning_rate": 1.6097656250000002e-05,
907
+ "loss": 4.3909,
908
+ "mean_token_accuracy": 0.2888357400894165,
909
+ "num_tokens": 92640826.0,
910
+ "step": 1000
911
+ },
912
+ {
913
+ "epoch": 1.9712335446123843,
914
+ "grad_norm": 0.4358104467391968,
915
+ "learning_rate": 1.6058593750000004e-05,
916
+ "loss": 4.4009,
917
+ "mean_token_accuracy": 0.28872263357043265,
918
+ "num_tokens": 93557645.0,
919
+ "step": 1010
920
+ },
921
+ {
922
+ "epoch": 1.9907362262311068,
923
+ "grad_norm": 0.399600088596344,
924
+ "learning_rate": 1.6019531250000002e-05,
925
+ "loss": 4.41,
926
+ "mean_token_accuracy": 0.28658533096313477,
927
+ "num_tokens": 94488514.0,
928
+ "step": 1020
929
+ },
930
+ {
931
+ "epoch": 2.0117016089712334,
932
+ "grad_norm": 0.4287355840206146,
933
+ "learning_rate": 1.5980468750000003e-05,
934
+ "loss": 4.8637,
935
+ "mean_token_accuracy": 0.2884528564243782,
936
+ "num_tokens": 95429559.0,
937
+ "step": 1030
938
+ },
939
+ {
940
+ "epoch": 2.0312042905899563,
941
+ "grad_norm": 0.43448057770729065,
942
+ "learning_rate": 1.594140625e-05,
943
+ "loss": 4.411,
944
+ "mean_token_accuracy": 0.28822447881102564,
945
+ "num_tokens": 96370969.0,
946
+ "step": 1040
947
+ },
948
+ {
949
+ "epoch": 2.050706972208679,
950
+ "grad_norm": 0.4509051442146301,
951
+ "learning_rate": 1.5902343750000003e-05,
952
+ "loss": 4.3998,
953
+ "mean_token_accuracy": 0.2875743143260479,
954
+ "num_tokens": 97297487.0,
955
+ "step": 1050
956
+ },
957
+ {
958
+ "epoch": 2.0702096538274013,
959
+ "grad_norm": 0.4603135585784912,
960
+ "learning_rate": 1.586328125e-05,
961
+ "loss": 4.3895,
962
+ "mean_token_accuracy": 0.28888514786958697,
963
+ "num_tokens": 98234116.0,
964
+ "step": 1060
965
+ },
966
+ {
967
+ "epoch": 2.089712335446124,
968
+ "grad_norm": 0.45260968804359436,
969
+ "learning_rate": 1.5824218750000003e-05,
970
+ "loss": 4.3827,
971
+ "mean_token_accuracy": 0.28953884318470957,
972
+ "num_tokens": 99158170.0,
973
+ "step": 1070
974
+ },
975
+ {
976
+ "epoch": 2.1092150170648463,
977
+ "grad_norm": 0.4549092650413513,
978
+ "learning_rate": 1.578515625e-05,
979
+ "loss": 4.3818,
980
+ "mean_token_accuracy": 0.2901176653802395,
981
+ "num_tokens": 100089007.0,
982
+ "step": 1080
983
+ },
984
+ {
985
+ "epoch": 2.128717698683569,
986
+ "grad_norm": 0.4202571213245392,
987
+ "learning_rate": 1.5746093750000003e-05,
988
+ "loss": 4.3617,
989
+ "mean_token_accuracy": 0.2919108562171459,
990
+ "num_tokens": 101002323.0,
991
+ "step": 1090
992
+ },
993
+ {
994
+ "epoch": 2.1482203803022917,
995
+ "grad_norm": 0.5119932889938354,
996
+ "learning_rate": 1.570703125e-05,
997
+ "loss": 4.365,
998
+ "mean_token_accuracy": 0.2918895035982132,
999
+ "num_tokens": 101915323.0,
1000
+ "step": 1100
1001
+ },
1002
+ {
1003
+ "epoch": 2.167723061921014,
1004
+ "grad_norm": 0.49400025606155396,
1005
+ "learning_rate": 1.5667968750000003e-05,
1006
+ "loss": 4.3662,
1007
+ "mean_token_accuracy": 0.29112903624773023,
1008
+ "num_tokens": 102848704.0,
1009
+ "step": 1110
1010
+ },
1011
+ {
1012
+ "epoch": 2.1872257435397366,
1013
+ "grad_norm": 9.812466621398926,
1014
+ "learning_rate": 1.562890625e-05,
1015
+ "loss": 4.3536,
1016
+ "mean_token_accuracy": 0.29283427745103835,
1017
+ "num_tokens": 103770127.0,
1018
+ "step": 1120
1019
+ },
1020
+ {
1021
+ "epoch": 2.206728425158459,
1022
+ "grad_norm": 0.6520562171936035,
1023
+ "learning_rate": 1.5589843750000002e-05,
1024
+ "loss": 4.3685,
1025
+ "mean_token_accuracy": 0.29140080511569977,
1026
+ "num_tokens": 104696965.0,
1027
+ "step": 1130
1028
+ },
1029
+ {
1030
+ "epoch": 2.226231106777182,
1031
+ "grad_norm": 0.3824687600135803,
1032
+ "learning_rate": 1.555078125e-05,
1033
+ "loss": 4.3617,
1034
+ "mean_token_accuracy": 0.29200059548020363,
1035
+ "num_tokens": 105627411.0,
1036
+ "step": 1140
1037
+ },
1038
+ {
1039
+ "epoch": 2.2457337883959045,
1040
+ "grad_norm": 0.40885069966316223,
1041
+ "learning_rate": 1.5511718750000002e-05,
1042
+ "loss": 4.3486,
1043
+ "mean_token_accuracy": 0.29261764511466026,
1044
+ "num_tokens": 106560394.0,
1045
+ "step": 1150
1046
+ },
1047
+ {
1048
+ "epoch": 2.265236470014627,
1049
+ "grad_norm": 0.5578988194465637,
1050
+ "learning_rate": 1.5472656250000004e-05,
1051
+ "loss": 4.3477,
1052
+ "mean_token_accuracy": 0.2928113825619221,
1053
+ "num_tokens": 107484781.0,
1054
+ "step": 1160
1055
+ },
1056
+ {
1057
+ "epoch": 2.2847391516333495,
1058
+ "grad_norm": 2.4490866661071777,
1059
+ "learning_rate": 1.5433593750000002e-05,
1060
+ "loss": 4.3497,
1061
+ "mean_token_accuracy": 0.2932279795408249,
1062
+ "num_tokens": 108403123.0,
1063
+ "step": 1170
1064
+ },
1065
+ {
1066
+ "epoch": 2.304241833252072,
1067
+ "grad_norm": 0.4807080030441284,
1068
+ "learning_rate": 1.5394531250000004e-05,
1069
+ "loss": 4.3279,
1070
+ "mean_token_accuracy": 0.2941134661436081,
1071
+ "num_tokens": 109334624.0,
1072
+ "step": 1180
1073
+ },
1074
+ {
1075
+ "epoch": 2.323744514870795,
1076
+ "grad_norm": 0.7090457677841187,
1077
+ "learning_rate": 1.5355468750000002e-05,
1078
+ "loss": 4.345,
1079
+ "mean_token_accuracy": 0.2933297656476498,
1080
+ "num_tokens": 110262463.0,
1081
+ "step": 1190
1082
+ },
1083
+ {
1084
+ "epoch": 2.3432471964895174,
1085
+ "grad_norm": 0.46787402033805847,
1086
+ "learning_rate": 1.5316406250000003e-05,
1087
+ "loss": 4.3362,
1088
+ "mean_token_accuracy": 0.2946368932723999,
1089
+ "num_tokens": 111180510.0,
1090
+ "step": 1200
1091
+ },
1092
+ {
1093
+ "epoch": 2.36274987810824,
1094
+ "grad_norm": 0.45472535490989685,
1095
+ "learning_rate": 1.527734375e-05,
1096
+ "loss": 4.32,
1097
+ "mean_token_accuracy": 0.2949611395597458,
1098
+ "num_tokens": 112102026.0,
1099
+ "step": 1210
1100
+ },
1101
+ {
1102
+ "epoch": 2.3822525597269624,
1103
+ "grad_norm": 0.5668436288833618,
1104
+ "learning_rate": 1.5238281250000002e-05,
1105
+ "loss": 4.3424,
1106
+ "mean_token_accuracy": 0.2933102063834667,
1107
+ "num_tokens": 113030329.0,
1108
+ "step": 1220
1109
+ },
1110
+ {
1111
+ "epoch": 2.401755241345685,
1112
+ "grad_norm": 0.446575403213501,
1113
+ "learning_rate": 1.5199218750000001e-05,
1114
+ "loss": 4.3253,
1115
+ "mean_token_accuracy": 0.29592231959104537,
1116
+ "num_tokens": 113952583.0,
1117
+ "step": 1230
1118
+ },
1119
+ {
1120
+ "epoch": 2.421257922964408,
1121
+ "grad_norm": 0.47586357593536377,
1122
+ "learning_rate": 1.5160156250000001e-05,
1123
+ "loss": 4.3155,
1124
+ "mean_token_accuracy": 0.2961124524474144,
1125
+ "num_tokens": 114874886.0,
1126
+ "step": 1240
1127
+ },
1128
+ {
1129
+ "epoch": 2.4407606045831303,
1130
+ "grad_norm": 0.5272416472434998,
1131
+ "learning_rate": 1.5121093750000003e-05,
1132
+ "loss": 4.3256,
1133
+ "mean_token_accuracy": 0.29535831734538076,
1134
+ "num_tokens": 115809893.0,
1135
+ "step": 1250
1136
+ },
1137
+ {
1138
+ "epoch": 2.4602632862018528,
1139
+ "grad_norm": 0.5159743428230286,
1140
+ "learning_rate": 1.5082031250000003e-05,
1141
+ "loss": 4.3045,
1142
+ "mean_token_accuracy": 0.2964573077857494,
1143
+ "num_tokens": 116738350.0,
1144
+ "step": 1260
1145
+ },
1146
+ {
1147
+ "epoch": 2.4797659678205752,
1148
+ "grad_norm": 0.3612087070941925,
1149
+ "learning_rate": 1.5042968750000003e-05,
1150
+ "loss": 4.3171,
1151
+ "mean_token_accuracy": 0.296286004781723,
1152
+ "num_tokens": 117673838.0,
1153
+ "step": 1270
1154
+ },
1155
+ {
1156
+ "epoch": 2.4992686494392977,
1157
+ "grad_norm": 1.1809757947921753,
1158
+ "learning_rate": 1.5003906250000003e-05,
1159
+ "loss": 4.3191,
1160
+ "mean_token_accuracy": 0.2969906762242317,
1161
+ "num_tokens": 118603195.0,
1162
+ "step": 1280
1163
+ },
1164
+ {
1165
+ "epoch": 2.51877133105802,
1166
+ "grad_norm": 0.6246888041496277,
1167
+ "learning_rate": 1.4964843750000002e-05,
1168
+ "loss": 4.3009,
1169
+ "mean_token_accuracy": 0.2975566402077675,
1170
+ "num_tokens": 119519063.0,
1171
+ "step": 1290
1172
+ },
1173
+ {
1174
+ "epoch": 2.538274012676743,
1175
+ "grad_norm": 0.8195675611495972,
1176
+ "learning_rate": 1.4925781250000002e-05,
1177
+ "loss": 4.3006,
1178
+ "mean_token_accuracy": 0.297472283244133,
1179
+ "num_tokens": 120445030.0,
1180
+ "step": 1300
1181
+ },
1182
+ {
1183
+ "epoch": 2.5577766942954656,
1184
+ "grad_norm": 0.4961223602294922,
1185
+ "learning_rate": 1.4886718750000002e-05,
1186
+ "loss": 4.2969,
1187
+ "mean_token_accuracy": 0.29855757504701613,
1188
+ "num_tokens": 121369449.0,
1189
+ "step": 1310
1190
+ },
1191
+ {
1192
+ "epoch": 2.577279375914188,
1193
+ "grad_norm": 0.5146915912628174,
1194
+ "learning_rate": 1.4847656250000002e-05,
1195
+ "loss": 4.2858,
1196
+ "mean_token_accuracy": 0.2985923945903778,
1197
+ "num_tokens": 122298443.0,
1198
+ "step": 1320
1199
+ },
1200
+ {
1201
+ "epoch": 2.596782057532911,
1202
+ "grad_norm": 0.6109800934791565,
1203
+ "learning_rate": 1.4808593750000002e-05,
1204
+ "loss": 4.2935,
1205
+ "mean_token_accuracy": 0.29859942123293876,
1206
+ "num_tokens": 123233917.0,
1207
+ "step": 1330
1208
+ },
1209
+ {
1210
+ "epoch": 2.6162847391516335,
1211
+ "grad_norm": 0.40669572353363037,
1212
+ "learning_rate": 1.4769531250000002e-05,
1213
+ "loss": 4.2888,
1214
+ "mean_token_accuracy": 0.29900421276688577,
1215
+ "num_tokens": 124161805.0,
1216
+ "step": 1340
1217
+ },
1218
+ {
1219
+ "epoch": 2.635787420770356,
1220
+ "grad_norm": 1.3442695140838623,
1221
+ "learning_rate": 1.4730468750000002e-05,
1222
+ "loss": 4.2757,
1223
+ "mean_token_accuracy": 0.2997878722846508,
1224
+ "num_tokens": 125085090.0,
1225
+ "step": 1350
1226
+ },
1227
+ {
1228
+ "epoch": 2.6552901023890785,
1229
+ "grad_norm": 0.5308565497398376,
1230
+ "learning_rate": 1.4691406250000002e-05,
1231
+ "loss": 4.2859,
1232
+ "mean_token_accuracy": 0.29943727552890775,
1233
+ "num_tokens": 126012123.0,
1234
+ "step": 1360
1235
+ },
1236
+ {
1237
+ "epoch": 2.674792784007801,
1238
+ "grad_norm": 0.5062427520751953,
1239
+ "learning_rate": 1.4652343750000002e-05,
1240
+ "loss": 4.2803,
1241
+ "mean_token_accuracy": 0.2989941954612732,
1242
+ "num_tokens": 126935848.0,
1243
+ "step": 1370
1244
+ },
1245
+ {
1246
+ "epoch": 2.6942954656265234,
1247
+ "grad_norm": 0.41506361961364746,
1248
+ "learning_rate": 1.4613281250000002e-05,
1249
+ "loss": 4.2803,
1250
+ "mean_token_accuracy": 0.3002371557056904,
1251
+ "num_tokens": 127857739.0,
1252
+ "step": 1380
1253
+ },
1254
+ {
1255
+ "epoch": 2.7137981472452464,
1256
+ "grad_norm": 0.44968003034591675,
1257
+ "learning_rate": 1.4574218750000001e-05,
1258
+ "loss": 4.2577,
1259
+ "mean_token_accuracy": 0.30137933045625687,
1260
+ "num_tokens": 128776712.0,
1261
+ "step": 1390
1262
+ },
1263
+ {
1264
+ "epoch": 2.733300828863969,
1265
+ "grad_norm": 0.41343918442726135,
1266
+ "learning_rate": 1.4535156250000001e-05,
1267
+ "loss": 4.2617,
1268
+ "mean_token_accuracy": 0.3014510445296764,
1269
+ "num_tokens": 129707252.0,
1270
+ "step": 1400
1271
+ },
1272
+ {
1273
+ "epoch": 2.7528035104826913,
1274
+ "grad_norm": 0.7177313566207886,
1275
+ "learning_rate": 1.4496093750000001e-05,
1276
+ "loss": 4.2673,
1277
+ "mean_token_accuracy": 0.3009005382657051,
1278
+ "num_tokens": 130637763.0,
1279
+ "step": 1410
1280
+ },
1281
+ {
1282
+ "epoch": 2.772306192101414,
1283
+ "grad_norm": 1.7760525941848755,
1284
+ "learning_rate": 1.4457031250000003e-05,
1285
+ "loss": 4.2834,
1286
+ "mean_token_accuracy": 0.2999152898788452,
1287
+ "num_tokens": 131573493.0,
1288
+ "step": 1420
1289
+ },
1290
+ {
1291
+ "epoch": 2.7918088737201368,
1292
+ "grad_norm": 0.397335022687912,
1293
+ "learning_rate": 1.4417968750000003e-05,
1294
+ "loss": 4.2466,
1295
+ "mean_token_accuracy": 0.30402503311634066,
1296
+ "num_tokens": 132504591.0,
1297
+ "step": 1430
1298
+ },
1299
+ {
1300
+ "epoch": 2.8113115553388592,
1301
+ "grad_norm": 0.3949294686317444,
1302
+ "learning_rate": 1.4378906250000003e-05,
1303
+ "loss": 4.265,
1304
+ "mean_token_accuracy": 0.30095369294285773,
1305
+ "num_tokens": 133431529.0,
1306
+ "step": 1440
1307
+ },
1308
+ {
1309
+ "epoch": 2.8308142369575817,
1310
+ "grad_norm": 0.4513266682624817,
1311
+ "learning_rate": 1.4339843750000003e-05,
1312
+ "loss": 4.2622,
1313
+ "mean_token_accuracy": 0.30230883583426477,
1314
+ "num_tokens": 134351259.0,
1315
+ "step": 1450
1316
+ },
1317
+ {
1318
+ "epoch": 2.850316918576304,
1319
+ "grad_norm": 0.42385134100914,
1320
+ "learning_rate": 1.4300781250000002e-05,
1321
+ "loss": 4.2306,
1322
+ "mean_token_accuracy": 0.3048314802348614,
1323
+ "num_tokens": 135276683.0,
1324
+ "step": 1460
1325
+ },
1326
+ {
1327
+ "epoch": 2.8698196001950267,
1328
+ "grad_norm": 0.9934040307998657,
1329
+ "learning_rate": 1.4261718750000002e-05,
1330
+ "loss": 4.237,
1331
+ "mean_token_accuracy": 0.303445303440094,
1332
+ "num_tokens": 136210229.0,
1333
+ "step": 1470
1334
+ },
1335
+ {
1336
+ "epoch": 2.889322281813749,
1337
+ "grad_norm": 0.7958151698112488,
1338
+ "learning_rate": 1.4222656250000002e-05,
1339
+ "loss": 4.2307,
1340
+ "mean_token_accuracy": 0.305256237834692,
1341
+ "num_tokens": 137139018.0,
1342
+ "step": 1480
1343
+ },
1344
+ {
1345
+ "epoch": 2.908824963432472,
1346
+ "grad_norm": 0.5570520758628845,
1347
+ "learning_rate": 1.4183593750000002e-05,
1348
+ "loss": 4.2503,
1349
+ "mean_token_accuracy": 0.3026120513677597,
1350
+ "num_tokens": 138071332.0,
1351
+ "step": 1490
1352
+ },
1353
+ {
1354
+ "epoch": 2.9283276450511946,
1355
+ "grad_norm": 0.41619789600372314,
1356
+ "learning_rate": 1.4144531250000002e-05,
1357
+ "loss": 4.2189,
1358
+ "mean_token_accuracy": 0.3050854988396168,
1359
+ "num_tokens": 139000222.0,
1360
+ "step": 1500
1361
+ }
1362
+ ],
1363
+ "logging_steps": 10,
1364
+ "max_steps": 5120,
1365
+ "num_input_tokens_seen": 0,
1366
+ "num_train_epochs": 10,
1367
+ "save_steps": 500,
1368
+ "stateful_callbacks": {
1369
+ "TrainerControl": {
1370
+ "args": {
1371
+ "should_epoch_stop": false,
1372
+ "should_evaluate": false,
1373
+ "should_log": false,
1374
+ "should_save": true,
1375
+ "should_training_stop": false
1376
+ },
1377
+ "attributes": {}
1378
+ }
1379
+ },
1380
+ "total_flos": 2.0723985790953062e+17,
1381
+ "train_batch_size": 64,
1382
+ "trial_name": null,
1383
+ "trial_params": null
1384
+ }
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:213ef3ed16d9dd20d51f6355dc64c9dc5ebcaf8490efb503d6a15061df366d53
3
+ size 5624
checkpoint-2000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 1024,
14
+ "max_position_embeddings": 256,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 6,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 6,
20
+ "pad_token_id": 0,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-2000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75ee8078c79fbcea50176b5c0de43dcf949b63a1c8db09aca507a9a4c8baa786
3
+ size 800819936
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c086051c918c31c8d930bb19776487a25112c5071863632cebce0094f9eb801
3
+ size 1601820026
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f37c40ce327861a7ca13b719d3aa37510a143368b6e74358bdb14becb3899e1e
3
+ size 14244
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e9596bb1de5cdc578ada9b51bec55f2c2810e91409461788a07a6e2d139d11a
3
+ size 1064
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-2000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,1834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.9049244271087273,
6
+ "eval_steps": 500,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019502681618722574,
14
+ "grad_norm": 3.0208523273468018,
15
+ "learning_rate": 1.9964843750000004e-05,
16
+ "loss": 7.8867,
17
+ "mean_token_accuracy": 0.0920736625790596,
18
+ "num_tokens": 920759.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.03900536323744515,
23
+ "grad_norm": 2.2770543098449707,
24
+ "learning_rate": 1.9925781250000002e-05,
25
+ "loss": 7.5013,
26
+ "mean_token_accuracy": 0.10232679340988397,
27
+ "num_tokens": 1848077.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.05850804485616772,
32
+ "grad_norm": 1.6555824279785156,
33
+ "learning_rate": 1.9886718750000004e-05,
34
+ "loss": 7.0436,
35
+ "mean_token_accuracy": 0.11740029789507389,
36
+ "num_tokens": 2781210.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.0780107264748903,
41
+ "grad_norm": 1.7775338888168335,
42
+ "learning_rate": 1.9847656250000002e-05,
43
+ "loss": 6.7631,
44
+ "mean_token_accuracy": 0.1280333673581481,
45
+ "num_tokens": 3689316.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.09751340809361288,
50
+ "grad_norm": 1.2532657384872437,
51
+ "learning_rate": 1.9808593750000003e-05,
52
+ "loss": 6.515,
53
+ "mean_token_accuracy": 0.13626975379884243,
54
+ "num_tokens": 4616761.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.11701608971233544,
59
+ "grad_norm": 1.1137648820877075,
60
+ "learning_rate": 1.976953125e-05,
61
+ "loss": 6.3421,
62
+ "mean_token_accuracy": 0.1418815266340971,
63
+ "num_tokens": 5544631.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.13651877133105803,
68
+ "grad_norm": 0.9245680570602417,
69
+ "learning_rate": 1.9730468750000003e-05,
70
+ "loss": 6.2092,
71
+ "mean_token_accuracy": 0.1463709220290184,
72
+ "num_tokens": 6483486.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.1560214529497806,
77
+ "grad_norm": 0.8324124813079834,
78
+ "learning_rate": 1.969140625e-05,
79
+ "loss": 6.0411,
80
+ "mean_token_accuracy": 0.15314992293715476,
81
+ "num_tokens": 7412558.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.17552413456850316,
86
+ "grad_norm": 0.920666515827179,
87
+ "learning_rate": 1.9652343750000003e-05,
88
+ "loss": 5.9319,
89
+ "mean_token_accuracy": 0.16162274666130544,
90
+ "num_tokens": 8332801.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.19502681618722575,
95
+ "grad_norm": 0.8294446468353271,
96
+ "learning_rate": 1.961328125e-05,
97
+ "loss": 5.8516,
98
+ "mean_token_accuracy": 0.16927699856460093,
99
+ "num_tokens": 9274826.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.21452949780594832,
104
+ "grad_norm": 0.7535350322723389,
105
+ "learning_rate": 1.9574218750000003e-05,
106
+ "loss": 5.7591,
107
+ "mean_token_accuracy": 0.17415257096290587,
108
+ "num_tokens": 10190661.0,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 0.2340321794246709,
113
+ "grad_norm": 0.9731621742248535,
114
+ "learning_rate": 1.953515625e-05,
115
+ "loss": 5.6905,
116
+ "mean_token_accuracy": 0.17712676227092744,
117
+ "num_tokens": 11113827.0,
118
+ "step": 120
119
+ },
120
+ {
121
+ "epoch": 0.25353486104339346,
122
+ "grad_norm": 1.5154342651367188,
123
+ "learning_rate": 1.9496093750000003e-05,
124
+ "loss": 5.6104,
125
+ "mean_token_accuracy": 0.1849387872964144,
126
+ "num_tokens": 12034156.0,
127
+ "step": 130
128
+ },
129
+ {
130
+ "epoch": 0.27303754266211605,
131
+ "grad_norm": 0.7294800877571106,
132
+ "learning_rate": 1.9457031250000004e-05,
133
+ "loss": 5.5413,
134
+ "mean_token_accuracy": 0.19220538288354874,
135
+ "num_tokens": 12950143.0,
136
+ "step": 140
137
+ },
138
+ {
139
+ "epoch": 0.2925402242808386,
140
+ "grad_norm": 0.6792197823524475,
141
+ "learning_rate": 1.9417968750000002e-05,
142
+ "loss": 5.5046,
143
+ "mean_token_accuracy": 0.1948456909507513,
144
+ "num_tokens": 13864130.0,
145
+ "step": 150
146
+ },
147
+ {
148
+ "epoch": 0.3120429058995612,
149
+ "grad_norm": 0.6913173794746399,
150
+ "learning_rate": 1.9378906250000004e-05,
151
+ "loss": 5.4398,
152
+ "mean_token_accuracy": 0.19714849777519702,
153
+ "num_tokens": 14795220.0,
154
+ "step": 160
155
+ },
156
+ {
157
+ "epoch": 0.3315455875182838,
158
+ "grad_norm": 0.6437414288520813,
159
+ "learning_rate": 1.9339843750000002e-05,
160
+ "loss": 5.4273,
161
+ "mean_token_accuracy": 0.19942218959331512,
162
+ "num_tokens": 15722954.0,
163
+ "step": 170
164
+ },
165
+ {
166
+ "epoch": 0.3510482691370063,
167
+ "grad_norm": 2.1863138675689697,
168
+ "learning_rate": 1.9300781250000004e-05,
169
+ "loss": 5.3666,
170
+ "mean_token_accuracy": 0.2016600638628006,
171
+ "num_tokens": 16637414.0,
172
+ "step": 180
173
+ },
174
+ {
175
+ "epoch": 0.3705509507557289,
176
+ "grad_norm": 0.8689864873886108,
177
+ "learning_rate": 1.9261718750000002e-05,
178
+ "loss": 5.3421,
179
+ "mean_token_accuracy": 0.20273192636668683,
180
+ "num_tokens": 17569476.0,
181
+ "step": 190
182
+ },
183
+ {
184
+ "epoch": 0.3900536323744515,
185
+ "grad_norm": 1.2784861326217651,
186
+ "learning_rate": 1.9222656250000003e-05,
187
+ "loss": 5.3323,
188
+ "mean_token_accuracy": 0.20496859662234784,
189
+ "num_tokens": 18498067.0,
190
+ "step": 200
191
+ },
192
+ {
193
+ "epoch": 0.40955631399317405,
194
+ "grad_norm": 0.6330307722091675,
195
+ "learning_rate": 1.9183593750000002e-05,
196
+ "loss": 5.2827,
197
+ "mean_token_accuracy": 0.2135307714343071,
198
+ "num_tokens": 19416964.0,
199
+ "step": 210
200
+ },
201
+ {
202
+ "epoch": 0.42905899561189664,
203
+ "grad_norm": 1.1162034273147583,
204
+ "learning_rate": 1.9144531250000003e-05,
205
+ "loss": 5.2121,
206
+ "mean_token_accuracy": 0.21816504523158073,
207
+ "num_tokens": 20341056.0,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 0.4485616772306192,
212
+ "grad_norm": 0.6339605450630188,
213
+ "learning_rate": 1.910546875e-05,
214
+ "loss": 5.2187,
215
+ "mean_token_accuracy": 0.21803640052676201,
216
+ "num_tokens": 21267082.0,
217
+ "step": 230
218
+ },
219
+ {
220
+ "epoch": 0.4680643588493418,
221
+ "grad_norm": 0.607659637928009,
222
+ "learning_rate": 1.9066406250000003e-05,
223
+ "loss": 5.1826,
224
+ "mean_token_accuracy": 0.22011552266776563,
225
+ "num_tokens": 22195442.0,
226
+ "step": 240
227
+ },
228
+ {
229
+ "epoch": 0.48756704046806437,
230
+ "grad_norm": 0.5029736757278442,
231
+ "learning_rate": 1.902734375e-05,
232
+ "loss": 5.1424,
233
+ "mean_token_accuracy": 0.22285537868738176,
234
+ "num_tokens": 23144077.0,
235
+ "step": 250
236
+ },
237
+ {
238
+ "epoch": 0.5070697220867869,
239
+ "grad_norm": 0.9591688513755798,
240
+ "learning_rate": 1.8988281250000003e-05,
241
+ "loss": 5.0913,
242
+ "mean_token_accuracy": 0.22736062072217464,
243
+ "num_tokens": 24068643.0,
244
+ "step": 260
245
+ },
246
+ {
247
+ "epoch": 0.5265724037055095,
248
+ "grad_norm": 0.5418295860290527,
249
+ "learning_rate": 1.894921875e-05,
250
+ "loss": 5.1015,
251
+ "mean_token_accuracy": 0.22693138755857944,
252
+ "num_tokens": 24996534.0,
253
+ "step": 270
254
+ },
255
+ {
256
+ "epoch": 0.5460750853242321,
257
+ "grad_norm": 0.5258099436759949,
258
+ "learning_rate": 1.8910156250000003e-05,
259
+ "loss": 5.0746,
260
+ "mean_token_accuracy": 0.23026154786348343,
261
+ "num_tokens": 25920902.0,
262
+ "step": 280
263
+ },
264
+ {
265
+ "epoch": 0.5655777669429547,
266
+ "grad_norm": 0.5592005252838135,
267
+ "learning_rate": 1.887109375e-05,
268
+ "loss": 5.0566,
269
+ "mean_token_accuracy": 0.23160071447491645,
270
+ "num_tokens": 26845104.0,
271
+ "step": 290
272
+ },
273
+ {
274
+ "epoch": 0.5850804485616772,
275
+ "grad_norm": 0.5427853465080261,
276
+ "learning_rate": 1.8832031250000002e-05,
277
+ "loss": 5.0565,
278
+ "mean_token_accuracy": 0.2316149313002825,
279
+ "num_tokens": 27782058.0,
280
+ "step": 300
281
+ },
282
+ {
283
+ "epoch": 0.6045831301803998,
284
+ "grad_norm": 0.9386640191078186,
285
+ "learning_rate": 1.8792968750000004e-05,
286
+ "loss": 5.0003,
287
+ "mean_token_accuracy": 0.2349798556417227,
288
+ "num_tokens": 28707140.0,
289
+ "step": 310
290
+ },
291
+ {
292
+ "epoch": 0.6240858117991224,
293
+ "grad_norm": 0.8473969101905823,
294
+ "learning_rate": 1.8753906250000002e-05,
295
+ "loss": 4.9954,
296
+ "mean_token_accuracy": 0.23647231683135034,
297
+ "num_tokens": 29637908.0,
298
+ "step": 320
299
+ },
300
+ {
301
+ "epoch": 0.643588493417845,
302
+ "grad_norm": 0.5143874883651733,
303
+ "learning_rate": 1.8714843750000004e-05,
304
+ "loss": 4.9881,
305
+ "mean_token_accuracy": 0.23599626012146474,
306
+ "num_tokens": 30559254.0,
307
+ "step": 330
308
+ },
309
+ {
310
+ "epoch": 0.6630911750365676,
311
+ "grad_norm": 0.4523729979991913,
312
+ "learning_rate": 1.8675781250000002e-05,
313
+ "loss": 4.9695,
314
+ "mean_token_accuracy": 0.23746853992342948,
315
+ "num_tokens": 31487152.0,
316
+ "step": 340
317
+ },
318
+ {
319
+ "epoch": 0.6825938566552902,
320
+ "grad_norm": 0.5573343634605408,
321
+ "learning_rate": 1.8636718750000004e-05,
322
+ "loss": 4.9525,
323
+ "mean_token_accuracy": 0.23925678990781307,
324
+ "num_tokens": 32396911.0,
325
+ "step": 350
326
+ },
327
+ {
328
+ "epoch": 0.7020965382740126,
329
+ "grad_norm": 1.2790151834487915,
330
+ "learning_rate": 1.8597656250000002e-05,
331
+ "loss": 4.9338,
332
+ "mean_token_accuracy": 0.24103106185793877,
333
+ "num_tokens": 33325126.0,
334
+ "step": 360
335
+ },
336
+ {
337
+ "epoch": 0.7215992198927352,
338
+ "grad_norm": 0.8714343309402466,
339
+ "learning_rate": 1.8558593750000003e-05,
340
+ "loss": 4.9252,
341
+ "mean_token_accuracy": 0.24138498678803444,
342
+ "num_tokens": 34262332.0,
343
+ "step": 370
344
+ },
345
+ {
346
+ "epoch": 0.7411019015114578,
347
+ "grad_norm": 0.5251726508140564,
348
+ "learning_rate": 1.851953125e-05,
349
+ "loss": 4.8883,
350
+ "mean_token_accuracy": 0.24456401653587817,
351
+ "num_tokens": 35181080.0,
352
+ "step": 380
353
+ },
354
+ {
355
+ "epoch": 0.7606045831301804,
356
+ "grad_norm": 0.46523743867874146,
357
+ "learning_rate": 1.8480468750000003e-05,
358
+ "loss": 4.8887,
359
+ "mean_token_accuracy": 0.24552332125604154,
360
+ "num_tokens": 36105038.0,
361
+ "step": 390
362
+ },
363
+ {
364
+ "epoch": 0.780107264748903,
365
+ "grad_norm": 0.5080934166908264,
366
+ "learning_rate": 1.844140625e-05,
367
+ "loss": 4.8659,
368
+ "mean_token_accuracy": 0.245796899497509,
369
+ "num_tokens": 37036998.0,
370
+ "step": 400
371
+ },
372
+ {
373
+ "epoch": 0.7996099463676255,
374
+ "grad_norm": 1.117693543434143,
375
+ "learning_rate": 1.8402343750000003e-05,
376
+ "loss": 4.8681,
377
+ "mean_token_accuracy": 0.2463846940547228,
378
+ "num_tokens": 37968683.0,
379
+ "step": 410
380
+ },
381
+ {
382
+ "epoch": 0.8191126279863481,
383
+ "grad_norm": 0.5299689769744873,
384
+ "learning_rate": 1.836328125e-05,
385
+ "loss": 4.8305,
386
+ "mean_token_accuracy": 0.24968006946146487,
387
+ "num_tokens": 38888752.0,
388
+ "step": 420
389
+ },
390
+ {
391
+ "epoch": 0.8386153096050707,
392
+ "grad_norm": 0.8458639979362488,
393
+ "learning_rate": 1.8324218750000003e-05,
394
+ "loss": 4.8279,
395
+ "mean_token_accuracy": 0.25057865455746653,
396
+ "num_tokens": 39823608.0,
397
+ "step": 430
398
+ },
399
+ {
400
+ "epoch": 0.8581179912237933,
401
+ "grad_norm": 0.5429758429527283,
402
+ "learning_rate": 1.828515625e-05,
403
+ "loss": 4.8243,
404
+ "mean_token_accuracy": 0.2514403607696295,
405
+ "num_tokens": 40748246.0,
406
+ "step": 440
407
+ },
408
+ {
409
+ "epoch": 0.8776206728425159,
410
+ "grad_norm": 0.47386595606803894,
411
+ "learning_rate": 1.8246093750000003e-05,
412
+ "loss": 4.7881,
413
+ "mean_token_accuracy": 0.25412631034851074,
414
+ "num_tokens": 41662563.0,
415
+ "step": 450
416
+ },
417
+ {
418
+ "epoch": 0.8971233544612384,
419
+ "grad_norm": 0.41789767146110535,
420
+ "learning_rate": 1.820703125e-05,
421
+ "loss": 4.7944,
422
+ "mean_token_accuracy": 0.25327568165957925,
423
+ "num_tokens": 42588923.0,
424
+ "step": 460
425
+ },
426
+ {
427
+ "epoch": 0.916626036079961,
428
+ "grad_norm": 0.43711453676223755,
429
+ "learning_rate": 1.8167968750000002e-05,
430
+ "loss": 4.7758,
431
+ "mean_token_accuracy": 0.2548953540623188,
432
+ "num_tokens": 43515886.0,
433
+ "step": 470
434
+ },
435
+ {
436
+ "epoch": 0.9361287176986836,
437
+ "grad_norm": 0.6433466076850891,
438
+ "learning_rate": 1.8128906250000004e-05,
439
+ "loss": 4.7632,
440
+ "mean_token_accuracy": 0.2562540594488382,
441
+ "num_tokens": 44446196.0,
442
+ "step": 480
443
+ },
444
+ {
445
+ "epoch": 0.9556313993174061,
446
+ "grad_norm": 0.7580122351646423,
447
+ "learning_rate": 1.8089843750000002e-05,
448
+ "loss": 4.7559,
449
+ "mean_token_accuracy": 0.25687045492231847,
450
+ "num_tokens": 45389436.0,
451
+ "step": 490
452
+ },
453
+ {
454
+ "epoch": 0.9751340809361287,
455
+ "grad_norm": 0.9800453782081604,
456
+ "learning_rate": 1.8050781250000004e-05,
457
+ "loss": 4.7711,
458
+ "mean_token_accuracy": 0.2573121260851622,
459
+ "num_tokens": 46313035.0,
460
+ "step": 500
461
+ },
462
+ {
463
+ "epoch": 0.9946367625548513,
464
+ "grad_norm": 0.636842668056488,
465
+ "learning_rate": 1.8011718750000002e-05,
466
+ "loss": 4.7693,
467
+ "mean_token_accuracy": 0.2563398856669664,
468
+ "num_tokens": 47231018.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 1.0156021452949782,
473
+ "grad_norm": 0.9280142188072205,
474
+ "learning_rate": 1.7972656250000004e-05,
475
+ "loss": 5.2112,
476
+ "mean_token_accuracy": 0.25861204106633257,
477
+ "num_tokens": 48181298.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 1.0351048269137006,
482
+ "grad_norm": 0.5360897183418274,
483
+ "learning_rate": 1.7933593750000002e-05,
484
+ "loss": 4.7274,
485
+ "mean_token_accuracy": 0.26002744026482105,
486
+ "num_tokens": 49105264.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 1.0546075085324231,
491
+ "grad_norm": 0.5811850428581238,
492
+ "learning_rate": 1.7894531250000003e-05,
493
+ "loss": 4.7052,
494
+ "mean_token_accuracy": 0.2608158510178328,
495
+ "num_tokens": 50033001.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 1.0741101901511458,
500
+ "grad_norm": 0.9147205352783203,
501
+ "learning_rate": 1.785546875e-05,
502
+ "loss": 4.6854,
503
+ "mean_token_accuracy": 0.26275911666452884,
504
+ "num_tokens": 50954465.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 1.0936128717698683,
509
+ "grad_norm": 0.46637555956840515,
510
+ "learning_rate": 1.7816406250000003e-05,
511
+ "loss": 4.6969,
512
+ "mean_token_accuracy": 0.26292436122894286,
513
+ "num_tokens": 51876402.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 1.113115553388591,
518
+ "grad_norm": 1.1488078832626343,
519
+ "learning_rate": 1.777734375e-05,
520
+ "loss": 4.6934,
521
+ "mean_token_accuracy": 0.26502432897686956,
522
+ "num_tokens": 52789774.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 1.1326182350073135,
527
+ "grad_norm": 0.42444393038749695,
528
+ "learning_rate": 1.7738281250000003e-05,
529
+ "loss": 4.6616,
530
+ "mean_token_accuracy": 0.2661720596253872,
531
+ "num_tokens": 53700514.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 1.152120916626036,
536
+ "grad_norm": 0.46090322732925415,
537
+ "learning_rate": 1.769921875e-05,
538
+ "loss": 4.6677,
539
+ "mean_token_accuracy": 0.2653431937098503,
540
+ "num_tokens": 54637906.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 1.1716235982447587,
545
+ "grad_norm": 1.0310624837875366,
546
+ "learning_rate": 1.7660156250000003e-05,
547
+ "loss": 4.6441,
548
+ "mean_token_accuracy": 0.26668640449643133,
549
+ "num_tokens": 55566104.0,
550
+ "step": 600
551
+ },
552
+ {
553
+ "epoch": 1.1911262798634812,
554
+ "grad_norm": 0.5190272927284241,
555
+ "learning_rate": 1.762109375e-05,
556
+ "loss": 4.6388,
557
+ "mean_token_accuracy": 0.26863499656319617,
558
+ "num_tokens": 56475233.0,
559
+ "step": 610
560
+ },
561
+ {
562
+ "epoch": 1.210628961482204,
563
+ "grad_norm": 0.4435961842536926,
564
+ "learning_rate": 1.7582031250000003e-05,
565
+ "loss": 4.6344,
566
+ "mean_token_accuracy": 0.27009780779480935,
567
+ "num_tokens": 57393208.0,
568
+ "step": 620
569
+ },
570
+ {
571
+ "epoch": 1.2301316431009264,
572
+ "grad_norm": 1.5555769205093384,
573
+ "learning_rate": 1.754296875e-05,
574
+ "loss": 4.6239,
575
+ "mean_token_accuracy": 0.26839635893702507,
576
+ "num_tokens": 58320792.0,
577
+ "step": 630
578
+ },
579
+ {
580
+ "epoch": 1.2496343247196489,
581
+ "grad_norm": 0.45114317536354065,
582
+ "learning_rate": 1.7503906250000002e-05,
583
+ "loss": 4.6231,
584
+ "mean_token_accuracy": 0.26895947232842443,
585
+ "num_tokens": 59247413.0,
586
+ "step": 640
587
+ },
588
+ {
589
+ "epoch": 1.2691370063383716,
590
+ "grad_norm": 0.5050057768821716,
591
+ "learning_rate": 1.7464843750000004e-05,
592
+ "loss": 4.6231,
593
+ "mean_token_accuracy": 0.2689752779901028,
594
+ "num_tokens": 60180529.0,
595
+ "step": 650
596
+ },
597
+ {
598
+ "epoch": 1.288639687957094,
599
+ "grad_norm": 0.4494447708129883,
600
+ "learning_rate": 1.7425781250000002e-05,
601
+ "loss": 4.5939,
602
+ "mean_token_accuracy": 0.2701444610953331,
603
+ "num_tokens": 61116823.0,
604
+ "step": 660
605
+ },
606
+ {
607
+ "epoch": 1.3081423695758168,
608
+ "grad_norm": 0.5126622915267944,
609
+ "learning_rate": 1.7386718750000004e-05,
610
+ "loss": 4.5905,
611
+ "mean_token_accuracy": 0.27250412106513977,
612
+ "num_tokens": 62036873.0,
613
+ "step": 670
614
+ },
615
+ {
616
+ "epoch": 1.3276450511945392,
617
+ "grad_norm": 0.4027528166770935,
618
+ "learning_rate": 1.7347656250000002e-05,
619
+ "loss": 4.5702,
620
+ "mean_token_accuracy": 0.2741738684475422,
621
+ "num_tokens": 62955213.0,
622
+ "step": 680
623
+ },
624
+ {
625
+ "epoch": 1.3471477328132617,
626
+ "grad_norm": 0.42226913571357727,
627
+ "learning_rate": 1.7308593750000004e-05,
628
+ "loss": 4.6027,
629
+ "mean_token_accuracy": 0.2714010961353779,
630
+ "num_tokens": 63879282.0,
631
+ "step": 690
632
+ },
633
+ {
634
+ "epoch": 1.3666504144319844,
635
+ "grad_norm": 0.7456600069999695,
636
+ "learning_rate": 1.7269531250000002e-05,
637
+ "loss": 4.6076,
638
+ "mean_token_accuracy": 0.27038322016596794,
639
+ "num_tokens": 64813842.0,
640
+ "step": 700
641
+ },
642
+ {
643
+ "epoch": 1.386153096050707,
644
+ "grad_norm": 0.4936697483062744,
645
+ "learning_rate": 1.7230468750000003e-05,
646
+ "loss": 4.5808,
647
+ "mean_token_accuracy": 0.27259208634495735,
648
+ "num_tokens": 65750164.0,
649
+ "step": 710
650
+ },
651
+ {
652
+ "epoch": 1.4056557776694296,
653
+ "grad_norm": 0.40352705121040344,
654
+ "learning_rate": 1.719140625e-05,
655
+ "loss": 4.5687,
656
+ "mean_token_accuracy": 0.2735755704343319,
657
+ "num_tokens": 66671414.0,
658
+ "step": 720
659
+ },
660
+ {
661
+ "epoch": 1.425158459288152,
662
+ "grad_norm": 0.5929037928581238,
663
+ "learning_rate": 1.7152343750000003e-05,
664
+ "loss": 4.5549,
665
+ "mean_token_accuracy": 0.27455407530069353,
666
+ "num_tokens": 67601565.0,
667
+ "step": 730
668
+ },
669
+ {
670
+ "epoch": 1.4446611409068746,
671
+ "grad_norm": 2.1730380058288574,
672
+ "learning_rate": 1.711328125e-05,
673
+ "loss": 4.5471,
674
+ "mean_token_accuracy": 0.2747652716934681,
675
+ "num_tokens": 68543369.0,
676
+ "step": 740
677
+ },
678
+ {
679
+ "epoch": 1.4641638225255973,
680
+ "grad_norm": 0.45310553908348083,
681
+ "learning_rate": 1.7074218750000003e-05,
682
+ "loss": 4.5379,
683
+ "mean_token_accuracy": 0.2759984292089939,
684
+ "num_tokens": 69475220.0,
685
+ "step": 750
686
+ },
687
+ {
688
+ "epoch": 1.4836665041443198,
689
+ "grad_norm": 0.44060948491096497,
690
+ "learning_rate": 1.703515625e-05,
691
+ "loss": 4.5207,
692
+ "mean_token_accuracy": 0.2781739257276058,
693
+ "num_tokens": 70397859.0,
694
+ "step": 760
695
+ },
696
+ {
697
+ "epoch": 1.5031691857630425,
698
+ "grad_norm": 0.9775242209434509,
699
+ "learning_rate": 1.6996093750000003e-05,
700
+ "loss": 4.5283,
701
+ "mean_token_accuracy": 0.27766570150852204,
702
+ "num_tokens": 71329289.0,
703
+ "step": 770
704
+ },
705
+ {
706
+ "epoch": 1.522671867381765,
707
+ "grad_norm": 0.476166695356369,
708
+ "learning_rate": 1.695703125e-05,
709
+ "loss": 4.5227,
710
+ "mean_token_accuracy": 0.2781515374779701,
711
+ "num_tokens": 72251787.0,
712
+ "step": 780
713
+ },
714
+ {
715
+ "epoch": 1.5421745490004874,
716
+ "grad_norm": 0.7697413563728333,
717
+ "learning_rate": 1.6917968750000003e-05,
718
+ "loss": 4.5089,
719
+ "mean_token_accuracy": 0.2777483291924,
720
+ "num_tokens": 73184759.0,
721
+ "step": 790
722
+ },
723
+ {
724
+ "epoch": 1.5616772306192102,
725
+ "grad_norm": 0.39053142070770264,
726
+ "learning_rate": 1.687890625e-05,
727
+ "loss": 4.5224,
728
+ "mean_token_accuracy": 0.2780651919543743,
729
+ "num_tokens": 74104519.0,
730
+ "step": 800
731
+ },
732
+ {
733
+ "epoch": 1.5811799122379329,
734
+ "grad_norm": 0.5136573910713196,
735
+ "learning_rate": 1.6839843750000002e-05,
736
+ "loss": 4.5168,
737
+ "mean_token_accuracy": 0.27778707146644593,
738
+ "num_tokens": 75041132.0,
739
+ "step": 810
740
+ },
741
+ {
742
+ "epoch": 1.6006825938566553,
743
+ "grad_norm": 0.4006953239440918,
744
+ "learning_rate": 1.6800781250000004e-05,
745
+ "loss": 4.4969,
746
+ "mean_token_accuracy": 0.2798936806619167,
747
+ "num_tokens": 75957080.0,
748
+ "step": 820
749
+ },
750
+ {
751
+ "epoch": 1.6201852754753778,
752
+ "grad_norm": 0.8261349201202393,
753
+ "learning_rate": 1.6761718750000002e-05,
754
+ "loss": 4.503,
755
+ "mean_token_accuracy": 0.27909068912267687,
756
+ "num_tokens": 76885767.0,
757
+ "step": 830
758
+ },
759
+ {
760
+ "epoch": 1.6396879570941003,
761
+ "grad_norm": 0.6244539022445679,
762
+ "learning_rate": 1.6722656250000004e-05,
763
+ "loss": 4.4941,
764
+ "mean_token_accuracy": 0.27921902686357497,
765
+ "num_tokens": 77828819.0,
766
+ "step": 840
767
+ },
768
+ {
769
+ "epoch": 1.659190638712823,
770
+ "grad_norm": 0.3824380040168762,
771
+ "learning_rate": 1.6683593750000002e-05,
772
+ "loss": 4.5036,
773
+ "mean_token_accuracy": 0.2791468746960163,
774
+ "num_tokens": 78767094.0,
775
+ "step": 850
776
+ },
777
+ {
778
+ "epoch": 1.6786933203315457,
779
+ "grad_norm": 0.41238024830818176,
780
+ "learning_rate": 1.6644531250000004e-05,
781
+ "loss": 4.498,
782
+ "mean_token_accuracy": 0.28029546365141866,
783
+ "num_tokens": 79699908.0,
784
+ "step": 860
785
+ },
786
+ {
787
+ "epoch": 1.6981960019502682,
788
+ "grad_norm": 0.4335004389286041,
789
+ "learning_rate": 1.6605468750000002e-05,
790
+ "loss": 4.4684,
791
+ "mean_token_accuracy": 0.2824758395552635,
792
+ "num_tokens": 80618966.0,
793
+ "step": 870
794
+ },
795
+ {
796
+ "epoch": 1.7176986835689907,
797
+ "grad_norm": 0.48968249559402466,
798
+ "learning_rate": 1.6566406250000003e-05,
799
+ "loss": 4.4526,
800
+ "mean_token_accuracy": 0.28368064761161804,
801
+ "num_tokens": 81545205.0,
802
+ "step": 880
803
+ },
804
+ {
805
+ "epoch": 1.7372013651877132,
806
+ "grad_norm": 0.41890937089920044,
807
+ "learning_rate": 1.652734375e-05,
808
+ "loss": 4.4734,
809
+ "mean_token_accuracy": 0.2822652608156204,
810
+ "num_tokens": 82463087.0,
811
+ "step": 890
812
+ },
813
+ {
814
+ "epoch": 1.7567040468064359,
815
+ "grad_norm": 0.40375128388404846,
816
+ "learning_rate": 1.6488281250000003e-05,
817
+ "loss": 4.4329,
818
+ "mean_token_accuracy": 0.2864795848727226,
819
+ "num_tokens": 83384315.0,
820
+ "step": 900
821
+ },
822
+ {
823
+ "epoch": 1.7762067284251586,
824
+ "grad_norm": 1.6861543655395508,
825
+ "learning_rate": 1.644921875e-05,
826
+ "loss": 4.4701,
827
+ "mean_token_accuracy": 0.2822112552821636,
828
+ "num_tokens": 84314435.0,
829
+ "step": 910
830
+ },
831
+ {
832
+ "epoch": 1.795709410043881,
833
+ "grad_norm": 0.42918047308921814,
834
+ "learning_rate": 1.6410156250000003e-05,
835
+ "loss": 4.4543,
836
+ "mean_token_accuracy": 0.282807744294405,
837
+ "num_tokens": 85248307.0,
838
+ "step": 920
839
+ },
840
+ {
841
+ "epoch": 1.8152120916626036,
842
+ "grad_norm": 0.38594943284988403,
843
+ "learning_rate": 1.637109375e-05,
844
+ "loss": 4.4509,
845
+ "mean_token_accuracy": 0.28484038934111594,
846
+ "num_tokens": 86168104.0,
847
+ "step": 930
848
+ },
849
+ {
850
+ "epoch": 1.834714773281326,
851
+ "grad_norm": 0.37766233086586,
852
+ "learning_rate": 1.6332031250000003e-05,
853
+ "loss": 4.4378,
854
+ "mean_token_accuracy": 0.28509455919265747,
855
+ "num_tokens": 87085294.0,
856
+ "step": 940
857
+ },
858
+ {
859
+ "epoch": 1.8542174549000487,
860
+ "grad_norm": 0.5309925079345703,
861
+ "learning_rate": 1.629296875e-05,
862
+ "loss": 4.45,
863
+ "mean_token_accuracy": 0.283291470259428,
864
+ "num_tokens": 88022698.0,
865
+ "step": 950
866
+ },
867
+ {
868
+ "epoch": 1.8737201365187715,
869
+ "grad_norm": 0.5312850475311279,
870
+ "learning_rate": 1.6253906250000002e-05,
871
+ "loss": 4.3849,
872
+ "mean_token_accuracy": 0.28923906683921813,
873
+ "num_tokens": 88947429.0,
874
+ "step": 960
875
+ },
876
+ {
877
+ "epoch": 1.893222818137494,
878
+ "grad_norm": 0.5679749250411987,
879
+ "learning_rate": 1.621484375e-05,
880
+ "loss": 4.3953,
881
+ "mean_token_accuracy": 0.28862822949886324,
882
+ "num_tokens": 89878787.0,
883
+ "step": 970
884
+ },
885
+ {
886
+ "epoch": 1.9127254997562164,
887
+ "grad_norm": 0.6506769061088562,
888
+ "learning_rate": 1.6175781250000002e-05,
889
+ "loss": 4.4121,
890
+ "mean_token_accuracy": 0.287344753742218,
891
+ "num_tokens": 90794282.0,
892
+ "step": 980
893
+ },
894
+ {
895
+ "epoch": 1.932228181374939,
896
+ "grad_norm": 0.5218345522880554,
897
+ "learning_rate": 1.6136718750000004e-05,
898
+ "loss": 4.422,
899
+ "mean_token_accuracy": 0.28664510771632196,
900
+ "num_tokens": 91720204.0,
901
+ "step": 990
902
+ },
903
+ {
904
+ "epoch": 1.9517308629936616,
905
+ "grad_norm": 0.38746026158332825,
906
+ "learning_rate": 1.6097656250000002e-05,
907
+ "loss": 4.3909,
908
+ "mean_token_accuracy": 0.2888357400894165,
909
+ "num_tokens": 92640826.0,
910
+ "step": 1000
911
+ },
912
+ {
913
+ "epoch": 1.9712335446123843,
914
+ "grad_norm": 0.4358104467391968,
915
+ "learning_rate": 1.6058593750000004e-05,
916
+ "loss": 4.4009,
917
+ "mean_token_accuracy": 0.28872263357043265,
918
+ "num_tokens": 93557645.0,
919
+ "step": 1010
920
+ },
921
+ {
922
+ "epoch": 1.9907362262311068,
923
+ "grad_norm": 0.399600088596344,
924
+ "learning_rate": 1.6019531250000002e-05,
925
+ "loss": 4.41,
926
+ "mean_token_accuracy": 0.28658533096313477,
927
+ "num_tokens": 94488514.0,
928
+ "step": 1020
929
+ },
930
+ {
931
+ "epoch": 2.0117016089712334,
932
+ "grad_norm": 0.4287355840206146,
933
+ "learning_rate": 1.5980468750000003e-05,
934
+ "loss": 4.8637,
935
+ "mean_token_accuracy": 0.2884528564243782,
936
+ "num_tokens": 95429559.0,
937
+ "step": 1030
938
+ },
939
+ {
940
+ "epoch": 2.0312042905899563,
941
+ "grad_norm": 0.43448057770729065,
942
+ "learning_rate": 1.594140625e-05,
943
+ "loss": 4.411,
944
+ "mean_token_accuracy": 0.28822447881102564,
945
+ "num_tokens": 96370969.0,
946
+ "step": 1040
947
+ },
948
+ {
949
+ "epoch": 2.050706972208679,
950
+ "grad_norm": 0.4509051442146301,
951
+ "learning_rate": 1.5902343750000003e-05,
952
+ "loss": 4.3998,
953
+ "mean_token_accuracy": 0.2875743143260479,
954
+ "num_tokens": 97297487.0,
955
+ "step": 1050
956
+ },
957
+ {
958
+ "epoch": 2.0702096538274013,
959
+ "grad_norm": 0.4603135585784912,
960
+ "learning_rate": 1.586328125e-05,
961
+ "loss": 4.3895,
962
+ "mean_token_accuracy": 0.28888514786958697,
963
+ "num_tokens": 98234116.0,
964
+ "step": 1060
965
+ },
966
+ {
967
+ "epoch": 2.089712335446124,
968
+ "grad_norm": 0.45260968804359436,
969
+ "learning_rate": 1.5824218750000003e-05,
970
+ "loss": 4.3827,
971
+ "mean_token_accuracy": 0.28953884318470957,
972
+ "num_tokens": 99158170.0,
973
+ "step": 1070
974
+ },
975
+ {
976
+ "epoch": 2.1092150170648463,
977
+ "grad_norm": 0.4549092650413513,
978
+ "learning_rate": 1.578515625e-05,
979
+ "loss": 4.3818,
980
+ "mean_token_accuracy": 0.2901176653802395,
981
+ "num_tokens": 100089007.0,
982
+ "step": 1080
983
+ },
984
+ {
985
+ "epoch": 2.128717698683569,
986
+ "grad_norm": 0.4202571213245392,
987
+ "learning_rate": 1.5746093750000003e-05,
988
+ "loss": 4.3617,
989
+ "mean_token_accuracy": 0.2919108562171459,
990
+ "num_tokens": 101002323.0,
991
+ "step": 1090
992
+ },
993
+ {
994
+ "epoch": 2.1482203803022917,
995
+ "grad_norm": 0.5119932889938354,
996
+ "learning_rate": 1.570703125e-05,
997
+ "loss": 4.365,
998
+ "mean_token_accuracy": 0.2918895035982132,
999
+ "num_tokens": 101915323.0,
1000
+ "step": 1100
1001
+ },
1002
+ {
1003
+ "epoch": 2.167723061921014,
1004
+ "grad_norm": 0.49400025606155396,
1005
+ "learning_rate": 1.5667968750000003e-05,
1006
+ "loss": 4.3662,
1007
+ "mean_token_accuracy": 0.29112903624773023,
1008
+ "num_tokens": 102848704.0,
1009
+ "step": 1110
1010
+ },
1011
+ {
1012
+ "epoch": 2.1872257435397366,
1013
+ "grad_norm": 9.812466621398926,
1014
+ "learning_rate": 1.562890625e-05,
1015
+ "loss": 4.3536,
1016
+ "mean_token_accuracy": 0.29283427745103835,
1017
+ "num_tokens": 103770127.0,
1018
+ "step": 1120
1019
+ },
1020
+ {
1021
+ "epoch": 2.206728425158459,
1022
+ "grad_norm": 0.6520562171936035,
1023
+ "learning_rate": 1.5589843750000002e-05,
1024
+ "loss": 4.3685,
1025
+ "mean_token_accuracy": 0.29140080511569977,
1026
+ "num_tokens": 104696965.0,
1027
+ "step": 1130
1028
+ },
1029
+ {
1030
+ "epoch": 2.226231106777182,
1031
+ "grad_norm": 0.3824687600135803,
1032
+ "learning_rate": 1.555078125e-05,
1033
+ "loss": 4.3617,
1034
+ "mean_token_accuracy": 0.29200059548020363,
1035
+ "num_tokens": 105627411.0,
1036
+ "step": 1140
1037
+ },
1038
+ {
1039
+ "epoch": 2.2457337883959045,
1040
+ "grad_norm": 0.40885069966316223,
1041
+ "learning_rate": 1.5511718750000002e-05,
1042
+ "loss": 4.3486,
1043
+ "mean_token_accuracy": 0.29261764511466026,
1044
+ "num_tokens": 106560394.0,
1045
+ "step": 1150
1046
+ },
1047
+ {
1048
+ "epoch": 2.265236470014627,
1049
+ "grad_norm": 0.5578988194465637,
1050
+ "learning_rate": 1.5472656250000004e-05,
1051
+ "loss": 4.3477,
1052
+ "mean_token_accuracy": 0.2928113825619221,
1053
+ "num_tokens": 107484781.0,
1054
+ "step": 1160
1055
+ },
1056
+ {
1057
+ "epoch": 2.2847391516333495,
1058
+ "grad_norm": 2.4490866661071777,
1059
+ "learning_rate": 1.5433593750000002e-05,
1060
+ "loss": 4.3497,
1061
+ "mean_token_accuracy": 0.2932279795408249,
1062
+ "num_tokens": 108403123.0,
1063
+ "step": 1170
1064
+ },
1065
+ {
1066
+ "epoch": 2.304241833252072,
1067
+ "grad_norm": 0.4807080030441284,
1068
+ "learning_rate": 1.5394531250000004e-05,
1069
+ "loss": 4.3279,
1070
+ "mean_token_accuracy": 0.2941134661436081,
1071
+ "num_tokens": 109334624.0,
1072
+ "step": 1180
1073
+ },
1074
+ {
1075
+ "epoch": 2.323744514870795,
1076
+ "grad_norm": 0.7090457677841187,
1077
+ "learning_rate": 1.5355468750000002e-05,
1078
+ "loss": 4.345,
1079
+ "mean_token_accuracy": 0.2933297656476498,
1080
+ "num_tokens": 110262463.0,
1081
+ "step": 1190
1082
+ },
1083
+ {
1084
+ "epoch": 2.3432471964895174,
1085
+ "grad_norm": 0.46787402033805847,
1086
+ "learning_rate": 1.5316406250000003e-05,
1087
+ "loss": 4.3362,
1088
+ "mean_token_accuracy": 0.2946368932723999,
1089
+ "num_tokens": 111180510.0,
1090
+ "step": 1200
1091
+ },
1092
+ {
1093
+ "epoch": 2.36274987810824,
1094
+ "grad_norm": 0.45472535490989685,
1095
+ "learning_rate": 1.527734375e-05,
1096
+ "loss": 4.32,
1097
+ "mean_token_accuracy": 0.2949611395597458,
1098
+ "num_tokens": 112102026.0,
1099
+ "step": 1210
1100
+ },
1101
+ {
1102
+ "epoch": 2.3822525597269624,
1103
+ "grad_norm": 0.5668436288833618,
1104
+ "learning_rate": 1.5238281250000002e-05,
1105
+ "loss": 4.3424,
1106
+ "mean_token_accuracy": 0.2933102063834667,
1107
+ "num_tokens": 113030329.0,
1108
+ "step": 1220
1109
+ },
1110
+ {
1111
+ "epoch": 2.401755241345685,
1112
+ "grad_norm": 0.446575403213501,
1113
+ "learning_rate": 1.5199218750000001e-05,
1114
+ "loss": 4.3253,
1115
+ "mean_token_accuracy": 0.29592231959104537,
1116
+ "num_tokens": 113952583.0,
1117
+ "step": 1230
1118
+ },
1119
+ {
1120
+ "epoch": 2.421257922964408,
1121
+ "grad_norm": 0.47586357593536377,
1122
+ "learning_rate": 1.5160156250000001e-05,
1123
+ "loss": 4.3155,
1124
+ "mean_token_accuracy": 0.2961124524474144,
1125
+ "num_tokens": 114874886.0,
1126
+ "step": 1240
1127
+ },
1128
+ {
1129
+ "epoch": 2.4407606045831303,
1130
+ "grad_norm": 0.5272416472434998,
1131
+ "learning_rate": 1.5121093750000003e-05,
1132
+ "loss": 4.3256,
1133
+ "mean_token_accuracy": 0.29535831734538076,
1134
+ "num_tokens": 115809893.0,
1135
+ "step": 1250
1136
+ },
1137
+ {
1138
+ "epoch": 2.4602632862018528,
1139
+ "grad_norm": 0.5159743428230286,
1140
+ "learning_rate": 1.5082031250000003e-05,
1141
+ "loss": 4.3045,
1142
+ "mean_token_accuracy": 0.2964573077857494,
1143
+ "num_tokens": 116738350.0,
1144
+ "step": 1260
1145
+ },
1146
+ {
1147
+ "epoch": 2.4797659678205752,
1148
+ "grad_norm": 0.3612087070941925,
1149
+ "learning_rate": 1.5042968750000003e-05,
1150
+ "loss": 4.3171,
1151
+ "mean_token_accuracy": 0.296286004781723,
1152
+ "num_tokens": 117673838.0,
1153
+ "step": 1270
1154
+ },
1155
+ {
1156
+ "epoch": 2.4992686494392977,
1157
+ "grad_norm": 1.1809757947921753,
1158
+ "learning_rate": 1.5003906250000003e-05,
1159
+ "loss": 4.3191,
1160
+ "mean_token_accuracy": 0.2969906762242317,
1161
+ "num_tokens": 118603195.0,
1162
+ "step": 1280
1163
+ },
1164
+ {
1165
+ "epoch": 2.51877133105802,
1166
+ "grad_norm": 0.6246888041496277,
1167
+ "learning_rate": 1.4964843750000002e-05,
1168
+ "loss": 4.3009,
1169
+ "mean_token_accuracy": 0.2975566402077675,
1170
+ "num_tokens": 119519063.0,
1171
+ "step": 1290
1172
+ },
1173
+ {
1174
+ "epoch": 2.538274012676743,
1175
+ "grad_norm": 0.8195675611495972,
1176
+ "learning_rate": 1.4925781250000002e-05,
1177
+ "loss": 4.3006,
1178
+ "mean_token_accuracy": 0.297472283244133,
1179
+ "num_tokens": 120445030.0,
1180
+ "step": 1300
1181
+ },
1182
+ {
1183
+ "epoch": 2.5577766942954656,
1184
+ "grad_norm": 0.4961223602294922,
1185
+ "learning_rate": 1.4886718750000002e-05,
1186
+ "loss": 4.2969,
1187
+ "mean_token_accuracy": 0.29855757504701613,
1188
+ "num_tokens": 121369449.0,
1189
+ "step": 1310
1190
+ },
1191
+ {
1192
+ "epoch": 2.577279375914188,
1193
+ "grad_norm": 0.5146915912628174,
1194
+ "learning_rate": 1.4847656250000002e-05,
1195
+ "loss": 4.2858,
1196
+ "mean_token_accuracy": 0.2985923945903778,
1197
+ "num_tokens": 122298443.0,
1198
+ "step": 1320
1199
+ },
1200
+ {
1201
+ "epoch": 2.596782057532911,
1202
+ "grad_norm": 0.6109800934791565,
1203
+ "learning_rate": 1.4808593750000002e-05,
1204
+ "loss": 4.2935,
1205
+ "mean_token_accuracy": 0.29859942123293876,
1206
+ "num_tokens": 123233917.0,
1207
+ "step": 1330
1208
+ },
1209
+ {
1210
+ "epoch": 2.6162847391516335,
1211
+ "grad_norm": 0.40669572353363037,
1212
+ "learning_rate": 1.4769531250000002e-05,
1213
+ "loss": 4.2888,
1214
+ "mean_token_accuracy": 0.29900421276688577,
1215
+ "num_tokens": 124161805.0,
1216
+ "step": 1340
1217
+ },
1218
+ {
1219
+ "epoch": 2.635787420770356,
1220
+ "grad_norm": 1.3442695140838623,
1221
+ "learning_rate": 1.4730468750000002e-05,
1222
+ "loss": 4.2757,
1223
+ "mean_token_accuracy": 0.2997878722846508,
1224
+ "num_tokens": 125085090.0,
1225
+ "step": 1350
1226
+ },
1227
+ {
1228
+ "epoch": 2.6552901023890785,
1229
+ "grad_norm": 0.5308565497398376,
1230
+ "learning_rate": 1.4691406250000002e-05,
1231
+ "loss": 4.2859,
1232
+ "mean_token_accuracy": 0.29943727552890775,
1233
+ "num_tokens": 126012123.0,
1234
+ "step": 1360
1235
+ },
1236
+ {
1237
+ "epoch": 2.674792784007801,
1238
+ "grad_norm": 0.5062427520751953,
1239
+ "learning_rate": 1.4652343750000002e-05,
1240
+ "loss": 4.2803,
1241
+ "mean_token_accuracy": 0.2989941954612732,
1242
+ "num_tokens": 126935848.0,
1243
+ "step": 1370
1244
+ },
1245
+ {
1246
+ "epoch": 2.6942954656265234,
1247
+ "grad_norm": 0.41506361961364746,
1248
+ "learning_rate": 1.4613281250000002e-05,
1249
+ "loss": 4.2803,
1250
+ "mean_token_accuracy": 0.3002371557056904,
1251
+ "num_tokens": 127857739.0,
1252
+ "step": 1380
1253
+ },
1254
+ {
1255
+ "epoch": 2.7137981472452464,
1256
+ "grad_norm": 0.44968003034591675,
1257
+ "learning_rate": 1.4574218750000001e-05,
1258
+ "loss": 4.2577,
1259
+ "mean_token_accuracy": 0.30137933045625687,
1260
+ "num_tokens": 128776712.0,
1261
+ "step": 1390
1262
+ },
1263
+ {
1264
+ "epoch": 2.733300828863969,
1265
+ "grad_norm": 0.41343918442726135,
1266
+ "learning_rate": 1.4535156250000001e-05,
1267
+ "loss": 4.2617,
1268
+ "mean_token_accuracy": 0.3014510445296764,
1269
+ "num_tokens": 129707252.0,
1270
+ "step": 1400
1271
+ },
1272
+ {
1273
+ "epoch": 2.7528035104826913,
1274
+ "grad_norm": 0.7177313566207886,
1275
+ "learning_rate": 1.4496093750000001e-05,
1276
+ "loss": 4.2673,
1277
+ "mean_token_accuracy": 0.3009005382657051,
1278
+ "num_tokens": 130637763.0,
1279
+ "step": 1410
1280
+ },
1281
+ {
1282
+ "epoch": 2.772306192101414,
1283
+ "grad_norm": 1.7760525941848755,
1284
+ "learning_rate": 1.4457031250000003e-05,
1285
+ "loss": 4.2834,
1286
+ "mean_token_accuracy": 0.2999152898788452,
1287
+ "num_tokens": 131573493.0,
1288
+ "step": 1420
1289
+ },
1290
+ {
1291
+ "epoch": 2.7918088737201368,
1292
+ "grad_norm": 0.397335022687912,
1293
+ "learning_rate": 1.4417968750000003e-05,
1294
+ "loss": 4.2466,
1295
+ "mean_token_accuracy": 0.30402503311634066,
1296
+ "num_tokens": 132504591.0,
1297
+ "step": 1430
1298
+ },
1299
+ {
1300
+ "epoch": 2.8113115553388592,
1301
+ "grad_norm": 0.3949294686317444,
1302
+ "learning_rate": 1.4378906250000003e-05,
1303
+ "loss": 4.265,
1304
+ "mean_token_accuracy": 0.30095369294285773,
1305
+ "num_tokens": 133431529.0,
1306
+ "step": 1440
1307
+ },
1308
+ {
1309
+ "epoch": 2.8308142369575817,
1310
+ "grad_norm": 0.4513266682624817,
1311
+ "learning_rate": 1.4339843750000003e-05,
1312
+ "loss": 4.2622,
1313
+ "mean_token_accuracy": 0.30230883583426477,
1314
+ "num_tokens": 134351259.0,
1315
+ "step": 1450
1316
+ },
1317
+ {
1318
+ "epoch": 2.850316918576304,
1319
+ "grad_norm": 0.42385134100914,
1320
+ "learning_rate": 1.4300781250000002e-05,
1321
+ "loss": 4.2306,
1322
+ "mean_token_accuracy": 0.3048314802348614,
1323
+ "num_tokens": 135276683.0,
1324
+ "step": 1460
1325
+ },
1326
+ {
1327
+ "epoch": 2.8698196001950267,
1328
+ "grad_norm": 0.9934040307998657,
1329
+ "learning_rate": 1.4261718750000002e-05,
1330
+ "loss": 4.237,
1331
+ "mean_token_accuracy": 0.303445303440094,
1332
+ "num_tokens": 136210229.0,
1333
+ "step": 1470
1334
+ },
1335
+ {
1336
+ "epoch": 2.889322281813749,
1337
+ "grad_norm": 0.7958151698112488,
1338
+ "learning_rate": 1.4222656250000002e-05,
1339
+ "loss": 4.2307,
1340
+ "mean_token_accuracy": 0.305256237834692,
1341
+ "num_tokens": 137139018.0,
1342
+ "step": 1480
1343
+ },
1344
+ {
1345
+ "epoch": 2.908824963432472,
1346
+ "grad_norm": 0.5570520758628845,
1347
+ "learning_rate": 1.4183593750000002e-05,
1348
+ "loss": 4.2503,
1349
+ "mean_token_accuracy": 0.3026120513677597,
1350
+ "num_tokens": 138071332.0,
1351
+ "step": 1490
1352
+ },
1353
+ {
1354
+ "epoch": 2.9283276450511946,
1355
+ "grad_norm": 0.41619789600372314,
1356
+ "learning_rate": 1.4144531250000002e-05,
1357
+ "loss": 4.2189,
1358
+ "mean_token_accuracy": 0.3050854988396168,
1359
+ "num_tokens": 139000222.0,
1360
+ "step": 1500
1361
+ },
1362
+ {
1363
+ "epoch": 2.947830326669917,
1364
+ "grad_norm": 0.44383278489112854,
1365
+ "learning_rate": 1.4105468750000002e-05,
1366
+ "loss": 4.2415,
1367
+ "mean_token_accuracy": 0.3042911276221275,
1368
+ "num_tokens": 139917384.0,
1369
+ "step": 1510
1370
+ },
1371
+ {
1372
+ "epoch": 2.9673330082886396,
1373
+ "grad_norm": 0.8169625997543335,
1374
+ "learning_rate": 1.4066406250000002e-05,
1375
+ "loss": 4.217,
1376
+ "mean_token_accuracy": 0.3053439900279045,
1377
+ "num_tokens": 140832220.0,
1378
+ "step": 1520
1379
+ },
1380
+ {
1381
+ "epoch": 2.9868356899073625,
1382
+ "grad_norm": 0.3078594207763672,
1383
+ "learning_rate": 1.4027343750000002e-05,
1384
+ "loss": 4.2246,
1385
+ "mean_token_accuracy": 0.30591325610876086,
1386
+ "num_tokens": 141759431.0,
1387
+ "step": 1530
1388
+ },
1389
+ {
1390
+ "epoch": 3.007801072647489,
1391
+ "grad_norm": 0.46778416633605957,
1392
+ "learning_rate": 1.3988281250000002e-05,
1393
+ "loss": 4.63,
1394
+ "mean_token_accuracy": 0.3062905574717173,
1395
+ "num_tokens": 142686567.0,
1396
+ "step": 1540
1397
+ },
1398
+ {
1399
+ "epoch": 3.0273037542662116,
1400
+ "grad_norm": 0.38934579491615295,
1401
+ "learning_rate": 1.3949218750000002e-05,
1402
+ "loss": 4.2288,
1403
+ "mean_token_accuracy": 0.30576241165399554,
1404
+ "num_tokens": 143605470.0,
1405
+ "step": 1550
1406
+ },
1407
+ {
1408
+ "epoch": 3.046806435884934,
1409
+ "grad_norm": 0.4673042893409729,
1410
+ "learning_rate": 1.3910156250000001e-05,
1411
+ "loss": 4.2274,
1412
+ "mean_token_accuracy": 0.30559116452932356,
1413
+ "num_tokens": 144523275.0,
1414
+ "step": 1560
1415
+ },
1416
+ {
1417
+ "epoch": 3.066309117503657,
1418
+ "grad_norm": 0.39577770233154297,
1419
+ "learning_rate": 1.3871093750000001e-05,
1420
+ "loss": 4.2425,
1421
+ "mean_token_accuracy": 0.30391779616475106,
1422
+ "num_tokens": 145444439.0,
1423
+ "step": 1570
1424
+ },
1425
+ {
1426
+ "epoch": 3.0858117991223795,
1427
+ "grad_norm": 0.3524993658065796,
1428
+ "learning_rate": 1.3832031250000001e-05,
1429
+ "loss": 4.2118,
1430
+ "mean_token_accuracy": 0.3058738835155964,
1431
+ "num_tokens": 146378772.0,
1432
+ "step": 1580
1433
+ },
1434
+ {
1435
+ "epoch": 3.105314480741102,
1436
+ "grad_norm": 0.5424984097480774,
1437
+ "learning_rate": 1.3792968750000003e-05,
1438
+ "loss": 4.2163,
1439
+ "mean_token_accuracy": 0.3056952103972435,
1440
+ "num_tokens": 147320678.0,
1441
+ "step": 1590
1442
+ },
1443
+ {
1444
+ "epoch": 3.1248171623598244,
1445
+ "grad_norm": 0.35715430974960327,
1446
+ "learning_rate": 1.3753906250000003e-05,
1447
+ "loss": 4.1736,
1448
+ "mean_token_accuracy": 0.30897570848464967,
1449
+ "num_tokens": 148231165.0,
1450
+ "step": 1600
1451
+ },
1452
+ {
1453
+ "epoch": 3.144319843978547,
1454
+ "grad_norm": 0.42818182706832886,
1455
+ "learning_rate": 1.3714843750000003e-05,
1456
+ "loss": 4.2168,
1457
+ "mean_token_accuracy": 0.30649841353297236,
1458
+ "num_tokens": 149164808.0,
1459
+ "step": 1610
1460
+ },
1461
+ {
1462
+ "epoch": 3.1638225255972694,
1463
+ "grad_norm": 1.06892728805542,
1464
+ "learning_rate": 1.3675781250000002e-05,
1465
+ "loss": 4.1984,
1466
+ "mean_token_accuracy": 0.3070691518485546,
1467
+ "num_tokens": 150098120.0,
1468
+ "step": 1620
1469
+ },
1470
+ {
1471
+ "epoch": 3.1833252072159923,
1472
+ "grad_norm": 0.3775452971458435,
1473
+ "learning_rate": 1.3636718750000002e-05,
1474
+ "loss": 4.2151,
1475
+ "mean_token_accuracy": 0.3055191844701767,
1476
+ "num_tokens": 151024766.0,
1477
+ "step": 1630
1478
+ },
1479
+ {
1480
+ "epoch": 3.202827888834715,
1481
+ "grad_norm": 0.4792298674583435,
1482
+ "learning_rate": 1.3597656250000002e-05,
1483
+ "loss": 4.2077,
1484
+ "mean_token_accuracy": 0.30750301480293274,
1485
+ "num_tokens": 151941554.0,
1486
+ "step": 1640
1487
+ },
1488
+ {
1489
+ "epoch": 3.2223305704534373,
1490
+ "grad_norm": 0.9628048539161682,
1491
+ "learning_rate": 1.3558593750000002e-05,
1492
+ "loss": 4.1879,
1493
+ "mean_token_accuracy": 0.30834688916802405,
1494
+ "num_tokens": 152873143.0,
1495
+ "step": 1650
1496
+ },
1497
+ {
1498
+ "epoch": 3.2418332520721598,
1499
+ "grad_norm": 0.4353286027908325,
1500
+ "learning_rate": 1.3519531250000002e-05,
1501
+ "loss": 4.1972,
1502
+ "mean_token_accuracy": 0.30848201364278793,
1503
+ "num_tokens": 153799524.0,
1504
+ "step": 1660
1505
+ },
1506
+ {
1507
+ "epoch": 3.2613359336908827,
1508
+ "grad_norm": 0.47365450859069824,
1509
+ "learning_rate": 1.3480468750000002e-05,
1510
+ "loss": 4.1959,
1511
+ "mean_token_accuracy": 0.30711600482463836,
1512
+ "num_tokens": 154737404.0,
1513
+ "step": 1670
1514
+ },
1515
+ {
1516
+ "epoch": 3.280838615309605,
1517
+ "grad_norm": 0.507435142993927,
1518
+ "learning_rate": 1.3441406250000002e-05,
1519
+ "loss": 4.1875,
1520
+ "mean_token_accuracy": 0.3079258047044277,
1521
+ "num_tokens": 155669573.0,
1522
+ "step": 1680
1523
+ },
1524
+ {
1525
+ "epoch": 3.3003412969283277,
1526
+ "grad_norm": 0.3681463897228241,
1527
+ "learning_rate": 1.3402343750000002e-05,
1528
+ "loss": 4.2108,
1529
+ "mean_token_accuracy": 0.3069092735648155,
1530
+ "num_tokens": 156608562.0,
1531
+ "step": 1690
1532
+ },
1533
+ {
1534
+ "epoch": 3.31984397854705,
1535
+ "grad_norm": 0.3918771743774414,
1536
+ "learning_rate": 1.3363281250000002e-05,
1537
+ "loss": 4.1608,
1538
+ "mean_token_accuracy": 0.3113169133663177,
1539
+ "num_tokens": 157520485.0,
1540
+ "step": 1700
1541
+ },
1542
+ {
1543
+ "epoch": 3.3393466601657726,
1544
+ "grad_norm": 0.4225058853626251,
1545
+ "learning_rate": 1.3324218750000002e-05,
1546
+ "loss": 4.204,
1547
+ "mean_token_accuracy": 0.30721485018730166,
1548
+ "num_tokens": 158451388.0,
1549
+ "step": 1710
1550
+ },
1551
+ {
1552
+ "epoch": 3.358849341784495,
1553
+ "grad_norm": 0.35778412222862244,
1554
+ "learning_rate": 1.3285156250000001e-05,
1555
+ "loss": 4.1689,
1556
+ "mean_token_accuracy": 0.3099173367023468,
1557
+ "num_tokens": 159377651.0,
1558
+ "step": 1720
1559
+ },
1560
+ {
1561
+ "epoch": 3.378352023403218,
1562
+ "grad_norm": 2.289072275161743,
1563
+ "learning_rate": 1.3246093750000001e-05,
1564
+ "loss": 4.1889,
1565
+ "mean_token_accuracy": 0.30997600927948954,
1566
+ "num_tokens": 160304853.0,
1567
+ "step": 1730
1568
+ },
1569
+ {
1570
+ "epoch": 3.3978547050219405,
1571
+ "grad_norm": 1.1738953590393066,
1572
+ "learning_rate": 1.3207031250000001e-05,
1573
+ "loss": 4.1907,
1574
+ "mean_token_accuracy": 0.3083472929894924,
1575
+ "num_tokens": 161230300.0,
1576
+ "step": 1740
1577
+ },
1578
+ {
1579
+ "epoch": 3.417357386640663,
1580
+ "grad_norm": 0.35479751229286194,
1581
+ "learning_rate": 1.3167968750000001e-05,
1582
+ "loss": 4.2018,
1583
+ "mean_token_accuracy": 0.3078498594462872,
1584
+ "num_tokens": 162156769.0,
1585
+ "step": 1750
1586
+ },
1587
+ {
1588
+ "epoch": 3.4368600682593855,
1589
+ "grad_norm": 0.5815494060516357,
1590
+ "learning_rate": 1.3128906250000003e-05,
1591
+ "loss": 4.1843,
1592
+ "mean_token_accuracy": 0.3095716036856174,
1593
+ "num_tokens": 163073001.0,
1594
+ "step": 1760
1595
+ },
1596
+ {
1597
+ "epoch": 3.4563627498781084,
1598
+ "grad_norm": 0.5193825364112854,
1599
+ "learning_rate": 1.3089843750000003e-05,
1600
+ "loss": 4.1644,
1601
+ "mean_token_accuracy": 0.3114980049431324,
1602
+ "num_tokens": 164000019.0,
1603
+ "step": 1770
1604
+ },
1605
+ {
1606
+ "epoch": 3.475865431496831,
1607
+ "grad_norm": 0.5195266604423523,
1608
+ "learning_rate": 1.3050781250000003e-05,
1609
+ "loss": 4.174,
1610
+ "mean_token_accuracy": 0.30980290472507477,
1611
+ "num_tokens": 164932749.0,
1612
+ "step": 1780
1613
+ },
1614
+ {
1615
+ "epoch": 3.4953681131155534,
1616
+ "grad_norm": 0.4400602877140045,
1617
+ "learning_rate": 1.3011718750000002e-05,
1618
+ "loss": 4.1615,
1619
+ "mean_token_accuracy": 0.31133070439100263,
1620
+ "num_tokens": 165849595.0,
1621
+ "step": 1790
1622
+ },
1623
+ {
1624
+ "epoch": 3.514870794734276,
1625
+ "grad_norm": 0.3400901257991791,
1626
+ "learning_rate": 1.2972656250000002e-05,
1627
+ "loss": 4.1662,
1628
+ "mean_token_accuracy": 0.3104513093829155,
1629
+ "num_tokens": 166773538.0,
1630
+ "step": 1800
1631
+ },
1632
+ {
1633
+ "epoch": 3.5343734763529984,
1634
+ "grad_norm": 0.6036484837532043,
1635
+ "learning_rate": 1.2933593750000002e-05,
1636
+ "loss": 4.1792,
1637
+ "mean_token_accuracy": 0.31103473380208013,
1638
+ "num_tokens": 167704179.0,
1639
+ "step": 1810
1640
+ },
1641
+ {
1642
+ "epoch": 3.553876157971721,
1643
+ "grad_norm": 1.2540099620819092,
1644
+ "learning_rate": 1.2894531250000002e-05,
1645
+ "loss": 4.1385,
1646
+ "mean_token_accuracy": 0.3131691038608551,
1647
+ "num_tokens": 168631611.0,
1648
+ "step": 1820
1649
+ },
1650
+ {
1651
+ "epoch": 3.573378839590444,
1652
+ "grad_norm": 0.38774392008781433,
1653
+ "learning_rate": 1.2855468750000002e-05,
1654
+ "loss": 4.1689,
1655
+ "mean_token_accuracy": 0.3113840945065022,
1656
+ "num_tokens": 169544507.0,
1657
+ "step": 1830
1658
+ },
1659
+ {
1660
+ "epoch": 3.5928815212091663,
1661
+ "grad_norm": 0.465732216835022,
1662
+ "learning_rate": 1.2816406250000002e-05,
1663
+ "loss": 4.1749,
1664
+ "mean_token_accuracy": 0.31010042652487757,
1665
+ "num_tokens": 170472995.0,
1666
+ "step": 1840
1667
+ },
1668
+ {
1669
+ "epoch": 3.6123842028278887,
1670
+ "grad_norm": 0.471019983291626,
1671
+ "learning_rate": 1.2777343750000002e-05,
1672
+ "loss": 4.1695,
1673
+ "mean_token_accuracy": 0.3110612273216248,
1674
+ "num_tokens": 171402715.0,
1675
+ "step": 1850
1676
+ },
1677
+ {
1678
+ "epoch": 3.6318868844466117,
1679
+ "grad_norm": 0.39528539776802063,
1680
+ "learning_rate": 1.2738281250000002e-05,
1681
+ "loss": 4.1473,
1682
+ "mean_token_accuracy": 0.3129354894161224,
1683
+ "num_tokens": 172328519.0,
1684
+ "step": 1860
1685
+ },
1686
+ {
1687
+ "epoch": 3.651389566065334,
1688
+ "grad_norm": 0.5567691922187805,
1689
+ "learning_rate": 1.2699218750000002e-05,
1690
+ "loss": 4.1365,
1691
+ "mean_token_accuracy": 0.3130590170621872,
1692
+ "num_tokens": 173254781.0,
1693
+ "step": 1870
1694
+ },
1695
+ {
1696
+ "epoch": 3.6708922476840566,
1697
+ "grad_norm": 0.49460193514823914,
1698
+ "learning_rate": 1.2660156250000002e-05,
1699
+ "loss": 4.141,
1700
+ "mean_token_accuracy": 0.3134758062660694,
1701
+ "num_tokens": 174184953.0,
1702
+ "step": 1880
1703
+ },
1704
+ {
1705
+ "epoch": 3.690394929302779,
1706
+ "grad_norm": 0.3996962010860443,
1707
+ "learning_rate": 1.2621093750000001e-05,
1708
+ "loss": 4.1586,
1709
+ "mean_token_accuracy": 0.31323386654257773,
1710
+ "num_tokens": 175108592.0,
1711
+ "step": 1890
1712
+ },
1713
+ {
1714
+ "epoch": 3.7098976109215016,
1715
+ "grad_norm": 0.3824934959411621,
1716
+ "learning_rate": 1.2582031250000001e-05,
1717
+ "loss": 4.1385,
1718
+ "mean_token_accuracy": 0.3139244385063648,
1719
+ "num_tokens": 176037993.0,
1720
+ "step": 1900
1721
+ },
1722
+ {
1723
+ "epoch": 3.729400292540224,
1724
+ "grad_norm": 0.4332631528377533,
1725
+ "learning_rate": 1.2542968750000001e-05,
1726
+ "loss": 4.157,
1727
+ "mean_token_accuracy": 0.31349849998950957,
1728
+ "num_tokens": 176962331.0,
1729
+ "step": 1910
1730
+ },
1731
+ {
1732
+ "epoch": 3.748902974158947,
1733
+ "grad_norm": 0.6383605003356934,
1734
+ "learning_rate": 1.2503906250000001e-05,
1735
+ "loss": 4.1621,
1736
+ "mean_token_accuracy": 0.31250465139746664,
1737
+ "num_tokens": 177887507.0,
1738
+ "step": 1920
1739
+ },
1740
+ {
1741
+ "epoch": 3.7684056557776695,
1742
+ "grad_norm": 0.4531135559082031,
1743
+ "learning_rate": 1.2464843750000003e-05,
1744
+ "loss": 4.1326,
1745
+ "mean_token_accuracy": 0.31330428943037986,
1746
+ "num_tokens": 178819985.0,
1747
+ "step": 1930
1748
+ },
1749
+ {
1750
+ "epoch": 3.787908337396392,
1751
+ "grad_norm": 0.40584367513656616,
1752
+ "learning_rate": 1.2425781250000003e-05,
1753
+ "loss": 4.1467,
1754
+ "mean_token_accuracy": 0.3128244742751122,
1755
+ "num_tokens": 179755943.0,
1756
+ "step": 1940
1757
+ },
1758
+ {
1759
+ "epoch": 3.8074110190151145,
1760
+ "grad_norm": 0.3366943597793579,
1761
+ "learning_rate": 1.2386718750000003e-05,
1762
+ "loss": 4.153,
1763
+ "mean_token_accuracy": 0.31367711797356607,
1764
+ "num_tokens": 180690051.0,
1765
+ "step": 1950
1766
+ },
1767
+ {
1768
+ "epoch": 3.8269137006338374,
1769
+ "grad_norm": 0.47235119342803955,
1770
+ "learning_rate": 1.2347656250000002e-05,
1771
+ "loss": 4.1359,
1772
+ "mean_token_accuracy": 0.31598555818200114,
1773
+ "num_tokens": 181612029.0,
1774
+ "step": 1960
1775
+ },
1776
+ {
1777
+ "epoch": 3.84641638225256,
1778
+ "grad_norm": 0.8654286861419678,
1779
+ "learning_rate": 1.2308593750000002e-05,
1780
+ "loss": 4.1453,
1781
+ "mean_token_accuracy": 0.3134163662791252,
1782
+ "num_tokens": 182526847.0,
1783
+ "step": 1970
1784
+ },
1785
+ {
1786
+ "epoch": 3.8659190638712824,
1787
+ "grad_norm": 0.43307119607925415,
1788
+ "learning_rate": 1.2269531250000002e-05,
1789
+ "loss": 4.1145,
1790
+ "mean_token_accuracy": 0.3158551573753357,
1791
+ "num_tokens": 183462915.0,
1792
+ "step": 1980
1793
+ },
1794
+ {
1795
+ "epoch": 3.885421745490005,
1796
+ "grad_norm": 0.6477943062782288,
1797
+ "learning_rate": 1.2230468750000002e-05,
1798
+ "loss": 4.1241,
1799
+ "mean_token_accuracy": 0.3159909948706627,
1800
+ "num_tokens": 184385672.0,
1801
+ "step": 1990
1802
+ },
1803
+ {
1804
+ "epoch": 3.9049244271087273,
1805
+ "grad_norm": 0.38260239362716675,
1806
+ "learning_rate": 1.2191406250000002e-05,
1807
+ "loss": 4.1452,
1808
+ "mean_token_accuracy": 0.3129440575838089,
1809
+ "num_tokens": 185321368.0,
1810
+ "step": 2000
1811
+ }
1812
+ ],
1813
+ "logging_steps": 10,
1814
+ "max_steps": 5120,
1815
+ "num_input_tokens_seen": 0,
1816
+ "num_train_epochs": 10,
1817
+ "save_steps": 500,
1818
+ "stateful_callbacks": {
1819
+ "TrainerControl": {
1820
+ "args": {
1821
+ "should_epoch_stop": false,
1822
+ "should_evaluate": false,
1823
+ "should_log": false,
1824
+ "should_save": true,
1825
+ "should_training_stop": false
1826
+ },
1827
+ "attributes": {}
1828
+ }
1829
+ },
1830
+ "total_flos": 2.7633132003498394e+17,
1831
+ "train_batch_size": 64,
1832
+ "trial_name": null,
1833
+ "trial_params": null
1834
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:213ef3ed16d9dd20d51f6355dc64c9dc5ebcaf8490efb503d6a15061df366d53
3
+ size 5624
checkpoint-2500/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 1024,
14
+ "max_position_embeddings": 256,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 6,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 6,
20
+ "pad_token_id": 0,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-2500/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.51.3"
7
+ }
checkpoint-2500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9988be5ac9bba1921f10f52fd242e1e0ae5eccc486b68a433a0b1be67a0d0d59
3
+ size 800819936
checkpoint-2500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:385903316c121cdf7c92e5bea51c9f837c5af213e9e30778065f5323b08e657a
3
+ size 1601820026
checkpoint-2500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecefbb3f17bb76b6655eb0157c98b5287c17fa4b4c72a6b9068b0823ce9fd18d
3
+ size 14244
checkpoint-2500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c36ac66279539103c80c21896a5136f4fe5dee7b02df2613aeec5356487af9a8
3
+ size 1064
checkpoint-2500/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-2500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2500/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
checkpoint-2500/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }
checkpoint-2500/trainer_state.json ADDED
@@ -0,0 +1,2284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.8815212091662605,
6
+ "eval_steps": 500,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.019502681618722574,
14
+ "grad_norm": 3.0208523273468018,
15
+ "learning_rate": 1.9964843750000004e-05,
16
+ "loss": 7.8867,
17
+ "mean_token_accuracy": 0.0920736625790596,
18
+ "num_tokens": 920759.0,
19
+ "step": 10
20
+ },
21
+ {
22
+ "epoch": 0.03900536323744515,
23
+ "grad_norm": 2.2770543098449707,
24
+ "learning_rate": 1.9925781250000002e-05,
25
+ "loss": 7.5013,
26
+ "mean_token_accuracy": 0.10232679340988397,
27
+ "num_tokens": 1848077.0,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.05850804485616772,
32
+ "grad_norm": 1.6555824279785156,
33
+ "learning_rate": 1.9886718750000004e-05,
34
+ "loss": 7.0436,
35
+ "mean_token_accuracy": 0.11740029789507389,
36
+ "num_tokens": 2781210.0,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.0780107264748903,
41
+ "grad_norm": 1.7775338888168335,
42
+ "learning_rate": 1.9847656250000002e-05,
43
+ "loss": 6.7631,
44
+ "mean_token_accuracy": 0.1280333673581481,
45
+ "num_tokens": 3689316.0,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 0.09751340809361288,
50
+ "grad_norm": 1.2532657384872437,
51
+ "learning_rate": 1.9808593750000003e-05,
52
+ "loss": 6.515,
53
+ "mean_token_accuracy": 0.13626975379884243,
54
+ "num_tokens": 4616761.0,
55
+ "step": 50
56
+ },
57
+ {
58
+ "epoch": 0.11701608971233544,
59
+ "grad_norm": 1.1137648820877075,
60
+ "learning_rate": 1.976953125e-05,
61
+ "loss": 6.3421,
62
+ "mean_token_accuracy": 0.1418815266340971,
63
+ "num_tokens": 5544631.0,
64
+ "step": 60
65
+ },
66
+ {
67
+ "epoch": 0.13651877133105803,
68
+ "grad_norm": 0.9245680570602417,
69
+ "learning_rate": 1.9730468750000003e-05,
70
+ "loss": 6.2092,
71
+ "mean_token_accuracy": 0.1463709220290184,
72
+ "num_tokens": 6483486.0,
73
+ "step": 70
74
+ },
75
+ {
76
+ "epoch": 0.1560214529497806,
77
+ "grad_norm": 0.8324124813079834,
78
+ "learning_rate": 1.969140625e-05,
79
+ "loss": 6.0411,
80
+ "mean_token_accuracy": 0.15314992293715476,
81
+ "num_tokens": 7412558.0,
82
+ "step": 80
83
+ },
84
+ {
85
+ "epoch": 0.17552413456850316,
86
+ "grad_norm": 0.920666515827179,
87
+ "learning_rate": 1.9652343750000003e-05,
88
+ "loss": 5.9319,
89
+ "mean_token_accuracy": 0.16162274666130544,
90
+ "num_tokens": 8332801.0,
91
+ "step": 90
92
+ },
93
+ {
94
+ "epoch": 0.19502681618722575,
95
+ "grad_norm": 0.8294446468353271,
96
+ "learning_rate": 1.961328125e-05,
97
+ "loss": 5.8516,
98
+ "mean_token_accuracy": 0.16927699856460093,
99
+ "num_tokens": 9274826.0,
100
+ "step": 100
101
+ },
102
+ {
103
+ "epoch": 0.21452949780594832,
104
+ "grad_norm": 0.7535350322723389,
105
+ "learning_rate": 1.9574218750000003e-05,
106
+ "loss": 5.7591,
107
+ "mean_token_accuracy": 0.17415257096290587,
108
+ "num_tokens": 10190661.0,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 0.2340321794246709,
113
+ "grad_norm": 0.9731621742248535,
114
+ "learning_rate": 1.953515625e-05,
115
+ "loss": 5.6905,
116
+ "mean_token_accuracy": 0.17712676227092744,
117
+ "num_tokens": 11113827.0,
118
+ "step": 120
119
+ },
120
+ {
121
+ "epoch": 0.25353486104339346,
122
+ "grad_norm": 1.5154342651367188,
123
+ "learning_rate": 1.9496093750000003e-05,
124
+ "loss": 5.6104,
125
+ "mean_token_accuracy": 0.1849387872964144,
126
+ "num_tokens": 12034156.0,
127
+ "step": 130
128
+ },
129
+ {
130
+ "epoch": 0.27303754266211605,
131
+ "grad_norm": 0.7294800877571106,
132
+ "learning_rate": 1.9457031250000004e-05,
133
+ "loss": 5.5413,
134
+ "mean_token_accuracy": 0.19220538288354874,
135
+ "num_tokens": 12950143.0,
136
+ "step": 140
137
+ },
138
+ {
139
+ "epoch": 0.2925402242808386,
140
+ "grad_norm": 0.6792197823524475,
141
+ "learning_rate": 1.9417968750000002e-05,
142
+ "loss": 5.5046,
143
+ "mean_token_accuracy": 0.1948456909507513,
144
+ "num_tokens": 13864130.0,
145
+ "step": 150
146
+ },
147
+ {
148
+ "epoch": 0.3120429058995612,
149
+ "grad_norm": 0.6913173794746399,
150
+ "learning_rate": 1.9378906250000004e-05,
151
+ "loss": 5.4398,
152
+ "mean_token_accuracy": 0.19714849777519702,
153
+ "num_tokens": 14795220.0,
154
+ "step": 160
155
+ },
156
+ {
157
+ "epoch": 0.3315455875182838,
158
+ "grad_norm": 0.6437414288520813,
159
+ "learning_rate": 1.9339843750000002e-05,
160
+ "loss": 5.4273,
161
+ "mean_token_accuracy": 0.19942218959331512,
162
+ "num_tokens": 15722954.0,
163
+ "step": 170
164
+ },
165
+ {
166
+ "epoch": 0.3510482691370063,
167
+ "grad_norm": 2.1863138675689697,
168
+ "learning_rate": 1.9300781250000004e-05,
169
+ "loss": 5.3666,
170
+ "mean_token_accuracy": 0.2016600638628006,
171
+ "num_tokens": 16637414.0,
172
+ "step": 180
173
+ },
174
+ {
175
+ "epoch": 0.3705509507557289,
176
+ "grad_norm": 0.8689864873886108,
177
+ "learning_rate": 1.9261718750000002e-05,
178
+ "loss": 5.3421,
179
+ "mean_token_accuracy": 0.20273192636668683,
180
+ "num_tokens": 17569476.0,
181
+ "step": 190
182
+ },
183
+ {
184
+ "epoch": 0.3900536323744515,
185
+ "grad_norm": 1.2784861326217651,
186
+ "learning_rate": 1.9222656250000003e-05,
187
+ "loss": 5.3323,
188
+ "mean_token_accuracy": 0.20496859662234784,
189
+ "num_tokens": 18498067.0,
190
+ "step": 200
191
+ },
192
+ {
193
+ "epoch": 0.40955631399317405,
194
+ "grad_norm": 0.6330307722091675,
195
+ "learning_rate": 1.9183593750000002e-05,
196
+ "loss": 5.2827,
197
+ "mean_token_accuracy": 0.2135307714343071,
198
+ "num_tokens": 19416964.0,
199
+ "step": 210
200
+ },
201
+ {
202
+ "epoch": 0.42905899561189664,
203
+ "grad_norm": 1.1162034273147583,
204
+ "learning_rate": 1.9144531250000003e-05,
205
+ "loss": 5.2121,
206
+ "mean_token_accuracy": 0.21816504523158073,
207
+ "num_tokens": 20341056.0,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 0.4485616772306192,
212
+ "grad_norm": 0.6339605450630188,
213
+ "learning_rate": 1.910546875e-05,
214
+ "loss": 5.2187,
215
+ "mean_token_accuracy": 0.21803640052676201,
216
+ "num_tokens": 21267082.0,
217
+ "step": 230
218
+ },
219
+ {
220
+ "epoch": 0.4680643588493418,
221
+ "grad_norm": 0.607659637928009,
222
+ "learning_rate": 1.9066406250000003e-05,
223
+ "loss": 5.1826,
224
+ "mean_token_accuracy": 0.22011552266776563,
225
+ "num_tokens": 22195442.0,
226
+ "step": 240
227
+ },
228
+ {
229
+ "epoch": 0.48756704046806437,
230
+ "grad_norm": 0.5029736757278442,
231
+ "learning_rate": 1.902734375e-05,
232
+ "loss": 5.1424,
233
+ "mean_token_accuracy": 0.22285537868738176,
234
+ "num_tokens": 23144077.0,
235
+ "step": 250
236
+ },
237
+ {
238
+ "epoch": 0.5070697220867869,
239
+ "grad_norm": 0.9591688513755798,
240
+ "learning_rate": 1.8988281250000003e-05,
241
+ "loss": 5.0913,
242
+ "mean_token_accuracy": 0.22736062072217464,
243
+ "num_tokens": 24068643.0,
244
+ "step": 260
245
+ },
246
+ {
247
+ "epoch": 0.5265724037055095,
248
+ "grad_norm": 0.5418295860290527,
249
+ "learning_rate": 1.894921875e-05,
250
+ "loss": 5.1015,
251
+ "mean_token_accuracy": 0.22693138755857944,
252
+ "num_tokens": 24996534.0,
253
+ "step": 270
254
+ },
255
+ {
256
+ "epoch": 0.5460750853242321,
257
+ "grad_norm": 0.5258099436759949,
258
+ "learning_rate": 1.8910156250000003e-05,
259
+ "loss": 5.0746,
260
+ "mean_token_accuracy": 0.23026154786348343,
261
+ "num_tokens": 25920902.0,
262
+ "step": 280
263
+ },
264
+ {
265
+ "epoch": 0.5655777669429547,
266
+ "grad_norm": 0.5592005252838135,
267
+ "learning_rate": 1.887109375e-05,
268
+ "loss": 5.0566,
269
+ "mean_token_accuracy": 0.23160071447491645,
270
+ "num_tokens": 26845104.0,
271
+ "step": 290
272
+ },
273
+ {
274
+ "epoch": 0.5850804485616772,
275
+ "grad_norm": 0.5427853465080261,
276
+ "learning_rate": 1.8832031250000002e-05,
277
+ "loss": 5.0565,
278
+ "mean_token_accuracy": 0.2316149313002825,
279
+ "num_tokens": 27782058.0,
280
+ "step": 300
281
+ },
282
+ {
283
+ "epoch": 0.6045831301803998,
284
+ "grad_norm": 0.9386640191078186,
285
+ "learning_rate": 1.8792968750000004e-05,
286
+ "loss": 5.0003,
287
+ "mean_token_accuracy": 0.2349798556417227,
288
+ "num_tokens": 28707140.0,
289
+ "step": 310
290
+ },
291
+ {
292
+ "epoch": 0.6240858117991224,
293
+ "grad_norm": 0.8473969101905823,
294
+ "learning_rate": 1.8753906250000002e-05,
295
+ "loss": 4.9954,
296
+ "mean_token_accuracy": 0.23647231683135034,
297
+ "num_tokens": 29637908.0,
298
+ "step": 320
299
+ },
300
+ {
301
+ "epoch": 0.643588493417845,
302
+ "grad_norm": 0.5143874883651733,
303
+ "learning_rate": 1.8714843750000004e-05,
304
+ "loss": 4.9881,
305
+ "mean_token_accuracy": 0.23599626012146474,
306
+ "num_tokens": 30559254.0,
307
+ "step": 330
308
+ },
309
+ {
310
+ "epoch": 0.6630911750365676,
311
+ "grad_norm": 0.4523729979991913,
312
+ "learning_rate": 1.8675781250000002e-05,
313
+ "loss": 4.9695,
314
+ "mean_token_accuracy": 0.23746853992342948,
315
+ "num_tokens": 31487152.0,
316
+ "step": 340
317
+ },
318
+ {
319
+ "epoch": 0.6825938566552902,
320
+ "grad_norm": 0.5573343634605408,
321
+ "learning_rate": 1.8636718750000004e-05,
322
+ "loss": 4.9525,
323
+ "mean_token_accuracy": 0.23925678990781307,
324
+ "num_tokens": 32396911.0,
325
+ "step": 350
326
+ },
327
+ {
328
+ "epoch": 0.7020965382740126,
329
+ "grad_norm": 1.2790151834487915,
330
+ "learning_rate": 1.8597656250000002e-05,
331
+ "loss": 4.9338,
332
+ "mean_token_accuracy": 0.24103106185793877,
333
+ "num_tokens": 33325126.0,
334
+ "step": 360
335
+ },
336
+ {
337
+ "epoch": 0.7215992198927352,
338
+ "grad_norm": 0.8714343309402466,
339
+ "learning_rate": 1.8558593750000003e-05,
340
+ "loss": 4.9252,
341
+ "mean_token_accuracy": 0.24138498678803444,
342
+ "num_tokens": 34262332.0,
343
+ "step": 370
344
+ },
345
+ {
346
+ "epoch": 0.7411019015114578,
347
+ "grad_norm": 0.5251726508140564,
348
+ "learning_rate": 1.851953125e-05,
349
+ "loss": 4.8883,
350
+ "mean_token_accuracy": 0.24456401653587817,
351
+ "num_tokens": 35181080.0,
352
+ "step": 380
353
+ },
354
+ {
355
+ "epoch": 0.7606045831301804,
356
+ "grad_norm": 0.46523743867874146,
357
+ "learning_rate": 1.8480468750000003e-05,
358
+ "loss": 4.8887,
359
+ "mean_token_accuracy": 0.24552332125604154,
360
+ "num_tokens": 36105038.0,
361
+ "step": 390
362
+ },
363
+ {
364
+ "epoch": 0.780107264748903,
365
+ "grad_norm": 0.5080934166908264,
366
+ "learning_rate": 1.844140625e-05,
367
+ "loss": 4.8659,
368
+ "mean_token_accuracy": 0.245796899497509,
369
+ "num_tokens": 37036998.0,
370
+ "step": 400
371
+ },
372
+ {
373
+ "epoch": 0.7996099463676255,
374
+ "grad_norm": 1.117693543434143,
375
+ "learning_rate": 1.8402343750000003e-05,
376
+ "loss": 4.8681,
377
+ "mean_token_accuracy": 0.2463846940547228,
378
+ "num_tokens": 37968683.0,
379
+ "step": 410
380
+ },
381
+ {
382
+ "epoch": 0.8191126279863481,
383
+ "grad_norm": 0.5299689769744873,
384
+ "learning_rate": 1.836328125e-05,
385
+ "loss": 4.8305,
386
+ "mean_token_accuracy": 0.24968006946146487,
387
+ "num_tokens": 38888752.0,
388
+ "step": 420
389
+ },
390
+ {
391
+ "epoch": 0.8386153096050707,
392
+ "grad_norm": 0.8458639979362488,
393
+ "learning_rate": 1.8324218750000003e-05,
394
+ "loss": 4.8279,
395
+ "mean_token_accuracy": 0.25057865455746653,
396
+ "num_tokens": 39823608.0,
397
+ "step": 430
398
+ },
399
+ {
400
+ "epoch": 0.8581179912237933,
401
+ "grad_norm": 0.5429758429527283,
402
+ "learning_rate": 1.828515625e-05,
403
+ "loss": 4.8243,
404
+ "mean_token_accuracy": 0.2514403607696295,
405
+ "num_tokens": 40748246.0,
406
+ "step": 440
407
+ },
408
+ {
409
+ "epoch": 0.8776206728425159,
410
+ "grad_norm": 0.47386595606803894,
411
+ "learning_rate": 1.8246093750000003e-05,
412
+ "loss": 4.7881,
413
+ "mean_token_accuracy": 0.25412631034851074,
414
+ "num_tokens": 41662563.0,
415
+ "step": 450
416
+ },
417
+ {
418
+ "epoch": 0.8971233544612384,
419
+ "grad_norm": 0.41789767146110535,
420
+ "learning_rate": 1.820703125e-05,
421
+ "loss": 4.7944,
422
+ "mean_token_accuracy": 0.25327568165957925,
423
+ "num_tokens": 42588923.0,
424
+ "step": 460
425
+ },
426
+ {
427
+ "epoch": 0.916626036079961,
428
+ "grad_norm": 0.43711453676223755,
429
+ "learning_rate": 1.8167968750000002e-05,
430
+ "loss": 4.7758,
431
+ "mean_token_accuracy": 0.2548953540623188,
432
+ "num_tokens": 43515886.0,
433
+ "step": 470
434
+ },
435
+ {
436
+ "epoch": 0.9361287176986836,
437
+ "grad_norm": 0.6433466076850891,
438
+ "learning_rate": 1.8128906250000004e-05,
439
+ "loss": 4.7632,
440
+ "mean_token_accuracy": 0.2562540594488382,
441
+ "num_tokens": 44446196.0,
442
+ "step": 480
443
+ },
444
+ {
445
+ "epoch": 0.9556313993174061,
446
+ "grad_norm": 0.7580122351646423,
447
+ "learning_rate": 1.8089843750000002e-05,
448
+ "loss": 4.7559,
449
+ "mean_token_accuracy": 0.25687045492231847,
450
+ "num_tokens": 45389436.0,
451
+ "step": 490
452
+ },
453
+ {
454
+ "epoch": 0.9751340809361287,
455
+ "grad_norm": 0.9800453782081604,
456
+ "learning_rate": 1.8050781250000004e-05,
457
+ "loss": 4.7711,
458
+ "mean_token_accuracy": 0.2573121260851622,
459
+ "num_tokens": 46313035.0,
460
+ "step": 500
461
+ },
462
+ {
463
+ "epoch": 0.9946367625548513,
464
+ "grad_norm": 0.636842668056488,
465
+ "learning_rate": 1.8011718750000002e-05,
466
+ "loss": 4.7693,
467
+ "mean_token_accuracy": 0.2563398856669664,
468
+ "num_tokens": 47231018.0,
469
+ "step": 510
470
+ },
471
+ {
472
+ "epoch": 1.0156021452949782,
473
+ "grad_norm": 0.9280142188072205,
474
+ "learning_rate": 1.7972656250000004e-05,
475
+ "loss": 5.2112,
476
+ "mean_token_accuracy": 0.25861204106633257,
477
+ "num_tokens": 48181298.0,
478
+ "step": 520
479
+ },
480
+ {
481
+ "epoch": 1.0351048269137006,
482
+ "grad_norm": 0.5360897183418274,
483
+ "learning_rate": 1.7933593750000002e-05,
484
+ "loss": 4.7274,
485
+ "mean_token_accuracy": 0.26002744026482105,
486
+ "num_tokens": 49105264.0,
487
+ "step": 530
488
+ },
489
+ {
490
+ "epoch": 1.0546075085324231,
491
+ "grad_norm": 0.5811850428581238,
492
+ "learning_rate": 1.7894531250000003e-05,
493
+ "loss": 4.7052,
494
+ "mean_token_accuracy": 0.2608158510178328,
495
+ "num_tokens": 50033001.0,
496
+ "step": 540
497
+ },
498
+ {
499
+ "epoch": 1.0741101901511458,
500
+ "grad_norm": 0.9147205352783203,
501
+ "learning_rate": 1.785546875e-05,
502
+ "loss": 4.6854,
503
+ "mean_token_accuracy": 0.26275911666452884,
504
+ "num_tokens": 50954465.0,
505
+ "step": 550
506
+ },
507
+ {
508
+ "epoch": 1.0936128717698683,
509
+ "grad_norm": 0.46637555956840515,
510
+ "learning_rate": 1.7816406250000003e-05,
511
+ "loss": 4.6969,
512
+ "mean_token_accuracy": 0.26292436122894286,
513
+ "num_tokens": 51876402.0,
514
+ "step": 560
515
+ },
516
+ {
517
+ "epoch": 1.113115553388591,
518
+ "grad_norm": 1.1488078832626343,
519
+ "learning_rate": 1.777734375e-05,
520
+ "loss": 4.6934,
521
+ "mean_token_accuracy": 0.26502432897686956,
522
+ "num_tokens": 52789774.0,
523
+ "step": 570
524
+ },
525
+ {
526
+ "epoch": 1.1326182350073135,
527
+ "grad_norm": 0.42444393038749695,
528
+ "learning_rate": 1.7738281250000003e-05,
529
+ "loss": 4.6616,
530
+ "mean_token_accuracy": 0.2661720596253872,
531
+ "num_tokens": 53700514.0,
532
+ "step": 580
533
+ },
534
+ {
535
+ "epoch": 1.152120916626036,
536
+ "grad_norm": 0.46090322732925415,
537
+ "learning_rate": 1.769921875e-05,
538
+ "loss": 4.6677,
539
+ "mean_token_accuracy": 0.2653431937098503,
540
+ "num_tokens": 54637906.0,
541
+ "step": 590
542
+ },
543
+ {
544
+ "epoch": 1.1716235982447587,
545
+ "grad_norm": 1.0310624837875366,
546
+ "learning_rate": 1.7660156250000003e-05,
547
+ "loss": 4.6441,
548
+ "mean_token_accuracy": 0.26668640449643133,
549
+ "num_tokens": 55566104.0,
550
+ "step": 600
551
+ },
552
+ {
553
+ "epoch": 1.1911262798634812,
554
+ "grad_norm": 0.5190272927284241,
555
+ "learning_rate": 1.762109375e-05,
556
+ "loss": 4.6388,
557
+ "mean_token_accuracy": 0.26863499656319617,
558
+ "num_tokens": 56475233.0,
559
+ "step": 610
560
+ },
561
+ {
562
+ "epoch": 1.210628961482204,
563
+ "grad_norm": 0.4435961842536926,
564
+ "learning_rate": 1.7582031250000003e-05,
565
+ "loss": 4.6344,
566
+ "mean_token_accuracy": 0.27009780779480935,
567
+ "num_tokens": 57393208.0,
568
+ "step": 620
569
+ },
570
+ {
571
+ "epoch": 1.2301316431009264,
572
+ "grad_norm": 1.5555769205093384,
573
+ "learning_rate": 1.754296875e-05,
574
+ "loss": 4.6239,
575
+ "mean_token_accuracy": 0.26839635893702507,
576
+ "num_tokens": 58320792.0,
577
+ "step": 630
578
+ },
579
+ {
580
+ "epoch": 1.2496343247196489,
581
+ "grad_norm": 0.45114317536354065,
582
+ "learning_rate": 1.7503906250000002e-05,
583
+ "loss": 4.6231,
584
+ "mean_token_accuracy": 0.26895947232842443,
585
+ "num_tokens": 59247413.0,
586
+ "step": 640
587
+ },
588
+ {
589
+ "epoch": 1.2691370063383716,
590
+ "grad_norm": 0.5050057768821716,
591
+ "learning_rate": 1.7464843750000004e-05,
592
+ "loss": 4.6231,
593
+ "mean_token_accuracy": 0.2689752779901028,
594
+ "num_tokens": 60180529.0,
595
+ "step": 650
596
+ },
597
+ {
598
+ "epoch": 1.288639687957094,
599
+ "grad_norm": 0.4494447708129883,
600
+ "learning_rate": 1.7425781250000002e-05,
601
+ "loss": 4.5939,
602
+ "mean_token_accuracy": 0.2701444610953331,
603
+ "num_tokens": 61116823.0,
604
+ "step": 660
605
+ },
606
+ {
607
+ "epoch": 1.3081423695758168,
608
+ "grad_norm": 0.5126622915267944,
609
+ "learning_rate": 1.7386718750000004e-05,
610
+ "loss": 4.5905,
611
+ "mean_token_accuracy": 0.27250412106513977,
612
+ "num_tokens": 62036873.0,
613
+ "step": 670
614
+ },
615
+ {
616
+ "epoch": 1.3276450511945392,
617
+ "grad_norm": 0.4027528166770935,
618
+ "learning_rate": 1.7347656250000002e-05,
619
+ "loss": 4.5702,
620
+ "mean_token_accuracy": 0.2741738684475422,
621
+ "num_tokens": 62955213.0,
622
+ "step": 680
623
+ },
624
+ {
625
+ "epoch": 1.3471477328132617,
626
+ "grad_norm": 0.42226913571357727,
627
+ "learning_rate": 1.7308593750000004e-05,
628
+ "loss": 4.6027,
629
+ "mean_token_accuracy": 0.2714010961353779,
630
+ "num_tokens": 63879282.0,
631
+ "step": 690
632
+ },
633
+ {
634
+ "epoch": 1.3666504144319844,
635
+ "grad_norm": 0.7456600069999695,
636
+ "learning_rate": 1.7269531250000002e-05,
637
+ "loss": 4.6076,
638
+ "mean_token_accuracy": 0.27038322016596794,
639
+ "num_tokens": 64813842.0,
640
+ "step": 700
641
+ },
642
+ {
643
+ "epoch": 1.386153096050707,
644
+ "grad_norm": 0.4936697483062744,
645
+ "learning_rate": 1.7230468750000003e-05,
646
+ "loss": 4.5808,
647
+ "mean_token_accuracy": 0.27259208634495735,
648
+ "num_tokens": 65750164.0,
649
+ "step": 710
650
+ },
651
+ {
652
+ "epoch": 1.4056557776694296,
653
+ "grad_norm": 0.40352705121040344,
654
+ "learning_rate": 1.719140625e-05,
655
+ "loss": 4.5687,
656
+ "mean_token_accuracy": 0.2735755704343319,
657
+ "num_tokens": 66671414.0,
658
+ "step": 720
659
+ },
660
+ {
661
+ "epoch": 1.425158459288152,
662
+ "grad_norm": 0.5929037928581238,
663
+ "learning_rate": 1.7152343750000003e-05,
664
+ "loss": 4.5549,
665
+ "mean_token_accuracy": 0.27455407530069353,
666
+ "num_tokens": 67601565.0,
667
+ "step": 730
668
+ },
669
+ {
670
+ "epoch": 1.4446611409068746,
671
+ "grad_norm": 2.1730380058288574,
672
+ "learning_rate": 1.711328125e-05,
673
+ "loss": 4.5471,
674
+ "mean_token_accuracy": 0.2747652716934681,
675
+ "num_tokens": 68543369.0,
676
+ "step": 740
677
+ },
678
+ {
679
+ "epoch": 1.4641638225255973,
680
+ "grad_norm": 0.45310553908348083,
681
+ "learning_rate": 1.7074218750000003e-05,
682
+ "loss": 4.5379,
683
+ "mean_token_accuracy": 0.2759984292089939,
684
+ "num_tokens": 69475220.0,
685
+ "step": 750
686
+ },
687
+ {
688
+ "epoch": 1.4836665041443198,
689
+ "grad_norm": 0.44060948491096497,
690
+ "learning_rate": 1.703515625e-05,
691
+ "loss": 4.5207,
692
+ "mean_token_accuracy": 0.2781739257276058,
693
+ "num_tokens": 70397859.0,
694
+ "step": 760
695
+ },
696
+ {
697
+ "epoch": 1.5031691857630425,
698
+ "grad_norm": 0.9775242209434509,
699
+ "learning_rate": 1.6996093750000003e-05,
700
+ "loss": 4.5283,
701
+ "mean_token_accuracy": 0.27766570150852204,
702
+ "num_tokens": 71329289.0,
703
+ "step": 770
704
+ },
705
+ {
706
+ "epoch": 1.522671867381765,
707
+ "grad_norm": 0.476166695356369,
708
+ "learning_rate": 1.695703125e-05,
709
+ "loss": 4.5227,
710
+ "mean_token_accuracy": 0.2781515374779701,
711
+ "num_tokens": 72251787.0,
712
+ "step": 780
713
+ },
714
+ {
715
+ "epoch": 1.5421745490004874,
716
+ "grad_norm": 0.7697413563728333,
717
+ "learning_rate": 1.6917968750000003e-05,
718
+ "loss": 4.5089,
719
+ "mean_token_accuracy": 0.2777483291924,
720
+ "num_tokens": 73184759.0,
721
+ "step": 790
722
+ },
723
+ {
724
+ "epoch": 1.5616772306192102,
725
+ "grad_norm": 0.39053142070770264,
726
+ "learning_rate": 1.687890625e-05,
727
+ "loss": 4.5224,
728
+ "mean_token_accuracy": 0.2780651919543743,
729
+ "num_tokens": 74104519.0,
730
+ "step": 800
731
+ },
732
+ {
733
+ "epoch": 1.5811799122379329,
734
+ "grad_norm": 0.5136573910713196,
735
+ "learning_rate": 1.6839843750000002e-05,
736
+ "loss": 4.5168,
737
+ "mean_token_accuracy": 0.27778707146644593,
738
+ "num_tokens": 75041132.0,
739
+ "step": 810
740
+ },
741
+ {
742
+ "epoch": 1.6006825938566553,
743
+ "grad_norm": 0.4006953239440918,
744
+ "learning_rate": 1.6800781250000004e-05,
745
+ "loss": 4.4969,
746
+ "mean_token_accuracy": 0.2798936806619167,
747
+ "num_tokens": 75957080.0,
748
+ "step": 820
749
+ },
750
+ {
751
+ "epoch": 1.6201852754753778,
752
+ "grad_norm": 0.8261349201202393,
753
+ "learning_rate": 1.6761718750000002e-05,
754
+ "loss": 4.503,
755
+ "mean_token_accuracy": 0.27909068912267687,
756
+ "num_tokens": 76885767.0,
757
+ "step": 830
758
+ },
759
+ {
760
+ "epoch": 1.6396879570941003,
761
+ "grad_norm": 0.6244539022445679,
762
+ "learning_rate": 1.6722656250000004e-05,
763
+ "loss": 4.4941,
764
+ "mean_token_accuracy": 0.27921902686357497,
765
+ "num_tokens": 77828819.0,
766
+ "step": 840
767
+ },
768
+ {
769
+ "epoch": 1.659190638712823,
770
+ "grad_norm": 0.3824380040168762,
771
+ "learning_rate": 1.6683593750000002e-05,
772
+ "loss": 4.5036,
773
+ "mean_token_accuracy": 0.2791468746960163,
774
+ "num_tokens": 78767094.0,
775
+ "step": 850
776
+ },
777
+ {
778
+ "epoch": 1.6786933203315457,
779
+ "grad_norm": 0.41238024830818176,
780
+ "learning_rate": 1.6644531250000004e-05,
781
+ "loss": 4.498,
782
+ "mean_token_accuracy": 0.28029546365141866,
783
+ "num_tokens": 79699908.0,
784
+ "step": 860
785
+ },
786
+ {
787
+ "epoch": 1.6981960019502682,
788
+ "grad_norm": 0.4335004389286041,
789
+ "learning_rate": 1.6605468750000002e-05,
790
+ "loss": 4.4684,
791
+ "mean_token_accuracy": 0.2824758395552635,
792
+ "num_tokens": 80618966.0,
793
+ "step": 870
794
+ },
795
+ {
796
+ "epoch": 1.7176986835689907,
797
+ "grad_norm": 0.48968249559402466,
798
+ "learning_rate": 1.6566406250000003e-05,
799
+ "loss": 4.4526,
800
+ "mean_token_accuracy": 0.28368064761161804,
801
+ "num_tokens": 81545205.0,
802
+ "step": 880
803
+ },
804
+ {
805
+ "epoch": 1.7372013651877132,
806
+ "grad_norm": 0.41890937089920044,
807
+ "learning_rate": 1.652734375e-05,
808
+ "loss": 4.4734,
809
+ "mean_token_accuracy": 0.2822652608156204,
810
+ "num_tokens": 82463087.0,
811
+ "step": 890
812
+ },
813
+ {
814
+ "epoch": 1.7567040468064359,
815
+ "grad_norm": 0.40375128388404846,
816
+ "learning_rate": 1.6488281250000003e-05,
817
+ "loss": 4.4329,
818
+ "mean_token_accuracy": 0.2864795848727226,
819
+ "num_tokens": 83384315.0,
820
+ "step": 900
821
+ },
822
+ {
823
+ "epoch": 1.7762067284251586,
824
+ "grad_norm": 1.6861543655395508,
825
+ "learning_rate": 1.644921875e-05,
826
+ "loss": 4.4701,
827
+ "mean_token_accuracy": 0.2822112552821636,
828
+ "num_tokens": 84314435.0,
829
+ "step": 910
830
+ },
831
+ {
832
+ "epoch": 1.795709410043881,
833
+ "grad_norm": 0.42918047308921814,
834
+ "learning_rate": 1.6410156250000003e-05,
835
+ "loss": 4.4543,
836
+ "mean_token_accuracy": 0.282807744294405,
837
+ "num_tokens": 85248307.0,
838
+ "step": 920
839
+ },
840
+ {
841
+ "epoch": 1.8152120916626036,
842
+ "grad_norm": 0.38594943284988403,
843
+ "learning_rate": 1.637109375e-05,
844
+ "loss": 4.4509,
845
+ "mean_token_accuracy": 0.28484038934111594,
846
+ "num_tokens": 86168104.0,
847
+ "step": 930
848
+ },
849
+ {
850
+ "epoch": 1.834714773281326,
851
+ "grad_norm": 0.37766233086586,
852
+ "learning_rate": 1.6332031250000003e-05,
853
+ "loss": 4.4378,
854
+ "mean_token_accuracy": 0.28509455919265747,
855
+ "num_tokens": 87085294.0,
856
+ "step": 940
857
+ },
858
+ {
859
+ "epoch": 1.8542174549000487,
860
+ "grad_norm": 0.5309925079345703,
861
+ "learning_rate": 1.629296875e-05,
862
+ "loss": 4.45,
863
+ "mean_token_accuracy": 0.283291470259428,
864
+ "num_tokens": 88022698.0,
865
+ "step": 950
866
+ },
867
+ {
868
+ "epoch": 1.8737201365187715,
869
+ "grad_norm": 0.5312850475311279,
870
+ "learning_rate": 1.6253906250000002e-05,
871
+ "loss": 4.3849,
872
+ "mean_token_accuracy": 0.28923906683921813,
873
+ "num_tokens": 88947429.0,
874
+ "step": 960
875
+ },
876
+ {
877
+ "epoch": 1.893222818137494,
878
+ "grad_norm": 0.5679749250411987,
879
+ "learning_rate": 1.621484375e-05,
880
+ "loss": 4.3953,
881
+ "mean_token_accuracy": 0.28862822949886324,
882
+ "num_tokens": 89878787.0,
883
+ "step": 970
884
+ },
885
+ {
886
+ "epoch": 1.9127254997562164,
887
+ "grad_norm": 0.6506769061088562,
888
+ "learning_rate": 1.6175781250000002e-05,
889
+ "loss": 4.4121,
890
+ "mean_token_accuracy": 0.287344753742218,
891
+ "num_tokens": 90794282.0,
892
+ "step": 980
893
+ },
894
+ {
895
+ "epoch": 1.932228181374939,
896
+ "grad_norm": 0.5218345522880554,
897
+ "learning_rate": 1.6136718750000004e-05,
898
+ "loss": 4.422,
899
+ "mean_token_accuracy": 0.28664510771632196,
900
+ "num_tokens": 91720204.0,
901
+ "step": 990
902
+ },
903
+ {
904
+ "epoch": 1.9517308629936616,
905
+ "grad_norm": 0.38746026158332825,
906
+ "learning_rate": 1.6097656250000002e-05,
907
+ "loss": 4.3909,
908
+ "mean_token_accuracy": 0.2888357400894165,
909
+ "num_tokens": 92640826.0,
910
+ "step": 1000
911
+ },
912
+ {
913
+ "epoch": 1.9712335446123843,
914
+ "grad_norm": 0.4358104467391968,
915
+ "learning_rate": 1.6058593750000004e-05,
916
+ "loss": 4.4009,
917
+ "mean_token_accuracy": 0.28872263357043265,
918
+ "num_tokens": 93557645.0,
919
+ "step": 1010
920
+ },
921
+ {
922
+ "epoch": 1.9907362262311068,
923
+ "grad_norm": 0.399600088596344,
924
+ "learning_rate": 1.6019531250000002e-05,
925
+ "loss": 4.41,
926
+ "mean_token_accuracy": 0.28658533096313477,
927
+ "num_tokens": 94488514.0,
928
+ "step": 1020
929
+ },
930
+ {
931
+ "epoch": 2.0117016089712334,
932
+ "grad_norm": 0.4287355840206146,
933
+ "learning_rate": 1.5980468750000003e-05,
934
+ "loss": 4.8637,
935
+ "mean_token_accuracy": 0.2884528564243782,
936
+ "num_tokens": 95429559.0,
937
+ "step": 1030
938
+ },
939
+ {
940
+ "epoch": 2.0312042905899563,
941
+ "grad_norm": 0.43448057770729065,
942
+ "learning_rate": 1.594140625e-05,
943
+ "loss": 4.411,
944
+ "mean_token_accuracy": 0.28822447881102564,
945
+ "num_tokens": 96370969.0,
946
+ "step": 1040
947
+ },
948
+ {
949
+ "epoch": 2.050706972208679,
950
+ "grad_norm": 0.4509051442146301,
951
+ "learning_rate": 1.5902343750000003e-05,
952
+ "loss": 4.3998,
953
+ "mean_token_accuracy": 0.2875743143260479,
954
+ "num_tokens": 97297487.0,
955
+ "step": 1050
956
+ },
957
+ {
958
+ "epoch": 2.0702096538274013,
959
+ "grad_norm": 0.4603135585784912,
960
+ "learning_rate": 1.586328125e-05,
961
+ "loss": 4.3895,
962
+ "mean_token_accuracy": 0.28888514786958697,
963
+ "num_tokens": 98234116.0,
964
+ "step": 1060
965
+ },
966
+ {
967
+ "epoch": 2.089712335446124,
968
+ "grad_norm": 0.45260968804359436,
969
+ "learning_rate": 1.5824218750000003e-05,
970
+ "loss": 4.3827,
971
+ "mean_token_accuracy": 0.28953884318470957,
972
+ "num_tokens": 99158170.0,
973
+ "step": 1070
974
+ },
975
+ {
976
+ "epoch": 2.1092150170648463,
977
+ "grad_norm": 0.4549092650413513,
978
+ "learning_rate": 1.578515625e-05,
979
+ "loss": 4.3818,
980
+ "mean_token_accuracy": 0.2901176653802395,
981
+ "num_tokens": 100089007.0,
982
+ "step": 1080
983
+ },
984
+ {
985
+ "epoch": 2.128717698683569,
986
+ "grad_norm": 0.4202571213245392,
987
+ "learning_rate": 1.5746093750000003e-05,
988
+ "loss": 4.3617,
989
+ "mean_token_accuracy": 0.2919108562171459,
990
+ "num_tokens": 101002323.0,
991
+ "step": 1090
992
+ },
993
+ {
994
+ "epoch": 2.1482203803022917,
995
+ "grad_norm": 0.5119932889938354,
996
+ "learning_rate": 1.570703125e-05,
997
+ "loss": 4.365,
998
+ "mean_token_accuracy": 0.2918895035982132,
999
+ "num_tokens": 101915323.0,
1000
+ "step": 1100
1001
+ },
1002
+ {
1003
+ "epoch": 2.167723061921014,
1004
+ "grad_norm": 0.49400025606155396,
1005
+ "learning_rate": 1.5667968750000003e-05,
1006
+ "loss": 4.3662,
1007
+ "mean_token_accuracy": 0.29112903624773023,
1008
+ "num_tokens": 102848704.0,
1009
+ "step": 1110
1010
+ },
1011
+ {
1012
+ "epoch": 2.1872257435397366,
1013
+ "grad_norm": 9.812466621398926,
1014
+ "learning_rate": 1.562890625e-05,
1015
+ "loss": 4.3536,
1016
+ "mean_token_accuracy": 0.29283427745103835,
1017
+ "num_tokens": 103770127.0,
1018
+ "step": 1120
1019
+ },
1020
+ {
1021
+ "epoch": 2.206728425158459,
1022
+ "grad_norm": 0.6520562171936035,
1023
+ "learning_rate": 1.5589843750000002e-05,
1024
+ "loss": 4.3685,
1025
+ "mean_token_accuracy": 0.29140080511569977,
1026
+ "num_tokens": 104696965.0,
1027
+ "step": 1130
1028
+ },
1029
+ {
1030
+ "epoch": 2.226231106777182,
1031
+ "grad_norm": 0.3824687600135803,
1032
+ "learning_rate": 1.555078125e-05,
1033
+ "loss": 4.3617,
1034
+ "mean_token_accuracy": 0.29200059548020363,
1035
+ "num_tokens": 105627411.0,
1036
+ "step": 1140
1037
+ },
1038
+ {
1039
+ "epoch": 2.2457337883959045,
1040
+ "grad_norm": 0.40885069966316223,
1041
+ "learning_rate": 1.5511718750000002e-05,
1042
+ "loss": 4.3486,
1043
+ "mean_token_accuracy": 0.29261764511466026,
1044
+ "num_tokens": 106560394.0,
1045
+ "step": 1150
1046
+ },
1047
+ {
1048
+ "epoch": 2.265236470014627,
1049
+ "grad_norm": 0.5578988194465637,
1050
+ "learning_rate": 1.5472656250000004e-05,
1051
+ "loss": 4.3477,
1052
+ "mean_token_accuracy": 0.2928113825619221,
1053
+ "num_tokens": 107484781.0,
1054
+ "step": 1160
1055
+ },
1056
+ {
1057
+ "epoch": 2.2847391516333495,
1058
+ "grad_norm": 2.4490866661071777,
1059
+ "learning_rate": 1.5433593750000002e-05,
1060
+ "loss": 4.3497,
1061
+ "mean_token_accuracy": 0.2932279795408249,
1062
+ "num_tokens": 108403123.0,
1063
+ "step": 1170
1064
+ },
1065
+ {
1066
+ "epoch": 2.304241833252072,
1067
+ "grad_norm": 0.4807080030441284,
1068
+ "learning_rate": 1.5394531250000004e-05,
1069
+ "loss": 4.3279,
1070
+ "mean_token_accuracy": 0.2941134661436081,
1071
+ "num_tokens": 109334624.0,
1072
+ "step": 1180
1073
+ },
1074
+ {
1075
+ "epoch": 2.323744514870795,
1076
+ "grad_norm": 0.7090457677841187,
1077
+ "learning_rate": 1.5355468750000002e-05,
1078
+ "loss": 4.345,
1079
+ "mean_token_accuracy": 0.2933297656476498,
1080
+ "num_tokens": 110262463.0,
1081
+ "step": 1190
1082
+ },
1083
+ {
1084
+ "epoch": 2.3432471964895174,
1085
+ "grad_norm": 0.46787402033805847,
1086
+ "learning_rate": 1.5316406250000003e-05,
1087
+ "loss": 4.3362,
1088
+ "mean_token_accuracy": 0.2946368932723999,
1089
+ "num_tokens": 111180510.0,
1090
+ "step": 1200
1091
+ },
1092
+ {
1093
+ "epoch": 2.36274987810824,
1094
+ "grad_norm": 0.45472535490989685,
1095
+ "learning_rate": 1.527734375e-05,
1096
+ "loss": 4.32,
1097
+ "mean_token_accuracy": 0.2949611395597458,
1098
+ "num_tokens": 112102026.0,
1099
+ "step": 1210
1100
+ },
1101
+ {
1102
+ "epoch": 2.3822525597269624,
1103
+ "grad_norm": 0.5668436288833618,
1104
+ "learning_rate": 1.5238281250000002e-05,
1105
+ "loss": 4.3424,
1106
+ "mean_token_accuracy": 0.2933102063834667,
1107
+ "num_tokens": 113030329.0,
1108
+ "step": 1220
1109
+ },
1110
+ {
1111
+ "epoch": 2.401755241345685,
1112
+ "grad_norm": 0.446575403213501,
1113
+ "learning_rate": 1.5199218750000001e-05,
1114
+ "loss": 4.3253,
1115
+ "mean_token_accuracy": 0.29592231959104537,
1116
+ "num_tokens": 113952583.0,
1117
+ "step": 1230
1118
+ },
1119
+ {
1120
+ "epoch": 2.421257922964408,
1121
+ "grad_norm": 0.47586357593536377,
1122
+ "learning_rate": 1.5160156250000001e-05,
1123
+ "loss": 4.3155,
1124
+ "mean_token_accuracy": 0.2961124524474144,
1125
+ "num_tokens": 114874886.0,
1126
+ "step": 1240
1127
+ },
1128
+ {
1129
+ "epoch": 2.4407606045831303,
1130
+ "grad_norm": 0.5272416472434998,
1131
+ "learning_rate": 1.5121093750000003e-05,
1132
+ "loss": 4.3256,
1133
+ "mean_token_accuracy": 0.29535831734538076,
1134
+ "num_tokens": 115809893.0,
1135
+ "step": 1250
1136
+ },
1137
+ {
1138
+ "epoch": 2.4602632862018528,
1139
+ "grad_norm": 0.5159743428230286,
1140
+ "learning_rate": 1.5082031250000003e-05,
1141
+ "loss": 4.3045,
1142
+ "mean_token_accuracy": 0.2964573077857494,
1143
+ "num_tokens": 116738350.0,
1144
+ "step": 1260
1145
+ },
1146
+ {
1147
+ "epoch": 2.4797659678205752,
1148
+ "grad_norm": 0.3612087070941925,
1149
+ "learning_rate": 1.5042968750000003e-05,
1150
+ "loss": 4.3171,
1151
+ "mean_token_accuracy": 0.296286004781723,
1152
+ "num_tokens": 117673838.0,
1153
+ "step": 1270
1154
+ },
1155
+ {
1156
+ "epoch": 2.4992686494392977,
1157
+ "grad_norm": 1.1809757947921753,
1158
+ "learning_rate": 1.5003906250000003e-05,
1159
+ "loss": 4.3191,
1160
+ "mean_token_accuracy": 0.2969906762242317,
1161
+ "num_tokens": 118603195.0,
1162
+ "step": 1280
1163
+ },
1164
+ {
1165
+ "epoch": 2.51877133105802,
1166
+ "grad_norm": 0.6246888041496277,
1167
+ "learning_rate": 1.4964843750000002e-05,
1168
+ "loss": 4.3009,
1169
+ "mean_token_accuracy": 0.2975566402077675,
1170
+ "num_tokens": 119519063.0,
1171
+ "step": 1290
1172
+ },
1173
+ {
1174
+ "epoch": 2.538274012676743,
1175
+ "grad_norm": 0.8195675611495972,
1176
+ "learning_rate": 1.4925781250000002e-05,
1177
+ "loss": 4.3006,
1178
+ "mean_token_accuracy": 0.297472283244133,
1179
+ "num_tokens": 120445030.0,
1180
+ "step": 1300
1181
+ },
1182
+ {
1183
+ "epoch": 2.5577766942954656,
1184
+ "grad_norm": 0.4961223602294922,
1185
+ "learning_rate": 1.4886718750000002e-05,
1186
+ "loss": 4.2969,
1187
+ "mean_token_accuracy": 0.29855757504701613,
1188
+ "num_tokens": 121369449.0,
1189
+ "step": 1310
1190
+ },
1191
+ {
1192
+ "epoch": 2.577279375914188,
1193
+ "grad_norm": 0.5146915912628174,
1194
+ "learning_rate": 1.4847656250000002e-05,
1195
+ "loss": 4.2858,
1196
+ "mean_token_accuracy": 0.2985923945903778,
1197
+ "num_tokens": 122298443.0,
1198
+ "step": 1320
1199
+ },
1200
+ {
1201
+ "epoch": 2.596782057532911,
1202
+ "grad_norm": 0.6109800934791565,
1203
+ "learning_rate": 1.4808593750000002e-05,
1204
+ "loss": 4.2935,
1205
+ "mean_token_accuracy": 0.29859942123293876,
1206
+ "num_tokens": 123233917.0,
1207
+ "step": 1330
1208
+ },
1209
+ {
1210
+ "epoch": 2.6162847391516335,
1211
+ "grad_norm": 0.40669572353363037,
1212
+ "learning_rate": 1.4769531250000002e-05,
1213
+ "loss": 4.2888,
1214
+ "mean_token_accuracy": 0.29900421276688577,
1215
+ "num_tokens": 124161805.0,
1216
+ "step": 1340
1217
+ },
1218
+ {
1219
+ "epoch": 2.635787420770356,
1220
+ "grad_norm": 1.3442695140838623,
1221
+ "learning_rate": 1.4730468750000002e-05,
1222
+ "loss": 4.2757,
1223
+ "mean_token_accuracy": 0.2997878722846508,
1224
+ "num_tokens": 125085090.0,
1225
+ "step": 1350
1226
+ },
1227
+ {
1228
+ "epoch": 2.6552901023890785,
1229
+ "grad_norm": 0.5308565497398376,
1230
+ "learning_rate": 1.4691406250000002e-05,
1231
+ "loss": 4.2859,
1232
+ "mean_token_accuracy": 0.29943727552890775,
1233
+ "num_tokens": 126012123.0,
1234
+ "step": 1360
1235
+ },
1236
+ {
1237
+ "epoch": 2.674792784007801,
1238
+ "grad_norm": 0.5062427520751953,
1239
+ "learning_rate": 1.4652343750000002e-05,
1240
+ "loss": 4.2803,
1241
+ "mean_token_accuracy": 0.2989941954612732,
1242
+ "num_tokens": 126935848.0,
1243
+ "step": 1370
1244
+ },
1245
+ {
1246
+ "epoch": 2.6942954656265234,
1247
+ "grad_norm": 0.41506361961364746,
1248
+ "learning_rate": 1.4613281250000002e-05,
1249
+ "loss": 4.2803,
1250
+ "mean_token_accuracy": 0.3002371557056904,
1251
+ "num_tokens": 127857739.0,
1252
+ "step": 1380
1253
+ },
1254
+ {
1255
+ "epoch": 2.7137981472452464,
1256
+ "grad_norm": 0.44968003034591675,
1257
+ "learning_rate": 1.4574218750000001e-05,
1258
+ "loss": 4.2577,
1259
+ "mean_token_accuracy": 0.30137933045625687,
1260
+ "num_tokens": 128776712.0,
1261
+ "step": 1390
1262
+ },
1263
+ {
1264
+ "epoch": 2.733300828863969,
1265
+ "grad_norm": 0.41343918442726135,
1266
+ "learning_rate": 1.4535156250000001e-05,
1267
+ "loss": 4.2617,
1268
+ "mean_token_accuracy": 0.3014510445296764,
1269
+ "num_tokens": 129707252.0,
1270
+ "step": 1400
1271
+ },
1272
+ {
1273
+ "epoch": 2.7528035104826913,
1274
+ "grad_norm": 0.7177313566207886,
1275
+ "learning_rate": 1.4496093750000001e-05,
1276
+ "loss": 4.2673,
1277
+ "mean_token_accuracy": 0.3009005382657051,
1278
+ "num_tokens": 130637763.0,
1279
+ "step": 1410
1280
+ },
1281
+ {
1282
+ "epoch": 2.772306192101414,
1283
+ "grad_norm": 1.7760525941848755,
1284
+ "learning_rate": 1.4457031250000003e-05,
1285
+ "loss": 4.2834,
1286
+ "mean_token_accuracy": 0.2999152898788452,
1287
+ "num_tokens": 131573493.0,
1288
+ "step": 1420
1289
+ },
1290
+ {
1291
+ "epoch": 2.7918088737201368,
1292
+ "grad_norm": 0.397335022687912,
1293
+ "learning_rate": 1.4417968750000003e-05,
1294
+ "loss": 4.2466,
1295
+ "mean_token_accuracy": 0.30402503311634066,
1296
+ "num_tokens": 132504591.0,
1297
+ "step": 1430
1298
+ },
1299
+ {
1300
+ "epoch": 2.8113115553388592,
1301
+ "grad_norm": 0.3949294686317444,
1302
+ "learning_rate": 1.4378906250000003e-05,
1303
+ "loss": 4.265,
1304
+ "mean_token_accuracy": 0.30095369294285773,
1305
+ "num_tokens": 133431529.0,
1306
+ "step": 1440
1307
+ },
1308
+ {
1309
+ "epoch": 2.8308142369575817,
1310
+ "grad_norm": 0.4513266682624817,
1311
+ "learning_rate": 1.4339843750000003e-05,
1312
+ "loss": 4.2622,
1313
+ "mean_token_accuracy": 0.30230883583426477,
1314
+ "num_tokens": 134351259.0,
1315
+ "step": 1450
1316
+ },
1317
+ {
1318
+ "epoch": 2.850316918576304,
1319
+ "grad_norm": 0.42385134100914,
1320
+ "learning_rate": 1.4300781250000002e-05,
1321
+ "loss": 4.2306,
1322
+ "mean_token_accuracy": 0.3048314802348614,
1323
+ "num_tokens": 135276683.0,
1324
+ "step": 1460
1325
+ },
1326
+ {
1327
+ "epoch": 2.8698196001950267,
1328
+ "grad_norm": 0.9934040307998657,
1329
+ "learning_rate": 1.4261718750000002e-05,
1330
+ "loss": 4.237,
1331
+ "mean_token_accuracy": 0.303445303440094,
1332
+ "num_tokens": 136210229.0,
1333
+ "step": 1470
1334
+ },
1335
+ {
1336
+ "epoch": 2.889322281813749,
1337
+ "grad_norm": 0.7958151698112488,
1338
+ "learning_rate": 1.4222656250000002e-05,
1339
+ "loss": 4.2307,
1340
+ "mean_token_accuracy": 0.305256237834692,
1341
+ "num_tokens": 137139018.0,
1342
+ "step": 1480
1343
+ },
1344
+ {
1345
+ "epoch": 2.908824963432472,
1346
+ "grad_norm": 0.5570520758628845,
1347
+ "learning_rate": 1.4183593750000002e-05,
1348
+ "loss": 4.2503,
1349
+ "mean_token_accuracy": 0.3026120513677597,
1350
+ "num_tokens": 138071332.0,
1351
+ "step": 1490
1352
+ },
1353
+ {
1354
+ "epoch": 2.9283276450511946,
1355
+ "grad_norm": 0.41619789600372314,
1356
+ "learning_rate": 1.4144531250000002e-05,
1357
+ "loss": 4.2189,
1358
+ "mean_token_accuracy": 0.3050854988396168,
1359
+ "num_tokens": 139000222.0,
1360
+ "step": 1500
1361
+ },
1362
+ {
1363
+ "epoch": 2.947830326669917,
1364
+ "grad_norm": 0.44383278489112854,
1365
+ "learning_rate": 1.4105468750000002e-05,
1366
+ "loss": 4.2415,
1367
+ "mean_token_accuracy": 0.3042911276221275,
1368
+ "num_tokens": 139917384.0,
1369
+ "step": 1510
1370
+ },
1371
+ {
1372
+ "epoch": 2.9673330082886396,
1373
+ "grad_norm": 0.8169625997543335,
1374
+ "learning_rate": 1.4066406250000002e-05,
1375
+ "loss": 4.217,
1376
+ "mean_token_accuracy": 0.3053439900279045,
1377
+ "num_tokens": 140832220.0,
1378
+ "step": 1520
1379
+ },
1380
+ {
1381
+ "epoch": 2.9868356899073625,
1382
+ "grad_norm": 0.3078594207763672,
1383
+ "learning_rate": 1.4027343750000002e-05,
1384
+ "loss": 4.2246,
1385
+ "mean_token_accuracy": 0.30591325610876086,
1386
+ "num_tokens": 141759431.0,
1387
+ "step": 1530
1388
+ },
1389
+ {
1390
+ "epoch": 3.007801072647489,
1391
+ "grad_norm": 0.46778416633605957,
1392
+ "learning_rate": 1.3988281250000002e-05,
1393
+ "loss": 4.63,
1394
+ "mean_token_accuracy": 0.3062905574717173,
1395
+ "num_tokens": 142686567.0,
1396
+ "step": 1540
1397
+ },
1398
+ {
1399
+ "epoch": 3.0273037542662116,
1400
+ "grad_norm": 0.38934579491615295,
1401
+ "learning_rate": 1.3949218750000002e-05,
1402
+ "loss": 4.2288,
1403
+ "mean_token_accuracy": 0.30576241165399554,
1404
+ "num_tokens": 143605470.0,
1405
+ "step": 1550
1406
+ },
1407
+ {
1408
+ "epoch": 3.046806435884934,
1409
+ "grad_norm": 0.4673042893409729,
1410
+ "learning_rate": 1.3910156250000001e-05,
1411
+ "loss": 4.2274,
1412
+ "mean_token_accuracy": 0.30559116452932356,
1413
+ "num_tokens": 144523275.0,
1414
+ "step": 1560
1415
+ },
1416
+ {
1417
+ "epoch": 3.066309117503657,
1418
+ "grad_norm": 0.39577770233154297,
1419
+ "learning_rate": 1.3871093750000001e-05,
1420
+ "loss": 4.2425,
1421
+ "mean_token_accuracy": 0.30391779616475106,
1422
+ "num_tokens": 145444439.0,
1423
+ "step": 1570
1424
+ },
1425
+ {
1426
+ "epoch": 3.0858117991223795,
1427
+ "grad_norm": 0.3524993658065796,
1428
+ "learning_rate": 1.3832031250000001e-05,
1429
+ "loss": 4.2118,
1430
+ "mean_token_accuracy": 0.3058738835155964,
1431
+ "num_tokens": 146378772.0,
1432
+ "step": 1580
1433
+ },
1434
+ {
1435
+ "epoch": 3.105314480741102,
1436
+ "grad_norm": 0.5424984097480774,
1437
+ "learning_rate": 1.3792968750000003e-05,
1438
+ "loss": 4.2163,
1439
+ "mean_token_accuracy": 0.3056952103972435,
1440
+ "num_tokens": 147320678.0,
1441
+ "step": 1590
1442
+ },
1443
+ {
1444
+ "epoch": 3.1248171623598244,
1445
+ "grad_norm": 0.35715430974960327,
1446
+ "learning_rate": 1.3753906250000003e-05,
1447
+ "loss": 4.1736,
1448
+ "mean_token_accuracy": 0.30897570848464967,
1449
+ "num_tokens": 148231165.0,
1450
+ "step": 1600
1451
+ },
1452
+ {
1453
+ "epoch": 3.144319843978547,
1454
+ "grad_norm": 0.42818182706832886,
1455
+ "learning_rate": 1.3714843750000003e-05,
1456
+ "loss": 4.2168,
1457
+ "mean_token_accuracy": 0.30649841353297236,
1458
+ "num_tokens": 149164808.0,
1459
+ "step": 1610
1460
+ },
1461
+ {
1462
+ "epoch": 3.1638225255972694,
1463
+ "grad_norm": 1.06892728805542,
1464
+ "learning_rate": 1.3675781250000002e-05,
1465
+ "loss": 4.1984,
1466
+ "mean_token_accuracy": 0.3070691518485546,
1467
+ "num_tokens": 150098120.0,
1468
+ "step": 1620
1469
+ },
1470
+ {
1471
+ "epoch": 3.1833252072159923,
1472
+ "grad_norm": 0.3775452971458435,
1473
+ "learning_rate": 1.3636718750000002e-05,
1474
+ "loss": 4.2151,
1475
+ "mean_token_accuracy": 0.3055191844701767,
1476
+ "num_tokens": 151024766.0,
1477
+ "step": 1630
1478
+ },
1479
+ {
1480
+ "epoch": 3.202827888834715,
1481
+ "grad_norm": 0.4792298674583435,
1482
+ "learning_rate": 1.3597656250000002e-05,
1483
+ "loss": 4.2077,
1484
+ "mean_token_accuracy": 0.30750301480293274,
1485
+ "num_tokens": 151941554.0,
1486
+ "step": 1640
1487
+ },
1488
+ {
1489
+ "epoch": 3.2223305704534373,
1490
+ "grad_norm": 0.9628048539161682,
1491
+ "learning_rate": 1.3558593750000002e-05,
1492
+ "loss": 4.1879,
1493
+ "mean_token_accuracy": 0.30834688916802405,
1494
+ "num_tokens": 152873143.0,
1495
+ "step": 1650
1496
+ },
1497
+ {
1498
+ "epoch": 3.2418332520721598,
1499
+ "grad_norm": 0.4353286027908325,
1500
+ "learning_rate": 1.3519531250000002e-05,
1501
+ "loss": 4.1972,
1502
+ "mean_token_accuracy": 0.30848201364278793,
1503
+ "num_tokens": 153799524.0,
1504
+ "step": 1660
1505
+ },
1506
+ {
1507
+ "epoch": 3.2613359336908827,
1508
+ "grad_norm": 0.47365450859069824,
1509
+ "learning_rate": 1.3480468750000002e-05,
1510
+ "loss": 4.1959,
1511
+ "mean_token_accuracy": 0.30711600482463836,
1512
+ "num_tokens": 154737404.0,
1513
+ "step": 1670
1514
+ },
1515
+ {
1516
+ "epoch": 3.280838615309605,
1517
+ "grad_norm": 0.507435142993927,
1518
+ "learning_rate": 1.3441406250000002e-05,
1519
+ "loss": 4.1875,
1520
+ "mean_token_accuracy": 0.3079258047044277,
1521
+ "num_tokens": 155669573.0,
1522
+ "step": 1680
1523
+ },
1524
+ {
1525
+ "epoch": 3.3003412969283277,
1526
+ "grad_norm": 0.3681463897228241,
1527
+ "learning_rate": 1.3402343750000002e-05,
1528
+ "loss": 4.2108,
1529
+ "mean_token_accuracy": 0.3069092735648155,
1530
+ "num_tokens": 156608562.0,
1531
+ "step": 1690
1532
+ },
1533
+ {
1534
+ "epoch": 3.31984397854705,
1535
+ "grad_norm": 0.3918771743774414,
1536
+ "learning_rate": 1.3363281250000002e-05,
1537
+ "loss": 4.1608,
1538
+ "mean_token_accuracy": 0.3113169133663177,
1539
+ "num_tokens": 157520485.0,
1540
+ "step": 1700
1541
+ },
1542
+ {
1543
+ "epoch": 3.3393466601657726,
1544
+ "grad_norm": 0.4225058853626251,
1545
+ "learning_rate": 1.3324218750000002e-05,
1546
+ "loss": 4.204,
1547
+ "mean_token_accuracy": 0.30721485018730166,
1548
+ "num_tokens": 158451388.0,
1549
+ "step": 1710
1550
+ },
1551
+ {
1552
+ "epoch": 3.358849341784495,
1553
+ "grad_norm": 0.35778412222862244,
1554
+ "learning_rate": 1.3285156250000001e-05,
1555
+ "loss": 4.1689,
1556
+ "mean_token_accuracy": 0.3099173367023468,
1557
+ "num_tokens": 159377651.0,
1558
+ "step": 1720
1559
+ },
1560
+ {
1561
+ "epoch": 3.378352023403218,
1562
+ "grad_norm": 2.289072275161743,
1563
+ "learning_rate": 1.3246093750000001e-05,
1564
+ "loss": 4.1889,
1565
+ "mean_token_accuracy": 0.30997600927948954,
1566
+ "num_tokens": 160304853.0,
1567
+ "step": 1730
1568
+ },
1569
+ {
1570
+ "epoch": 3.3978547050219405,
1571
+ "grad_norm": 1.1738953590393066,
1572
+ "learning_rate": 1.3207031250000001e-05,
1573
+ "loss": 4.1907,
1574
+ "mean_token_accuracy": 0.3083472929894924,
1575
+ "num_tokens": 161230300.0,
1576
+ "step": 1740
1577
+ },
1578
+ {
1579
+ "epoch": 3.417357386640663,
1580
+ "grad_norm": 0.35479751229286194,
1581
+ "learning_rate": 1.3167968750000001e-05,
1582
+ "loss": 4.2018,
1583
+ "mean_token_accuracy": 0.3078498594462872,
1584
+ "num_tokens": 162156769.0,
1585
+ "step": 1750
1586
+ },
1587
+ {
1588
+ "epoch": 3.4368600682593855,
1589
+ "grad_norm": 0.5815494060516357,
1590
+ "learning_rate": 1.3128906250000003e-05,
1591
+ "loss": 4.1843,
1592
+ "mean_token_accuracy": 0.3095716036856174,
1593
+ "num_tokens": 163073001.0,
1594
+ "step": 1760
1595
+ },
1596
+ {
1597
+ "epoch": 3.4563627498781084,
1598
+ "grad_norm": 0.5193825364112854,
1599
+ "learning_rate": 1.3089843750000003e-05,
1600
+ "loss": 4.1644,
1601
+ "mean_token_accuracy": 0.3114980049431324,
1602
+ "num_tokens": 164000019.0,
1603
+ "step": 1770
1604
+ },
1605
+ {
1606
+ "epoch": 3.475865431496831,
1607
+ "grad_norm": 0.5195266604423523,
1608
+ "learning_rate": 1.3050781250000003e-05,
1609
+ "loss": 4.174,
1610
+ "mean_token_accuracy": 0.30980290472507477,
1611
+ "num_tokens": 164932749.0,
1612
+ "step": 1780
1613
+ },
1614
+ {
1615
+ "epoch": 3.4953681131155534,
1616
+ "grad_norm": 0.4400602877140045,
1617
+ "learning_rate": 1.3011718750000002e-05,
1618
+ "loss": 4.1615,
1619
+ "mean_token_accuracy": 0.31133070439100263,
1620
+ "num_tokens": 165849595.0,
1621
+ "step": 1790
1622
+ },
1623
+ {
1624
+ "epoch": 3.514870794734276,
1625
+ "grad_norm": 0.3400901257991791,
1626
+ "learning_rate": 1.2972656250000002e-05,
1627
+ "loss": 4.1662,
1628
+ "mean_token_accuracy": 0.3104513093829155,
1629
+ "num_tokens": 166773538.0,
1630
+ "step": 1800
1631
+ },
1632
+ {
1633
+ "epoch": 3.5343734763529984,
1634
+ "grad_norm": 0.6036484837532043,
1635
+ "learning_rate": 1.2933593750000002e-05,
1636
+ "loss": 4.1792,
1637
+ "mean_token_accuracy": 0.31103473380208013,
1638
+ "num_tokens": 167704179.0,
1639
+ "step": 1810
1640
+ },
1641
+ {
1642
+ "epoch": 3.553876157971721,
1643
+ "grad_norm": 1.2540099620819092,
1644
+ "learning_rate": 1.2894531250000002e-05,
1645
+ "loss": 4.1385,
1646
+ "mean_token_accuracy": 0.3131691038608551,
1647
+ "num_tokens": 168631611.0,
1648
+ "step": 1820
1649
+ },
1650
+ {
1651
+ "epoch": 3.573378839590444,
1652
+ "grad_norm": 0.38774392008781433,
1653
+ "learning_rate": 1.2855468750000002e-05,
1654
+ "loss": 4.1689,
1655
+ "mean_token_accuracy": 0.3113840945065022,
1656
+ "num_tokens": 169544507.0,
1657
+ "step": 1830
1658
+ },
1659
+ {
1660
+ "epoch": 3.5928815212091663,
1661
+ "grad_norm": 0.465732216835022,
1662
+ "learning_rate": 1.2816406250000002e-05,
1663
+ "loss": 4.1749,
1664
+ "mean_token_accuracy": 0.31010042652487757,
1665
+ "num_tokens": 170472995.0,
1666
+ "step": 1840
1667
+ },
1668
+ {
1669
+ "epoch": 3.6123842028278887,
1670
+ "grad_norm": 0.471019983291626,
1671
+ "learning_rate": 1.2777343750000002e-05,
1672
+ "loss": 4.1695,
1673
+ "mean_token_accuracy": 0.3110612273216248,
1674
+ "num_tokens": 171402715.0,
1675
+ "step": 1850
1676
+ },
1677
+ {
1678
+ "epoch": 3.6318868844466117,
1679
+ "grad_norm": 0.39528539776802063,
1680
+ "learning_rate": 1.2738281250000002e-05,
1681
+ "loss": 4.1473,
1682
+ "mean_token_accuracy": 0.3129354894161224,
1683
+ "num_tokens": 172328519.0,
1684
+ "step": 1860
1685
+ },
1686
+ {
1687
+ "epoch": 3.651389566065334,
1688
+ "grad_norm": 0.5567691922187805,
1689
+ "learning_rate": 1.2699218750000002e-05,
1690
+ "loss": 4.1365,
1691
+ "mean_token_accuracy": 0.3130590170621872,
1692
+ "num_tokens": 173254781.0,
1693
+ "step": 1870
1694
+ },
1695
+ {
1696
+ "epoch": 3.6708922476840566,
1697
+ "grad_norm": 0.49460193514823914,
1698
+ "learning_rate": 1.2660156250000002e-05,
1699
+ "loss": 4.141,
1700
+ "mean_token_accuracy": 0.3134758062660694,
1701
+ "num_tokens": 174184953.0,
1702
+ "step": 1880
1703
+ },
1704
+ {
1705
+ "epoch": 3.690394929302779,
1706
+ "grad_norm": 0.3996962010860443,
1707
+ "learning_rate": 1.2621093750000001e-05,
1708
+ "loss": 4.1586,
1709
+ "mean_token_accuracy": 0.31323386654257773,
1710
+ "num_tokens": 175108592.0,
1711
+ "step": 1890
1712
+ },
1713
+ {
1714
+ "epoch": 3.7098976109215016,
1715
+ "grad_norm": 0.3824934959411621,
1716
+ "learning_rate": 1.2582031250000001e-05,
1717
+ "loss": 4.1385,
1718
+ "mean_token_accuracy": 0.3139244385063648,
1719
+ "num_tokens": 176037993.0,
1720
+ "step": 1900
1721
+ },
1722
+ {
1723
+ "epoch": 3.729400292540224,
1724
+ "grad_norm": 0.4332631528377533,
1725
+ "learning_rate": 1.2542968750000001e-05,
1726
+ "loss": 4.157,
1727
+ "mean_token_accuracy": 0.31349849998950957,
1728
+ "num_tokens": 176962331.0,
1729
+ "step": 1910
1730
+ },
1731
+ {
1732
+ "epoch": 3.748902974158947,
1733
+ "grad_norm": 0.6383605003356934,
1734
+ "learning_rate": 1.2503906250000001e-05,
1735
+ "loss": 4.1621,
1736
+ "mean_token_accuracy": 0.31250465139746664,
1737
+ "num_tokens": 177887507.0,
1738
+ "step": 1920
1739
+ },
1740
+ {
1741
+ "epoch": 3.7684056557776695,
1742
+ "grad_norm": 0.4531135559082031,
1743
+ "learning_rate": 1.2464843750000003e-05,
1744
+ "loss": 4.1326,
1745
+ "mean_token_accuracy": 0.31330428943037986,
1746
+ "num_tokens": 178819985.0,
1747
+ "step": 1930
1748
+ },
1749
+ {
1750
+ "epoch": 3.787908337396392,
1751
+ "grad_norm": 0.40584367513656616,
1752
+ "learning_rate": 1.2425781250000003e-05,
1753
+ "loss": 4.1467,
1754
+ "mean_token_accuracy": 0.3128244742751122,
1755
+ "num_tokens": 179755943.0,
1756
+ "step": 1940
1757
+ },
1758
+ {
1759
+ "epoch": 3.8074110190151145,
1760
+ "grad_norm": 0.3366943597793579,
1761
+ "learning_rate": 1.2386718750000003e-05,
1762
+ "loss": 4.153,
1763
+ "mean_token_accuracy": 0.31367711797356607,
1764
+ "num_tokens": 180690051.0,
1765
+ "step": 1950
1766
+ },
1767
+ {
1768
+ "epoch": 3.8269137006338374,
1769
+ "grad_norm": 0.47235119342803955,
1770
+ "learning_rate": 1.2347656250000002e-05,
1771
+ "loss": 4.1359,
1772
+ "mean_token_accuracy": 0.31598555818200114,
1773
+ "num_tokens": 181612029.0,
1774
+ "step": 1960
1775
+ },
1776
+ {
1777
+ "epoch": 3.84641638225256,
1778
+ "grad_norm": 0.8654286861419678,
1779
+ "learning_rate": 1.2308593750000002e-05,
1780
+ "loss": 4.1453,
1781
+ "mean_token_accuracy": 0.3134163662791252,
1782
+ "num_tokens": 182526847.0,
1783
+ "step": 1970
1784
+ },
1785
+ {
1786
+ "epoch": 3.8659190638712824,
1787
+ "grad_norm": 0.43307119607925415,
1788
+ "learning_rate": 1.2269531250000002e-05,
1789
+ "loss": 4.1145,
1790
+ "mean_token_accuracy": 0.3158551573753357,
1791
+ "num_tokens": 183462915.0,
1792
+ "step": 1980
1793
+ },
1794
+ {
1795
+ "epoch": 3.885421745490005,
1796
+ "grad_norm": 0.6477943062782288,
1797
+ "learning_rate": 1.2230468750000002e-05,
1798
+ "loss": 4.1241,
1799
+ "mean_token_accuracy": 0.3159909948706627,
1800
+ "num_tokens": 184385672.0,
1801
+ "step": 1990
1802
+ },
1803
+ {
1804
+ "epoch": 3.9049244271087273,
1805
+ "grad_norm": 0.38260239362716675,
1806
+ "learning_rate": 1.2191406250000002e-05,
1807
+ "loss": 4.1452,
1808
+ "mean_token_accuracy": 0.3129440575838089,
1809
+ "num_tokens": 185321368.0,
1810
+ "step": 2000
1811
+ },
1812
+ {
1813
+ "epoch": 3.92442710872745,
1814
+ "grad_norm": 0.47935089468955994,
1815
+ "learning_rate": 1.2152343750000002e-05,
1816
+ "loss": 4.1194,
1817
+ "mean_token_accuracy": 0.3149518817663193,
1818
+ "num_tokens": 186242758.0,
1819
+ "step": 2010
1820
+ },
1821
+ {
1822
+ "epoch": 3.9439297903461727,
1823
+ "grad_norm": 0.42702531814575195,
1824
+ "learning_rate": 1.2113281250000002e-05,
1825
+ "loss": 4.1037,
1826
+ "mean_token_accuracy": 0.3160653866827488,
1827
+ "num_tokens": 187173397.0,
1828
+ "step": 2020
1829
+ },
1830
+ {
1831
+ "epoch": 3.9634324719648952,
1832
+ "grad_norm": 0.367144912481308,
1833
+ "learning_rate": 1.2074218750000002e-05,
1834
+ "loss": 4.1208,
1835
+ "mean_token_accuracy": 0.31613789275288584,
1836
+ "num_tokens": 188080952.0,
1837
+ "step": 2030
1838
+ },
1839
+ {
1840
+ "epoch": 3.9829351535836177,
1841
+ "grad_norm": 0.4893103837966919,
1842
+ "learning_rate": 1.2035156250000002e-05,
1843
+ "loss": 4.1124,
1844
+ "mean_token_accuracy": 0.31543073803186417,
1845
+ "num_tokens": 189003783.0,
1846
+ "step": 2040
1847
+ },
1848
+ {
1849
+ "epoch": 4.003900536323744,
1850
+ "grad_norm": 0.4504469633102417,
1851
+ "learning_rate": 1.1996093750000001e-05,
1852
+ "loss": 4.528,
1853
+ "mean_token_accuracy": 0.31533794650217384,
1854
+ "num_tokens": 189955319.0,
1855
+ "step": 2050
1856
+ },
1857
+ {
1858
+ "epoch": 4.023403217942467,
1859
+ "grad_norm": 0.3595225214958191,
1860
+ "learning_rate": 1.1957031250000001e-05,
1861
+ "loss": 4.1133,
1862
+ "mean_token_accuracy": 0.31596027612686156,
1863
+ "num_tokens": 190888628.0,
1864
+ "step": 2060
1865
+ },
1866
+ {
1867
+ "epoch": 4.042905899561189,
1868
+ "grad_norm": 1.716142177581787,
1869
+ "learning_rate": 1.1917968750000001e-05,
1870
+ "loss": 4.1355,
1871
+ "mean_token_accuracy": 0.3142480179667473,
1872
+ "num_tokens": 191820311.0,
1873
+ "step": 2070
1874
+ },
1875
+ {
1876
+ "epoch": 4.062408581179913,
1877
+ "grad_norm": 0.6082829236984253,
1878
+ "learning_rate": 1.1878906250000001e-05,
1879
+ "loss": 4.1022,
1880
+ "mean_token_accuracy": 0.31689164936542513,
1881
+ "num_tokens": 192743910.0,
1882
+ "step": 2080
1883
+ },
1884
+ {
1885
+ "epoch": 4.081911262798635,
1886
+ "grad_norm": 0.7768521904945374,
1887
+ "learning_rate": 1.1839843750000001e-05,
1888
+ "loss": 4.0982,
1889
+ "mean_token_accuracy": 0.31711534410715103,
1890
+ "num_tokens": 193668772.0,
1891
+ "step": 2090
1892
+ },
1893
+ {
1894
+ "epoch": 4.101413944417358,
1895
+ "grad_norm": 0.3628576695919037,
1896
+ "learning_rate": 1.1800781250000003e-05,
1897
+ "loss": 4.0729,
1898
+ "mean_token_accuracy": 0.319500220566988,
1899
+ "num_tokens": 194595536.0,
1900
+ "step": 2100
1901
+ },
1902
+ {
1903
+ "epoch": 4.12091662603608,
1904
+ "grad_norm": 0.44966161251068115,
1905
+ "learning_rate": 1.1761718750000003e-05,
1906
+ "loss": 4.1226,
1907
+ "mean_token_accuracy": 0.31548903733491895,
1908
+ "num_tokens": 195521123.0,
1909
+ "step": 2110
1910
+ },
1911
+ {
1912
+ "epoch": 4.140419307654803,
1913
+ "grad_norm": 0.3710924983024597,
1914
+ "learning_rate": 1.1722656250000002e-05,
1915
+ "loss": 4.0965,
1916
+ "mean_token_accuracy": 0.31763359606266023,
1917
+ "num_tokens": 196446271.0,
1918
+ "step": 2120
1919
+ },
1920
+ {
1921
+ "epoch": 4.159921989273525,
1922
+ "grad_norm": 0.344722718000412,
1923
+ "learning_rate": 1.1683593750000002e-05,
1924
+ "loss": 4.0912,
1925
+ "mean_token_accuracy": 0.319039486348629,
1926
+ "num_tokens": 197363695.0,
1927
+ "step": 2130
1928
+ },
1929
+ {
1930
+ "epoch": 4.179424670892248,
1931
+ "grad_norm": 0.9052795171737671,
1932
+ "learning_rate": 1.1644531250000002e-05,
1933
+ "loss": 4.0975,
1934
+ "mean_token_accuracy": 0.3183410093188286,
1935
+ "num_tokens": 198298478.0,
1936
+ "step": 2140
1937
+ },
1938
+ {
1939
+ "epoch": 4.19892735251097,
1940
+ "grad_norm": 0.4249030649662018,
1941
+ "learning_rate": 1.1605468750000002e-05,
1942
+ "loss": 4.1177,
1943
+ "mean_token_accuracy": 0.3169899210333824,
1944
+ "num_tokens": 199229400.0,
1945
+ "step": 2150
1946
+ },
1947
+ {
1948
+ "epoch": 4.2184300341296925,
1949
+ "grad_norm": 0.7394511103630066,
1950
+ "learning_rate": 1.1566406250000002e-05,
1951
+ "loss": 4.1101,
1952
+ "mean_token_accuracy": 0.31670680120587347,
1953
+ "num_tokens": 200158230.0,
1954
+ "step": 2160
1955
+ },
1956
+ {
1957
+ "epoch": 4.237932715748415,
1958
+ "grad_norm": 0.4374152421951294,
1959
+ "learning_rate": 1.1527343750000002e-05,
1960
+ "loss": 4.101,
1961
+ "mean_token_accuracy": 0.3186027079820633,
1962
+ "num_tokens": 201088196.0,
1963
+ "step": 2170
1964
+ },
1965
+ {
1966
+ "epoch": 4.257435397367138,
1967
+ "grad_norm": 0.36458227038383484,
1968
+ "learning_rate": 1.1488281250000002e-05,
1969
+ "loss": 4.0918,
1970
+ "mean_token_accuracy": 0.31778610348701475,
1971
+ "num_tokens": 202028416.0,
1972
+ "step": 2180
1973
+ },
1974
+ {
1975
+ "epoch": 4.276938078985861,
1976
+ "grad_norm": 0.47918209433555603,
1977
+ "learning_rate": 1.1449218750000002e-05,
1978
+ "loss": 4.103,
1979
+ "mean_token_accuracy": 0.3175153359770775,
1980
+ "num_tokens": 202958008.0,
1981
+ "step": 2190
1982
+ },
1983
+ {
1984
+ "epoch": 4.296440760604583,
1985
+ "grad_norm": 0.5565679669380188,
1986
+ "learning_rate": 1.1410156250000002e-05,
1987
+ "loss": 4.0695,
1988
+ "mean_token_accuracy": 0.32034494280815123,
1989
+ "num_tokens": 203881344.0,
1990
+ "step": 2200
1991
+ },
1992
+ {
1993
+ "epoch": 4.315943442223306,
1994
+ "grad_norm": 0.3992047607898712,
1995
+ "learning_rate": 1.1371093750000002e-05,
1996
+ "loss": 4.0912,
1997
+ "mean_token_accuracy": 0.31856209337711333,
1998
+ "num_tokens": 204808785.0,
1999
+ "step": 2210
2000
+ },
2001
+ {
2002
+ "epoch": 4.335446123842028,
2003
+ "grad_norm": 0.34964555501937866,
2004
+ "learning_rate": 1.1332031250000001e-05,
2005
+ "loss": 4.1028,
2006
+ "mean_token_accuracy": 0.3173605494201183,
2007
+ "num_tokens": 205730997.0,
2008
+ "step": 2220
2009
+ },
2010
+ {
2011
+ "epoch": 4.354948805460751,
2012
+ "grad_norm": 0.5610654950141907,
2013
+ "learning_rate": 1.1292968750000001e-05,
2014
+ "loss": 4.0833,
2015
+ "mean_token_accuracy": 0.32026237323880197,
2016
+ "num_tokens": 206648222.0,
2017
+ "step": 2230
2018
+ },
2019
+ {
2020
+ "epoch": 4.374451487079473,
2021
+ "grad_norm": 16.436649322509766,
2022
+ "learning_rate": 1.1253906250000001e-05,
2023
+ "loss": 4.0856,
2024
+ "mean_token_accuracy": 0.3189682215452194,
2025
+ "num_tokens": 207575961.0,
2026
+ "step": 2240
2027
+ },
2028
+ {
2029
+ "epoch": 4.393954168698196,
2030
+ "grad_norm": 0.4388623535633087,
2031
+ "learning_rate": 1.1214843750000001e-05,
2032
+ "loss": 4.0944,
2033
+ "mean_token_accuracy": 0.3173427350819111,
2034
+ "num_tokens": 208500246.0,
2035
+ "step": 2250
2036
+ },
2037
+ {
2038
+ "epoch": 4.413456850316918,
2039
+ "grad_norm": 0.35677370429039,
2040
+ "learning_rate": 1.1175781250000001e-05,
2041
+ "loss": 4.0722,
2042
+ "mean_token_accuracy": 0.32106063738465307,
2043
+ "num_tokens": 209423458.0,
2044
+ "step": 2260
2045
+ },
2046
+ {
2047
+ "epoch": 4.432959531935641,
2048
+ "grad_norm": 0.39513856172561646,
2049
+ "learning_rate": 1.1136718750000003e-05,
2050
+ "loss": 4.0538,
2051
+ "mean_token_accuracy": 0.3214304678142071,
2052
+ "num_tokens": 210341440.0,
2053
+ "step": 2270
2054
+ },
2055
+ {
2056
+ "epoch": 4.452462213554364,
2057
+ "grad_norm": 0.4563831686973572,
2058
+ "learning_rate": 1.1097656250000003e-05,
2059
+ "loss": 4.109,
2060
+ "mean_token_accuracy": 0.31732322424650194,
2061
+ "num_tokens": 211281853.0,
2062
+ "step": 2280
2063
+ },
2064
+ {
2065
+ "epoch": 4.471964895173087,
2066
+ "grad_norm": 1.39266836643219,
2067
+ "learning_rate": 1.1058593750000002e-05,
2068
+ "loss": 4.0719,
2069
+ "mean_token_accuracy": 0.31940086409449575,
2070
+ "num_tokens": 212204229.0,
2071
+ "step": 2290
2072
+ },
2073
+ {
2074
+ "epoch": 4.491467576791809,
2075
+ "grad_norm": 0.3986080586910248,
2076
+ "learning_rate": 1.1019531250000002e-05,
2077
+ "loss": 4.1338,
2078
+ "mean_token_accuracy": 0.31560456529259684,
2079
+ "num_tokens": 213145258.0,
2080
+ "step": 2300
2081
+ },
2082
+ {
2083
+ "epoch": 4.510970258410532,
2084
+ "grad_norm": 0.37061166763305664,
2085
+ "learning_rate": 1.0980468750000002e-05,
2086
+ "loss": 4.0439,
2087
+ "mean_token_accuracy": 0.3235855162143707,
2088
+ "num_tokens": 214077815.0,
2089
+ "step": 2310
2090
+ },
2091
+ {
2092
+ "epoch": 4.530472940029254,
2093
+ "grad_norm": 0.4753463566303253,
2094
+ "learning_rate": 1.0941406250000002e-05,
2095
+ "loss": 4.0921,
2096
+ "mean_token_accuracy": 0.31985238641500474,
2097
+ "num_tokens": 215014694.0,
2098
+ "step": 2320
2099
+ },
2100
+ {
2101
+ "epoch": 4.5499756216479765,
2102
+ "grad_norm": 0.3656369745731354,
2103
+ "learning_rate": 1.0902343750000002e-05,
2104
+ "loss": 4.0713,
2105
+ "mean_token_accuracy": 0.3213632293045521,
2106
+ "num_tokens": 215925904.0,
2107
+ "step": 2330
2108
+ },
2109
+ {
2110
+ "epoch": 4.569478303266699,
2111
+ "grad_norm": 0.3658876121044159,
2112
+ "learning_rate": 1.0863281250000002e-05,
2113
+ "loss": 4.0767,
2114
+ "mean_token_accuracy": 0.31990990936756136,
2115
+ "num_tokens": 216858323.0,
2116
+ "step": 2340
2117
+ },
2118
+ {
2119
+ "epoch": 4.5889809848854215,
2120
+ "grad_norm": 0.37420952320098877,
2121
+ "learning_rate": 1.0824218750000002e-05,
2122
+ "loss": 4.055,
2123
+ "mean_token_accuracy": 0.32238853722810745,
2124
+ "num_tokens": 217787155.0,
2125
+ "step": 2350
2126
+ },
2127
+ {
2128
+ "epoch": 4.608483666504144,
2129
+ "grad_norm": 0.4945075809955597,
2130
+ "learning_rate": 1.0785156250000002e-05,
2131
+ "loss": 4.0728,
2132
+ "mean_token_accuracy": 0.31983637139201165,
2133
+ "num_tokens": 218720734.0,
2134
+ "step": 2360
2135
+ },
2136
+ {
2137
+ "epoch": 4.627986348122867,
2138
+ "grad_norm": 0.40318942070007324,
2139
+ "learning_rate": 1.0746093750000002e-05,
2140
+ "loss": 4.0634,
2141
+ "mean_token_accuracy": 0.32157402858138084,
2142
+ "num_tokens": 219638841.0,
2143
+ "step": 2370
2144
+ },
2145
+ {
2146
+ "epoch": 4.64748902974159,
2147
+ "grad_norm": 0.3799983263015747,
2148
+ "learning_rate": 1.0707031250000001e-05,
2149
+ "loss": 4.0593,
2150
+ "mean_token_accuracy": 0.3221892111003399,
2151
+ "num_tokens": 220557146.0,
2152
+ "step": 2380
2153
+ },
2154
+ {
2155
+ "epoch": 4.666991711360312,
2156
+ "grad_norm": 0.6413635611534119,
2157
+ "learning_rate": 1.0667968750000001e-05,
2158
+ "loss": 4.0611,
2159
+ "mean_token_accuracy": 0.32295678928494453,
2160
+ "num_tokens": 221477704.0,
2161
+ "step": 2390
2162
+ },
2163
+ {
2164
+ "epoch": 4.686494392979035,
2165
+ "grad_norm": 0.3513067066669464,
2166
+ "learning_rate": 1.0628906250000001e-05,
2167
+ "loss": 4.0467,
2168
+ "mean_token_accuracy": 0.32364632859826087,
2169
+ "num_tokens": 222386399.0,
2170
+ "step": 2400
2171
+ },
2172
+ {
2173
+ "epoch": 4.705997074597757,
2174
+ "grad_norm": 0.5278533101081848,
2175
+ "learning_rate": 1.0589843750000001e-05,
2176
+ "loss": 4.0758,
2177
+ "mean_token_accuracy": 0.3208088092505932,
2178
+ "num_tokens": 223316991.0,
2179
+ "step": 2410
2180
+ },
2181
+ {
2182
+ "epoch": 4.72549975621648,
2183
+ "grad_norm": 0.39425331354141235,
2184
+ "learning_rate": 1.0550781250000001e-05,
2185
+ "loss": 4.0678,
2186
+ "mean_token_accuracy": 0.32063120454549787,
2187
+ "num_tokens": 224243157.0,
2188
+ "step": 2420
2189
+ },
2190
+ {
2191
+ "epoch": 4.745002437835202,
2192
+ "grad_norm": 0.6337940096855164,
2193
+ "learning_rate": 1.0511718750000001e-05,
2194
+ "loss": 4.0704,
2195
+ "mean_token_accuracy": 0.32208378091454504,
2196
+ "num_tokens": 225166510.0,
2197
+ "step": 2430
2198
+ },
2199
+ {
2200
+ "epoch": 4.764505119453925,
2201
+ "grad_norm": 0.3218667507171631,
2202
+ "learning_rate": 1.0472656250000003e-05,
2203
+ "loss": 4.0344,
2204
+ "mean_token_accuracy": 0.3255066409707069,
2205
+ "num_tokens": 226083020.0,
2206
+ "step": 2440
2207
+ },
2208
+ {
2209
+ "epoch": 4.784007801072647,
2210
+ "grad_norm": 0.32973775267601013,
2211
+ "learning_rate": 1.0433593750000002e-05,
2212
+ "loss": 4.0347,
2213
+ "mean_token_accuracy": 0.32414417788386346,
2214
+ "num_tokens": 227011065.0,
2215
+ "step": 2450
2216
+ },
2217
+ {
2218
+ "epoch": 4.80351048269137,
2219
+ "grad_norm": 0.6606370806694031,
2220
+ "learning_rate": 1.0394531250000002e-05,
2221
+ "loss": 4.0718,
2222
+ "mean_token_accuracy": 0.3219029247760773,
2223
+ "num_tokens": 227936157.0,
2224
+ "step": 2460
2225
+ },
2226
+ {
2227
+ "epoch": 4.823013164310092,
2228
+ "grad_norm": 0.39825528860092163,
2229
+ "learning_rate": 1.0355468750000002e-05,
2230
+ "loss": 4.035,
2231
+ "mean_token_accuracy": 0.32509928196668625,
2232
+ "num_tokens": 228856183.0,
2233
+ "step": 2470
2234
+ },
2235
+ {
2236
+ "epoch": 4.842515845928816,
2237
+ "grad_norm": 0.46268683671951294,
2238
+ "learning_rate": 1.0316406250000002e-05,
2239
+ "loss": 4.0787,
2240
+ "mean_token_accuracy": 0.3220914930105209,
2241
+ "num_tokens": 229775699.0,
2242
+ "step": 2480
2243
+ },
2244
+ {
2245
+ "epoch": 4.862018527547538,
2246
+ "grad_norm": 0.6751087307929993,
2247
+ "learning_rate": 1.0277343750000002e-05,
2248
+ "loss": 4.0714,
2249
+ "mean_token_accuracy": 0.3211475729942322,
2250
+ "num_tokens": 230696333.0,
2251
+ "step": 2490
2252
+ },
2253
+ {
2254
+ "epoch": 4.8815212091662605,
2255
+ "grad_norm": 0.48659786581993103,
2256
+ "learning_rate": 1.0238281250000002e-05,
2257
+ "loss": 4.0464,
2258
+ "mean_token_accuracy": 0.3232453539967537,
2259
+ "num_tokens": 231621188.0,
2260
+ "step": 2500
2261
+ }
2262
+ ],
2263
+ "logging_steps": 10,
2264
+ "max_steps": 5120,
2265
+ "num_input_tokens_seen": 0,
2266
+ "num_train_epochs": 10,
2267
+ "save_steps": 500,
2268
+ "stateful_callbacks": {
2269
+ "TrainerControl": {
2270
+ "args": {
2271
+ "should_epoch_stop": false,
2272
+ "should_evaluate": false,
2273
+ "should_log": false,
2274
+ "should_save": true,
2275
+ "should_training_stop": false
2276
+ },
2277
+ "attributes": {}
2278
+ }
2279
+ },
2280
+ "total_flos": 3.4542278216043725e+17,
2281
+ "train_batch_size": 64,
2282
+ "trial_name": null,
2283
+ "trial_params": null
2284
+ }
checkpoint-2500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:213ef3ed16d9dd20d51f6355dc64c9dc5ebcaf8490efb503d6a15061df366d53
3
+ size 5624
checkpoint-3000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 1024,
14
+ "max_position_embeddings": 256,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 6,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 6,
20
+ "pad_token_id": 0,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.51.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
checkpoint-3000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.51.3"
7
+ }