craa commited on
Commit
8f29760
·
verified ·
1 Parent(s): 4c6532d

Training in progress, step 10000, checkpoint

Browse files
checkpoint-10000/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 768,
14
+ "n_head": 12,
15
+ "n_inner": null,
16
+ "n_layer": 12,
17
+ "n_positions": 1024,
18
+ "reorder_and_upcast_attn": false,
19
+ "resid_pdrop": 0.1,
20
+ "scale_attn_by_inverse_layer_idx": false,
21
+ "scale_attn_weights": true,
22
+ "summary_activation": null,
23
+ "summary_first_dropout": 0.1,
24
+ "summary_proj_to_labels": true,
25
+ "summary_type": "cls_index",
26
+ "summary_use_proj": true,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.47.0.dev0",
29
+ "use_cache": true,
30
+ "vocab_size": 52000
31
+ }
checkpoint-10000/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.47.0.dev0"
6
+ }
checkpoint-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80a1801977af12fd143f00a61374c20613c0d16ca8ffd5c89c1ec9ef109a62a9
3
+ size 503128704
checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:791eba535fa2535b70956dcdc743c57e19bf1d4948dab3230dee16fd5055d638
3
+ size 1006351290
checkpoint-10000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d65e78f44114e21bbfe15593d61a8ef27409a026cbe91ed192a243a5a11f378b
3
+ size 14244
checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4e7893396eb2644b36995e7c0917ee358383aadd367376b940738387a18a156
3
+ size 1064
checkpoint-10000/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
checkpoint-10000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10000/trainer_state.json ADDED
@@ -0,0 +1,1523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 3.762051582336426,
3
+ "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__495/checkpoint-10000",
4
+ "epoch": 1.0781671159029649,
5
+ "eval_steps": 1000,
6
+ "global_step": 10000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005390835579514825,
13
+ "grad_norm": 0.9464320540428162,
14
+ "learning_rate": 0.000276,
15
+ "loss": 8.7571,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.01078167115902965,
20
+ "grad_norm": 1.1756318807601929,
21
+ "learning_rate": 0.0005759999999999999,
22
+ "loss": 7.0078,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.016172506738544475,
27
+ "grad_norm": 1.551442265510559,
28
+ "learning_rate": 0.000599702104695089,
29
+ "loss": 6.6048,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.0215633423180593,
34
+ "grad_norm": 0.7034249901771545,
35
+ "learning_rate": 0.0005993783054506205,
36
+ "loss": 6.3332,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.026954177897574125,
41
+ "grad_norm": 1.0177743434906006,
42
+ "learning_rate": 0.0005990545062061521,
43
+ "loss": 6.1389,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.03234501347708895,
48
+ "grad_norm": 1.3281992673873901,
49
+ "learning_rate": 0.0005987307069616836,
50
+ "loss": 6.0274,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.03773584905660377,
55
+ "grad_norm": 1.3769625425338745,
56
+ "learning_rate": 0.0005984069077172153,
57
+ "loss": 5.9227,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.0431266846361186,
62
+ "grad_norm": 1.3584883213043213,
63
+ "learning_rate": 0.0005980831084727469,
64
+ "loss": 5.8655,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.04851752021563342,
69
+ "grad_norm": 0.9795990586280823,
70
+ "learning_rate": 0.0005977593092282784,
71
+ "loss": 5.78,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.05390835579514825,
76
+ "grad_norm": 1.4081448316574097,
77
+ "learning_rate": 0.00059743550998381,
78
+ "loss": 5.7304,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.05929919137466307,
83
+ "grad_norm": 1.7430726289749146,
84
+ "learning_rate": 0.0005971117107393416,
85
+ "loss": 5.6254,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.0646900269541779,
90
+ "grad_norm": 1.4288103580474854,
91
+ "learning_rate": 0.0005967879114948732,
92
+ "loss": 5.5997,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.07008086253369272,
97
+ "grad_norm": 0.9352820515632629,
98
+ "learning_rate": 0.0005964641122504047,
99
+ "loss": 5.5009,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.07547169811320754,
104
+ "grad_norm": 1.0235956907272339,
105
+ "learning_rate": 0.0005961403130059363,
106
+ "loss": 5.4834,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.08086253369272237,
111
+ "grad_norm": 1.1283656358718872,
112
+ "learning_rate": 0.0005958165137614678,
113
+ "loss": 5.4096,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.0862533692722372,
118
+ "grad_norm": 1.1757662296295166,
119
+ "learning_rate": 0.0005954927145169995,
120
+ "loss": 5.3508,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.09164420485175202,
125
+ "grad_norm": 1.2652894258499146,
126
+ "learning_rate": 0.0005951689152725309,
127
+ "loss": 5.3137,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.09703504043126684,
132
+ "grad_norm": 0.8220955729484558,
133
+ "learning_rate": 0.0005948451160280626,
134
+ "loss": 5.234,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.10242587601078167,
139
+ "grad_norm": 1.0178474187850952,
140
+ "learning_rate": 0.0005945213167835941,
141
+ "loss": 5.2211,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.1078167115902965,
146
+ "grad_norm": 1.0003489255905151,
147
+ "learning_rate": 0.0005941975175391257,
148
+ "loss": 5.1665,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.1078167115902965,
153
+ "eval_accuracy": 0.2206364212520268,
154
+ "eval_loss": 5.09123420715332,
155
+ "eval_runtime": 146.3271,
156
+ "eval_samples_per_second": 123.087,
157
+ "eval_steps_per_second": 7.695,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 0.11320754716981132,
162
+ "grad_norm": 1.0129122734069824,
163
+ "learning_rate": 0.0005938737182946572,
164
+ "loss": 5.1253,
165
+ "step": 1050
166
+ },
167
+ {
168
+ "epoch": 0.11859838274932614,
169
+ "grad_norm": 1.2447277307510376,
170
+ "learning_rate": 0.0005935499190501888,
171
+ "loss": 5.0843,
172
+ "step": 1100
173
+ },
174
+ {
175
+ "epoch": 0.12398921832884097,
176
+ "grad_norm": 1.1244721412658691,
177
+ "learning_rate": 0.0005932261198057204,
178
+ "loss": 5.0632,
179
+ "step": 1150
180
+ },
181
+ {
182
+ "epoch": 0.1293800539083558,
183
+ "grad_norm": 1.0002185106277466,
184
+ "learning_rate": 0.000592902320561252,
185
+ "loss": 5.0409,
186
+ "step": 1200
187
+ },
188
+ {
189
+ "epoch": 0.1347708894878706,
190
+ "grad_norm": 1.237203598022461,
191
+ "learning_rate": 0.0005925785213167835,
192
+ "loss": 5.0158,
193
+ "step": 1250
194
+ },
195
+ {
196
+ "epoch": 0.14016172506738545,
197
+ "grad_norm": 0.9483816623687744,
198
+ "learning_rate": 0.0005922547220723151,
199
+ "loss": 4.9578,
200
+ "step": 1300
201
+ },
202
+ {
203
+ "epoch": 0.14555256064690028,
204
+ "grad_norm": 1.1492619514465332,
205
+ "learning_rate": 0.0005919309228278468,
206
+ "loss": 4.9422,
207
+ "step": 1350
208
+ },
209
+ {
210
+ "epoch": 0.1509433962264151,
211
+ "grad_norm": 0.8103901743888855,
212
+ "learning_rate": 0.0005916071235833783,
213
+ "loss": 4.9161,
214
+ "step": 1400
215
+ },
216
+ {
217
+ "epoch": 0.15633423180592992,
218
+ "grad_norm": 0.9849348068237305,
219
+ "learning_rate": 0.0005912833243389097,
220
+ "loss": 4.8937,
221
+ "step": 1450
222
+ },
223
+ {
224
+ "epoch": 0.16172506738544473,
225
+ "grad_norm": 1.0992136001586914,
226
+ "learning_rate": 0.0005909595250944414,
227
+ "loss": 4.8771,
228
+ "step": 1500
229
+ },
230
+ {
231
+ "epoch": 0.16711590296495957,
232
+ "grad_norm": 0.7955754995346069,
233
+ "learning_rate": 0.000590635725849973,
234
+ "loss": 4.8337,
235
+ "step": 1550
236
+ },
237
+ {
238
+ "epoch": 0.1725067385444744,
239
+ "grad_norm": 0.8510985970497131,
240
+ "learning_rate": 0.0005903119266055045,
241
+ "loss": 4.8487,
242
+ "step": 1600
243
+ },
244
+ {
245
+ "epoch": 0.1778975741239892,
246
+ "grad_norm": 0.8423263430595398,
247
+ "learning_rate": 0.0005899881273610361,
248
+ "loss": 4.8251,
249
+ "step": 1650
250
+ },
251
+ {
252
+ "epoch": 0.18328840970350405,
253
+ "grad_norm": 0.8307051658630371,
254
+ "learning_rate": 0.0005896643281165677,
255
+ "loss": 4.7528,
256
+ "step": 1700
257
+ },
258
+ {
259
+ "epoch": 0.18867924528301888,
260
+ "grad_norm": 1.171060562133789,
261
+ "learning_rate": 0.0005893405288720993,
262
+ "loss": 4.7617,
263
+ "step": 1750
264
+ },
265
+ {
266
+ "epoch": 0.1940700808625337,
267
+ "grad_norm": 1.0469765663146973,
268
+ "learning_rate": 0.0005890167296276308,
269
+ "loss": 4.7597,
270
+ "step": 1800
271
+ },
272
+ {
273
+ "epoch": 0.19946091644204852,
274
+ "grad_norm": 0.8637204170227051,
275
+ "learning_rate": 0.0005886929303831624,
276
+ "loss": 4.7128,
277
+ "step": 1850
278
+ },
279
+ {
280
+ "epoch": 0.20485175202156333,
281
+ "grad_norm": 0.9173099398612976,
282
+ "learning_rate": 0.0005883691311386939,
283
+ "loss": 4.7067,
284
+ "step": 1900
285
+ },
286
+ {
287
+ "epoch": 0.21024258760107817,
288
+ "grad_norm": 0.9003922343254089,
289
+ "learning_rate": 0.0005880453318942256,
290
+ "loss": 4.6783,
291
+ "step": 1950
292
+ },
293
+ {
294
+ "epoch": 0.215633423180593,
295
+ "grad_norm": 0.9870163798332214,
296
+ "learning_rate": 0.0005877215326497571,
297
+ "loss": 4.6555,
298
+ "step": 2000
299
+ },
300
+ {
301
+ "epoch": 0.215633423180593,
302
+ "eval_accuracy": 0.260708366848222,
303
+ "eval_loss": 4.6007513999938965,
304
+ "eval_runtime": 144.747,
305
+ "eval_samples_per_second": 124.431,
306
+ "eval_steps_per_second": 7.779,
307
+ "step": 2000
308
+ },
309
+ {
310
+ "epoch": 0.2210242587601078,
311
+ "grad_norm": 0.8778396248817444,
312
+ "learning_rate": 0.0005873977334052887,
313
+ "loss": 4.6696,
314
+ "step": 2050
315
+ },
316
+ {
317
+ "epoch": 0.22641509433962265,
318
+ "grad_norm": 0.8375086188316345,
319
+ "learning_rate": 0.0005870739341608202,
320
+ "loss": 4.6395,
321
+ "step": 2100
322
+ },
323
+ {
324
+ "epoch": 0.23180592991913745,
325
+ "grad_norm": 0.8078502416610718,
326
+ "learning_rate": 0.0005867501349163519,
327
+ "loss": 4.5823,
328
+ "step": 2150
329
+ },
330
+ {
331
+ "epoch": 0.2371967654986523,
332
+ "grad_norm": 0.8603857755661011,
333
+ "learning_rate": 0.0005864263356718833,
334
+ "loss": 4.6175,
335
+ "step": 2200
336
+ },
337
+ {
338
+ "epoch": 0.24258760107816713,
339
+ "grad_norm": 0.8307099342346191,
340
+ "learning_rate": 0.000586102536427415,
341
+ "loss": 4.5541,
342
+ "step": 2250
343
+ },
344
+ {
345
+ "epoch": 0.24797843665768193,
346
+ "grad_norm": 1.0236228704452515,
347
+ "learning_rate": 0.0005857787371829465,
348
+ "loss": 4.5368,
349
+ "step": 2300
350
+ },
351
+ {
352
+ "epoch": 0.25336927223719674,
353
+ "grad_norm": 0.9307533502578735,
354
+ "learning_rate": 0.0005854549379384781,
355
+ "loss": 4.5326,
356
+ "step": 2350
357
+ },
358
+ {
359
+ "epoch": 0.2587601078167116,
360
+ "grad_norm": 1.4191182851791382,
361
+ "learning_rate": 0.0005851311386940096,
362
+ "loss": 4.5253,
363
+ "step": 2400
364
+ },
365
+ {
366
+ "epoch": 0.2641509433962264,
367
+ "grad_norm": 1.015572428703308,
368
+ "learning_rate": 0.0005848073394495412,
369
+ "loss": 4.5272,
370
+ "step": 2450
371
+ },
372
+ {
373
+ "epoch": 0.2695417789757412,
374
+ "grad_norm": 0.8903120756149292,
375
+ "learning_rate": 0.0005844835402050728,
376
+ "loss": 4.5219,
377
+ "step": 2500
378
+ },
379
+ {
380
+ "epoch": 0.2749326145552561,
381
+ "grad_norm": 1.0123368501663208,
382
+ "learning_rate": 0.0005841597409606044,
383
+ "loss": 4.4772,
384
+ "step": 2550
385
+ },
386
+ {
387
+ "epoch": 0.2803234501347709,
388
+ "grad_norm": 0.925151526927948,
389
+ "learning_rate": 0.000583835941716136,
390
+ "loss": 4.4624,
391
+ "step": 2600
392
+ },
393
+ {
394
+ "epoch": 0.2857142857142857,
395
+ "grad_norm": 1.1478705406188965,
396
+ "learning_rate": 0.0005835121424716675,
397
+ "loss": 4.4605,
398
+ "step": 2650
399
+ },
400
+ {
401
+ "epoch": 0.29110512129380056,
402
+ "grad_norm": 1.0130943059921265,
403
+ "learning_rate": 0.0005831883432271992,
404
+ "loss": 4.4529,
405
+ "step": 2700
406
+ },
407
+ {
408
+ "epoch": 0.29649595687331537,
409
+ "grad_norm": 0.991671621799469,
410
+ "learning_rate": 0.0005828645439827307,
411
+ "loss": 4.4101,
412
+ "step": 2750
413
+ },
414
+ {
415
+ "epoch": 0.3018867924528302,
416
+ "grad_norm": 0.8699747323989868,
417
+ "learning_rate": 0.0005825407447382622,
418
+ "loss": 4.4505,
419
+ "step": 2800
420
+ },
421
+ {
422
+ "epoch": 0.30727762803234504,
423
+ "grad_norm": 1.0381041765213013,
424
+ "learning_rate": 0.0005822169454937938,
425
+ "loss": 4.4171,
426
+ "step": 2850
427
+ },
428
+ {
429
+ "epoch": 0.31266846361185985,
430
+ "grad_norm": 1.0296149253845215,
431
+ "learning_rate": 0.0005818931462493254,
432
+ "loss": 4.4014,
433
+ "step": 2900
434
+ },
435
+ {
436
+ "epoch": 0.31805929919137466,
437
+ "grad_norm": 0.9270951747894287,
438
+ "learning_rate": 0.0005815693470048569,
439
+ "loss": 4.3968,
440
+ "step": 2950
441
+ },
442
+ {
443
+ "epoch": 0.32345013477088946,
444
+ "grad_norm": 0.7531670331954956,
445
+ "learning_rate": 0.0005812455477603885,
446
+ "loss": 4.3901,
447
+ "step": 3000
448
+ },
449
+ {
450
+ "epoch": 0.32345013477088946,
451
+ "eval_accuracy": 0.29159338482103947,
452
+ "eval_loss": 4.296316146850586,
453
+ "eval_runtime": 144.5889,
454
+ "eval_samples_per_second": 124.567,
455
+ "eval_steps_per_second": 7.788,
456
+ "step": 3000
457
+ },
458
+ {
459
+ "epoch": 0.3288409703504043,
460
+ "grad_norm": 0.8791877627372742,
461
+ "learning_rate": 0.0005809217485159201,
462
+ "loss": 4.3683,
463
+ "step": 3050
464
+ },
465
+ {
466
+ "epoch": 0.33423180592991913,
467
+ "grad_norm": 0.843708872795105,
468
+ "learning_rate": 0.0005805979492714517,
469
+ "loss": 4.3643,
470
+ "step": 3100
471
+ },
472
+ {
473
+ "epoch": 0.33962264150943394,
474
+ "grad_norm": 0.8123868703842163,
475
+ "learning_rate": 0.0005802741500269832,
476
+ "loss": 4.3262,
477
+ "step": 3150
478
+ },
479
+ {
480
+ "epoch": 0.3450134770889488,
481
+ "grad_norm": 0.9056026339530945,
482
+ "learning_rate": 0.0005799503507825148,
483
+ "loss": 4.3298,
484
+ "step": 3200
485
+ },
486
+ {
487
+ "epoch": 0.3504043126684636,
488
+ "grad_norm": 0.6567044854164124,
489
+ "learning_rate": 0.0005796265515380463,
490
+ "loss": 4.306,
491
+ "step": 3250
492
+ },
493
+ {
494
+ "epoch": 0.3557951482479784,
495
+ "grad_norm": 0.9095497131347656,
496
+ "learning_rate": 0.000579302752293578,
497
+ "loss": 4.328,
498
+ "step": 3300
499
+ },
500
+ {
501
+ "epoch": 0.3611859838274933,
502
+ "grad_norm": 0.8091392517089844,
503
+ "learning_rate": 0.0005789789530491095,
504
+ "loss": 4.3044,
505
+ "step": 3350
506
+ },
507
+ {
508
+ "epoch": 0.3665768194070081,
509
+ "grad_norm": 0.9594192504882812,
510
+ "learning_rate": 0.0005786551538046411,
511
+ "loss": 4.2963,
512
+ "step": 3400
513
+ },
514
+ {
515
+ "epoch": 0.3719676549865229,
516
+ "grad_norm": 0.72095787525177,
517
+ "learning_rate": 0.0005783313545601726,
518
+ "loss": 4.2802,
519
+ "step": 3450
520
+ },
521
+ {
522
+ "epoch": 0.37735849056603776,
523
+ "grad_norm": 0.7558978199958801,
524
+ "learning_rate": 0.0005780075553157043,
525
+ "loss": 4.2589,
526
+ "step": 3500
527
+ },
528
+ {
529
+ "epoch": 0.38274932614555257,
530
+ "grad_norm": 0.7811341881752014,
531
+ "learning_rate": 0.0005776837560712357,
532
+ "loss": 4.264,
533
+ "step": 3550
534
+ },
535
+ {
536
+ "epoch": 0.3881401617250674,
537
+ "grad_norm": 0.9022195935249329,
538
+ "learning_rate": 0.0005773599568267673,
539
+ "loss": 4.2651,
540
+ "step": 3600
541
+ },
542
+ {
543
+ "epoch": 0.3935309973045822,
544
+ "grad_norm": 0.9639933109283447,
545
+ "learning_rate": 0.0005770361575822989,
546
+ "loss": 4.2789,
547
+ "step": 3650
548
+ },
549
+ {
550
+ "epoch": 0.39892183288409705,
551
+ "grad_norm": 0.9333063960075378,
552
+ "learning_rate": 0.0005767123583378305,
553
+ "loss": 4.24,
554
+ "step": 3700
555
+ },
556
+ {
557
+ "epoch": 0.40431266846361186,
558
+ "grad_norm": 0.7508504986763,
559
+ "learning_rate": 0.000576388559093362,
560
+ "loss": 4.2359,
561
+ "step": 3750
562
+ },
563
+ {
564
+ "epoch": 0.40970350404312667,
565
+ "grad_norm": 0.7458257079124451,
566
+ "learning_rate": 0.0005760647598488936,
567
+ "loss": 4.2555,
568
+ "step": 3800
569
+ },
570
+ {
571
+ "epoch": 0.41509433962264153,
572
+ "grad_norm": 0.7919742465019226,
573
+ "learning_rate": 0.0005757409606044253,
574
+ "loss": 4.2366,
575
+ "step": 3850
576
+ },
577
+ {
578
+ "epoch": 0.42048517520215634,
579
+ "grad_norm": 0.9453123211860657,
580
+ "learning_rate": 0.0005754171613599568,
581
+ "loss": 4.2055,
582
+ "step": 3900
583
+ },
584
+ {
585
+ "epoch": 0.42587601078167114,
586
+ "grad_norm": 0.6152997612953186,
587
+ "learning_rate": 0.0005750933621154884,
588
+ "loss": 4.201,
589
+ "step": 3950
590
+ },
591
+ {
592
+ "epoch": 0.431266846361186,
593
+ "grad_norm": 0.8247600197792053,
594
+ "learning_rate": 0.0005747695628710199,
595
+ "loss": 4.2075,
596
+ "step": 4000
597
+ },
598
+ {
599
+ "epoch": 0.431266846361186,
600
+ "eval_accuracy": 0.30819272110400897,
601
+ "eval_loss": 4.134169101715088,
602
+ "eval_runtime": 144.6112,
603
+ "eval_samples_per_second": 124.548,
604
+ "eval_steps_per_second": 7.786,
605
+ "step": 4000
606
+ },
607
+ {
608
+ "epoch": 0.4366576819407008,
609
+ "grad_norm": 0.8243815898895264,
610
+ "learning_rate": 0.0005744457636265515,
611
+ "loss": 4.2068,
612
+ "step": 4050
613
+ },
614
+ {
615
+ "epoch": 0.4420485175202156,
616
+ "grad_norm": 0.6849672794342041,
617
+ "learning_rate": 0.0005741219643820831,
618
+ "loss": 4.2091,
619
+ "step": 4100
620
+ },
621
+ {
622
+ "epoch": 0.4474393530997305,
623
+ "grad_norm": 0.7505493760108948,
624
+ "learning_rate": 0.0005737981651376146,
625
+ "loss": 4.193,
626
+ "step": 4150
627
+ },
628
+ {
629
+ "epoch": 0.4528301886792453,
630
+ "grad_norm": 0.6466169953346252,
631
+ "learning_rate": 0.0005734743658931462,
632
+ "loss": 4.1829,
633
+ "step": 4200
634
+ },
635
+ {
636
+ "epoch": 0.4582210242587601,
637
+ "grad_norm": 0.5937972664833069,
638
+ "learning_rate": 0.0005731505666486778,
639
+ "loss": 4.1738,
640
+ "step": 4250
641
+ },
642
+ {
643
+ "epoch": 0.4636118598382749,
644
+ "grad_norm": 0.7906216979026794,
645
+ "learning_rate": 0.0005728267674042093,
646
+ "loss": 4.1668,
647
+ "step": 4300
648
+ },
649
+ {
650
+ "epoch": 0.46900269541778977,
651
+ "grad_norm": 0.744433581829071,
652
+ "learning_rate": 0.0005725029681597409,
653
+ "loss": 4.1723,
654
+ "step": 4350
655
+ },
656
+ {
657
+ "epoch": 0.4743935309973046,
658
+ "grad_norm": 0.8073228597640991,
659
+ "learning_rate": 0.0005721791689152725,
660
+ "loss": 4.158,
661
+ "step": 4400
662
+ },
663
+ {
664
+ "epoch": 0.4797843665768194,
665
+ "grad_norm": 0.8262885808944702,
666
+ "learning_rate": 0.0005718553696708041,
667
+ "loss": 4.1602,
668
+ "step": 4450
669
+ },
670
+ {
671
+ "epoch": 0.48517520215633425,
672
+ "grad_norm": 0.6594825387001038,
673
+ "learning_rate": 0.0005715315704263356,
674
+ "loss": 4.1434,
675
+ "step": 4500
676
+ },
677
+ {
678
+ "epoch": 0.49056603773584906,
679
+ "grad_norm": 0.6674824357032776,
680
+ "learning_rate": 0.0005712077711818672,
681
+ "loss": 4.1315,
682
+ "step": 4550
683
+ },
684
+ {
685
+ "epoch": 0.49595687331536387,
686
+ "grad_norm": 0.7260637879371643,
687
+ "learning_rate": 0.0005708839719373987,
688
+ "loss": 4.1407,
689
+ "step": 4600
690
+ },
691
+ {
692
+ "epoch": 0.5013477088948787,
693
+ "grad_norm": 0.6827527284622192,
694
+ "learning_rate": 0.0005705601726929304,
695
+ "loss": 4.1196,
696
+ "step": 4650
697
+ },
698
+ {
699
+ "epoch": 0.5067385444743935,
700
+ "grad_norm": 0.774723470211029,
701
+ "learning_rate": 0.0005702363734484619,
702
+ "loss": 4.1256,
703
+ "step": 4700
704
+ },
705
+ {
706
+ "epoch": 0.5121293800539084,
707
+ "grad_norm": 0.6491437554359436,
708
+ "learning_rate": 0.0005699125742039935,
709
+ "loss": 4.1063,
710
+ "step": 4750
711
+ },
712
+ {
713
+ "epoch": 0.5175202156334232,
714
+ "grad_norm": 0.7277990579605103,
715
+ "learning_rate": 0.000569588774959525,
716
+ "loss": 4.1197,
717
+ "step": 4800
718
+ },
719
+ {
720
+ "epoch": 0.522911051212938,
721
+ "grad_norm": 0.6551647782325745,
722
+ "learning_rate": 0.0005692649757150567,
723
+ "loss": 4.1135,
724
+ "step": 4850
725
+ },
726
+ {
727
+ "epoch": 0.5283018867924528,
728
+ "grad_norm": 0.7088435292243958,
729
+ "learning_rate": 0.0005689411764705881,
730
+ "loss": 4.1188,
731
+ "step": 4900
732
+ },
733
+ {
734
+ "epoch": 0.5336927223719676,
735
+ "grad_norm": 0.7585951089859009,
736
+ "learning_rate": 0.0005686173772261197,
737
+ "loss": 4.0928,
738
+ "step": 4950
739
+ },
740
+ {
741
+ "epoch": 0.5390835579514824,
742
+ "grad_norm": 0.673933744430542,
743
+ "learning_rate": 0.0005682935779816514,
744
+ "loss": 4.0819,
745
+ "step": 5000
746
+ },
747
+ {
748
+ "epoch": 0.5390835579514824,
749
+ "eval_accuracy": 0.31859362744293795,
750
+ "eval_loss": 4.022838115692139,
751
+ "eval_runtime": 144.8217,
752
+ "eval_samples_per_second": 124.367,
753
+ "eval_steps_per_second": 7.775,
754
+ "step": 5000
755
+ },
756
+ {
757
+ "epoch": 0.5444743935309974,
758
+ "grad_norm": 0.617730438709259,
759
+ "learning_rate": 0.0005679697787371829,
760
+ "loss": 4.0793,
761
+ "step": 5050
762
+ },
763
+ {
764
+ "epoch": 0.5498652291105122,
765
+ "grad_norm": 0.6957946419715881,
766
+ "learning_rate": 0.0005676459794927145,
767
+ "loss": 4.0821,
768
+ "step": 5100
769
+ },
770
+ {
771
+ "epoch": 0.555256064690027,
772
+ "grad_norm": 0.6225258708000183,
773
+ "learning_rate": 0.000567322180248246,
774
+ "loss": 4.0873,
775
+ "step": 5150
776
+ },
777
+ {
778
+ "epoch": 0.5606469002695418,
779
+ "grad_norm": 0.6864507794380188,
780
+ "learning_rate": 0.0005669983810037777,
781
+ "loss": 4.0634,
782
+ "step": 5200
783
+ },
784
+ {
785
+ "epoch": 0.5660377358490566,
786
+ "grad_norm": 0.642590343952179,
787
+ "learning_rate": 0.0005666745817593092,
788
+ "loss": 4.0649,
789
+ "step": 5250
790
+ },
791
+ {
792
+ "epoch": 0.5714285714285714,
793
+ "grad_norm": 0.614847719669342,
794
+ "learning_rate": 0.0005663507825148408,
795
+ "loss": 4.0476,
796
+ "step": 5300
797
+ },
798
+ {
799
+ "epoch": 0.5768194070080862,
800
+ "grad_norm": 0.7870457768440247,
801
+ "learning_rate": 0.0005660269832703723,
802
+ "loss": 4.063,
803
+ "step": 5350
804
+ },
805
+ {
806
+ "epoch": 0.5822102425876011,
807
+ "grad_norm": 0.6789854764938354,
808
+ "learning_rate": 0.0005657031840259039,
809
+ "loss": 4.0518,
810
+ "step": 5400
811
+ },
812
+ {
813
+ "epoch": 0.5876010781671159,
814
+ "grad_norm": 0.6867729425430298,
815
+ "learning_rate": 0.0005653793847814355,
816
+ "loss": 4.0614,
817
+ "step": 5450
818
+ },
819
+ {
820
+ "epoch": 0.5929919137466307,
821
+ "grad_norm": 0.5899894833564758,
822
+ "learning_rate": 0.000565055585536967,
823
+ "loss": 4.0639,
824
+ "step": 5500
825
+ },
826
+ {
827
+ "epoch": 0.5983827493261455,
828
+ "grad_norm": 0.6574368476867676,
829
+ "learning_rate": 0.0005647317862924986,
830
+ "loss": 4.067,
831
+ "step": 5550
832
+ },
833
+ {
834
+ "epoch": 0.6037735849056604,
835
+ "grad_norm": 0.7431745529174805,
836
+ "learning_rate": 0.0005644079870480302,
837
+ "loss": 4.0425,
838
+ "step": 5600
839
+ },
840
+ {
841
+ "epoch": 0.6091644204851752,
842
+ "grad_norm": 0.6241595149040222,
843
+ "learning_rate": 0.0005640841878035617,
844
+ "loss": 4.0319,
845
+ "step": 5650
846
+ },
847
+ {
848
+ "epoch": 0.6145552560646901,
849
+ "grad_norm": 0.6736788749694824,
850
+ "learning_rate": 0.0005637603885590933,
851
+ "loss": 4.0366,
852
+ "step": 5700
853
+ },
854
+ {
855
+ "epoch": 0.6199460916442049,
856
+ "grad_norm": 0.6149032711982727,
857
+ "learning_rate": 0.0005634365893146248,
858
+ "loss": 4.0495,
859
+ "step": 5750
860
+ },
861
+ {
862
+ "epoch": 0.6253369272237197,
863
+ "grad_norm": 0.6543477177619934,
864
+ "learning_rate": 0.0005631127900701565,
865
+ "loss": 4.042,
866
+ "step": 5800
867
+ },
868
+ {
869
+ "epoch": 0.6307277628032345,
870
+ "grad_norm": 0.6215724945068359,
871
+ "learning_rate": 0.000562788990825688,
872
+ "loss": 4.0478,
873
+ "step": 5850
874
+ },
875
+ {
876
+ "epoch": 0.6361185983827493,
877
+ "grad_norm": 0.6606348752975464,
878
+ "learning_rate": 0.0005624651915812196,
879
+ "loss": 4.0192,
880
+ "step": 5900
881
+ },
882
+ {
883
+ "epoch": 0.6415094339622641,
884
+ "grad_norm": 0.7944669723510742,
885
+ "learning_rate": 0.0005621413923367511,
886
+ "loss": 4.012,
887
+ "step": 5950
888
+ },
889
+ {
890
+ "epoch": 0.6469002695417789,
891
+ "grad_norm": 0.6075884699821472,
892
+ "learning_rate": 0.0005618175930922828,
893
+ "loss": 4.0201,
894
+ "step": 6000
895
+ },
896
+ {
897
+ "epoch": 0.6469002695417789,
898
+ "eval_accuracy": 0.3257018080166491,
899
+ "eval_loss": 3.940925359725952,
900
+ "eval_runtime": 144.8645,
901
+ "eval_samples_per_second": 124.33,
902
+ "eval_steps_per_second": 7.773,
903
+ "step": 6000
904
+ },
905
+ {
906
+ "epoch": 0.6522911051212938,
907
+ "grad_norm": 0.7395045757293701,
908
+ "learning_rate": 0.0005614937938478143,
909
+ "loss": 3.9927,
910
+ "step": 6050
911
+ },
912
+ {
913
+ "epoch": 0.6576819407008087,
914
+ "grad_norm": 0.69764643907547,
915
+ "learning_rate": 0.0005611699946033459,
916
+ "loss": 4.0252,
917
+ "step": 6100
918
+ },
919
+ {
920
+ "epoch": 0.6630727762803235,
921
+ "grad_norm": 0.5907176733016968,
922
+ "learning_rate": 0.0005608461953588774,
923
+ "loss": 4.0071,
924
+ "step": 6150
925
+ },
926
+ {
927
+ "epoch": 0.6684636118598383,
928
+ "grad_norm": 0.7374498248100281,
929
+ "learning_rate": 0.000560522396114409,
930
+ "loss": 3.9854,
931
+ "step": 6200
932
+ },
933
+ {
934
+ "epoch": 0.6738544474393531,
935
+ "grad_norm": 0.5937222838401794,
936
+ "learning_rate": 0.0005601985968699405,
937
+ "loss": 4.003,
938
+ "step": 6250
939
+ },
940
+ {
941
+ "epoch": 0.6792452830188679,
942
+ "grad_norm": 0.6928643584251404,
943
+ "learning_rate": 0.0005598747976254721,
944
+ "loss": 4.022,
945
+ "step": 6300
946
+ },
947
+ {
948
+ "epoch": 0.6846361185983828,
949
+ "grad_norm": 0.6832301020622253,
950
+ "learning_rate": 0.0005595509983810038,
951
+ "loss": 3.9816,
952
+ "step": 6350
953
+ },
954
+ {
955
+ "epoch": 0.6900269541778976,
956
+ "grad_norm": 0.6227492690086365,
957
+ "learning_rate": 0.0005592271991365353,
958
+ "loss": 3.9855,
959
+ "step": 6400
960
+ },
961
+ {
962
+ "epoch": 0.6954177897574124,
963
+ "grad_norm": 0.6595360636711121,
964
+ "learning_rate": 0.0005589033998920669,
965
+ "loss": 3.9733,
966
+ "step": 6450
967
+ },
968
+ {
969
+ "epoch": 0.7008086253369272,
970
+ "grad_norm": 0.6538481116294861,
971
+ "learning_rate": 0.0005585796006475984,
972
+ "loss": 3.9867,
973
+ "step": 6500
974
+ },
975
+ {
976
+ "epoch": 0.706199460916442,
977
+ "grad_norm": 0.6099511384963989,
978
+ "learning_rate": 0.0005582558014031301,
979
+ "loss": 4.009,
980
+ "step": 6550
981
+ },
982
+ {
983
+ "epoch": 0.7115902964959568,
984
+ "grad_norm": 0.5673043131828308,
985
+ "learning_rate": 0.0005579320021586616,
986
+ "loss": 3.9638,
987
+ "step": 6600
988
+ },
989
+ {
990
+ "epoch": 0.7169811320754716,
991
+ "grad_norm": 0.7762152552604675,
992
+ "learning_rate": 0.0005576082029141932,
993
+ "loss": 3.9942,
994
+ "step": 6650
995
+ },
996
+ {
997
+ "epoch": 0.7223719676549866,
998
+ "grad_norm": 0.6117172241210938,
999
+ "learning_rate": 0.0005572844036697247,
1000
+ "loss": 3.9672,
1001
+ "step": 6700
1002
+ },
1003
+ {
1004
+ "epoch": 0.7277628032345014,
1005
+ "grad_norm": 0.6088191866874695,
1006
+ "learning_rate": 0.0005569606044252563,
1007
+ "loss": 3.9738,
1008
+ "step": 6750
1009
+ },
1010
+ {
1011
+ "epoch": 0.7331536388140162,
1012
+ "grad_norm": 0.6414440274238586,
1013
+ "learning_rate": 0.0005566368051807879,
1014
+ "loss": 3.9641,
1015
+ "step": 6800
1016
+ },
1017
+ {
1018
+ "epoch": 0.738544474393531,
1019
+ "grad_norm": 0.8121737241744995,
1020
+ "learning_rate": 0.0005563130059363194,
1021
+ "loss": 3.9423,
1022
+ "step": 6850
1023
+ },
1024
+ {
1025
+ "epoch": 0.7439353099730458,
1026
+ "grad_norm": 0.7500906586647034,
1027
+ "learning_rate": 0.000555989206691851,
1028
+ "loss": 3.9677,
1029
+ "step": 6900
1030
+ },
1031
+ {
1032
+ "epoch": 0.7493261455525606,
1033
+ "grad_norm": 0.6089574694633484,
1034
+ "learning_rate": 0.0005556654074473826,
1035
+ "loss": 3.9647,
1036
+ "step": 6950
1037
+ },
1038
+ {
1039
+ "epoch": 0.7547169811320755,
1040
+ "grad_norm": 0.7751880288124084,
1041
+ "learning_rate": 0.0005553416082029141,
1042
+ "loss": 3.9519,
1043
+ "step": 7000
1044
+ },
1045
+ {
1046
+ "epoch": 0.7547169811320755,
1047
+ "eval_accuracy": 0.3312385418738994,
1048
+ "eval_loss": 3.88531494140625,
1049
+ "eval_runtime": 144.5963,
1050
+ "eval_samples_per_second": 124.561,
1051
+ "eval_steps_per_second": 7.787,
1052
+ "step": 7000
1053
+ },
1054
+ {
1055
+ "epoch": 0.7601078167115903,
1056
+ "grad_norm": 0.6487019658088684,
1057
+ "learning_rate": 0.0005550178089584457,
1058
+ "loss": 3.9428,
1059
+ "step": 7050
1060
+ },
1061
+ {
1062
+ "epoch": 0.7654986522911051,
1063
+ "grad_norm": 0.6093623638153076,
1064
+ "learning_rate": 0.0005546940097139772,
1065
+ "loss": 3.9544,
1066
+ "step": 7100
1067
+ },
1068
+ {
1069
+ "epoch": 0.77088948787062,
1070
+ "grad_norm": 0.5531768798828125,
1071
+ "learning_rate": 0.0005543702104695089,
1072
+ "loss": 3.9569,
1073
+ "step": 7150
1074
+ },
1075
+ {
1076
+ "epoch": 0.7762803234501348,
1077
+ "grad_norm": 0.6401906609535217,
1078
+ "learning_rate": 0.0005540464112250404,
1079
+ "loss": 3.9564,
1080
+ "step": 7200
1081
+ },
1082
+ {
1083
+ "epoch": 0.7816711590296496,
1084
+ "grad_norm": 0.5921440720558167,
1085
+ "learning_rate": 0.000553722611980572,
1086
+ "loss": 3.9108,
1087
+ "step": 7250
1088
+ },
1089
+ {
1090
+ "epoch": 0.7870619946091644,
1091
+ "grad_norm": 0.6791409254074097,
1092
+ "learning_rate": 0.0005533988127361035,
1093
+ "loss": 3.9488,
1094
+ "step": 7300
1095
+ },
1096
+ {
1097
+ "epoch": 0.7924528301886793,
1098
+ "grad_norm": 0.6472693681716919,
1099
+ "learning_rate": 0.0005530750134916352,
1100
+ "loss": 3.9341,
1101
+ "step": 7350
1102
+ },
1103
+ {
1104
+ "epoch": 0.7978436657681941,
1105
+ "grad_norm": 0.6375269889831543,
1106
+ "learning_rate": 0.0005527512142471668,
1107
+ "loss": 3.9339,
1108
+ "step": 7400
1109
+ },
1110
+ {
1111
+ "epoch": 0.8032345013477089,
1112
+ "grad_norm": 0.626977264881134,
1113
+ "learning_rate": 0.0005524274150026982,
1114
+ "loss": 3.9247,
1115
+ "step": 7450
1116
+ },
1117
+ {
1118
+ "epoch": 0.8086253369272237,
1119
+ "grad_norm": 0.696706235408783,
1120
+ "learning_rate": 0.0005521036157582299,
1121
+ "loss": 3.9116,
1122
+ "step": 7500
1123
+ },
1124
+ {
1125
+ "epoch": 0.8140161725067385,
1126
+ "grad_norm": 0.594398558139801,
1127
+ "learning_rate": 0.0005517798165137614,
1128
+ "loss": 3.9356,
1129
+ "step": 7550
1130
+ },
1131
+ {
1132
+ "epoch": 0.8194070080862533,
1133
+ "grad_norm": 0.6184767484664917,
1134
+ "learning_rate": 0.000551456017269293,
1135
+ "loss": 3.9384,
1136
+ "step": 7600
1137
+ },
1138
+ {
1139
+ "epoch": 0.8247978436657682,
1140
+ "grad_norm": 0.5797574520111084,
1141
+ "learning_rate": 0.0005511322180248245,
1142
+ "loss": 3.9149,
1143
+ "step": 7650
1144
+ },
1145
+ {
1146
+ "epoch": 0.8301886792452831,
1147
+ "grad_norm": 0.5616925954818726,
1148
+ "learning_rate": 0.0005508084187803562,
1149
+ "loss": 3.9173,
1150
+ "step": 7700
1151
+ },
1152
+ {
1153
+ "epoch": 0.8355795148247979,
1154
+ "grad_norm": 0.6098619103431702,
1155
+ "learning_rate": 0.0005504846195358877,
1156
+ "loss": 3.8999,
1157
+ "step": 7750
1158
+ },
1159
+ {
1160
+ "epoch": 0.8409703504043127,
1161
+ "grad_norm": 0.6354513764381409,
1162
+ "learning_rate": 0.0005501608202914193,
1163
+ "loss": 3.9284,
1164
+ "step": 7800
1165
+ },
1166
+ {
1167
+ "epoch": 0.8463611859838275,
1168
+ "grad_norm": 0.6673269867897034,
1169
+ "learning_rate": 0.0005498370210469508,
1170
+ "loss": 3.909,
1171
+ "step": 7850
1172
+ },
1173
+ {
1174
+ "epoch": 0.8517520215633423,
1175
+ "grad_norm": 0.539669930934906,
1176
+ "learning_rate": 0.0005495132218024824,
1177
+ "loss": 3.9074,
1178
+ "step": 7900
1179
+ },
1180
+ {
1181
+ "epoch": 0.8571428571428571,
1182
+ "grad_norm": 0.618360161781311,
1183
+ "learning_rate": 0.000549189422558014,
1184
+ "loss": 3.9046,
1185
+ "step": 7950
1186
+ },
1187
+ {
1188
+ "epoch": 0.862533692722372,
1189
+ "grad_norm": 0.6503387689590454,
1190
+ "learning_rate": 0.0005488656233135456,
1191
+ "loss": 3.8917,
1192
+ "step": 8000
1193
+ },
1194
+ {
1195
+ "epoch": 0.862533692722372,
1196
+ "eval_accuracy": 0.33594255999893957,
1197
+ "eval_loss": 3.8360989093780518,
1198
+ "eval_runtime": 144.8786,
1199
+ "eval_samples_per_second": 124.318,
1200
+ "eval_steps_per_second": 7.772,
1201
+ "step": 8000
1202
+ },
1203
+ {
1204
+ "epoch": 0.8679245283018868,
1205
+ "grad_norm": 0.6546068787574768,
1206
+ "learning_rate": 0.0005485418240690771,
1207
+ "loss": 3.9022,
1208
+ "step": 8050
1209
+ },
1210
+ {
1211
+ "epoch": 0.8733153638814016,
1212
+ "grad_norm": 0.6453102231025696,
1213
+ "learning_rate": 0.0005482180248246087,
1214
+ "loss": 3.8843,
1215
+ "step": 8100
1216
+ },
1217
+ {
1218
+ "epoch": 0.8787061994609164,
1219
+ "grad_norm": 0.5534443259239197,
1220
+ "learning_rate": 0.0005478942255801403,
1221
+ "loss": 3.8988,
1222
+ "step": 8150
1223
+ },
1224
+ {
1225
+ "epoch": 0.8840970350404312,
1226
+ "grad_norm": 0.6230263113975525,
1227
+ "learning_rate": 0.0005475704263356718,
1228
+ "loss": 3.8879,
1229
+ "step": 8200
1230
+ },
1231
+ {
1232
+ "epoch": 0.889487870619946,
1233
+ "grad_norm": 0.6733997464179993,
1234
+ "learning_rate": 0.0005472466270912034,
1235
+ "loss": 3.8729,
1236
+ "step": 8250
1237
+ },
1238
+ {
1239
+ "epoch": 0.894878706199461,
1240
+ "grad_norm": 0.6633635759353638,
1241
+ "learning_rate": 0.000546922827846735,
1242
+ "loss": 3.8991,
1243
+ "step": 8300
1244
+ },
1245
+ {
1246
+ "epoch": 0.9002695417789758,
1247
+ "grad_norm": 0.6503227949142456,
1248
+ "learning_rate": 0.0005465990286022665,
1249
+ "loss": 3.8792,
1250
+ "step": 8350
1251
+ },
1252
+ {
1253
+ "epoch": 0.9056603773584906,
1254
+ "grad_norm": 0.7666671276092529,
1255
+ "learning_rate": 0.0005462752293577981,
1256
+ "loss": 3.8944,
1257
+ "step": 8400
1258
+ },
1259
+ {
1260
+ "epoch": 0.9110512129380054,
1261
+ "grad_norm": 0.6036889553070068,
1262
+ "learning_rate": 0.0005459514301133296,
1263
+ "loss": 3.9057,
1264
+ "step": 8450
1265
+ },
1266
+ {
1267
+ "epoch": 0.9164420485175202,
1268
+ "grad_norm": 0.6154916882514954,
1269
+ "learning_rate": 0.0005456276308688613,
1270
+ "loss": 3.894,
1271
+ "step": 8500
1272
+ },
1273
+ {
1274
+ "epoch": 0.921832884097035,
1275
+ "grad_norm": 0.5653623938560486,
1276
+ "learning_rate": 0.0005453038316243929,
1277
+ "loss": 3.8763,
1278
+ "step": 8550
1279
+ },
1280
+ {
1281
+ "epoch": 0.9272237196765498,
1282
+ "grad_norm": 0.5695276856422424,
1283
+ "learning_rate": 0.0005449800323799244,
1284
+ "loss": 3.8709,
1285
+ "step": 8600
1286
+ },
1287
+ {
1288
+ "epoch": 0.9326145552560647,
1289
+ "grad_norm": 0.5406414270401001,
1290
+ "learning_rate": 0.000544656233135456,
1291
+ "loss": 3.8623,
1292
+ "step": 8650
1293
+ },
1294
+ {
1295
+ "epoch": 0.9380053908355795,
1296
+ "grad_norm": 0.5876409411430359,
1297
+ "learning_rate": 0.0005443324338909875,
1298
+ "loss": 3.867,
1299
+ "step": 8700
1300
+ },
1301
+ {
1302
+ "epoch": 0.9433962264150944,
1303
+ "grad_norm": 0.5680667757987976,
1304
+ "learning_rate": 0.0005440086346465192,
1305
+ "loss": 3.8691,
1306
+ "step": 8750
1307
+ },
1308
+ {
1309
+ "epoch": 0.9487870619946092,
1310
+ "grad_norm": 0.5708035230636597,
1311
+ "learning_rate": 0.0005436848354020506,
1312
+ "loss": 3.8589,
1313
+ "step": 8800
1314
+ },
1315
+ {
1316
+ "epoch": 0.954177897574124,
1317
+ "grad_norm": 0.5115900635719299,
1318
+ "learning_rate": 0.0005433610361575823,
1319
+ "loss": 3.8488,
1320
+ "step": 8850
1321
+ },
1322
+ {
1323
+ "epoch": 0.9595687331536388,
1324
+ "grad_norm": 0.6006278395652771,
1325
+ "learning_rate": 0.0005430372369131138,
1326
+ "loss": 3.8652,
1327
+ "step": 8900
1328
+ },
1329
+ {
1330
+ "epoch": 0.9649595687331537,
1331
+ "grad_norm": 0.6494414210319519,
1332
+ "learning_rate": 0.0005427134376686454,
1333
+ "loss": 3.8716,
1334
+ "step": 8950
1335
+ },
1336
+ {
1337
+ "epoch": 0.9703504043126685,
1338
+ "grad_norm": 0.5888278484344482,
1339
+ "learning_rate": 0.0005423896384241769,
1340
+ "loss": 3.8592,
1341
+ "step": 9000
1342
+ },
1343
+ {
1344
+ "epoch": 0.9703504043126685,
1345
+ "eval_accuracy": 0.34041004078720755,
1346
+ "eval_loss": 3.7949342727661133,
1347
+ "eval_runtime": 144.616,
1348
+ "eval_samples_per_second": 124.544,
1349
+ "eval_steps_per_second": 7.786,
1350
+ "step": 9000
1351
+ },
1352
+ {
1353
+ "epoch": 0.9757412398921833,
1354
+ "grad_norm": 0.5697623491287231,
1355
+ "learning_rate": 0.0005420658391797086,
1356
+ "loss": 3.855,
1357
+ "step": 9050
1358
+ },
1359
+ {
1360
+ "epoch": 0.9811320754716981,
1361
+ "grad_norm": 0.6447634696960449,
1362
+ "learning_rate": 0.0005417420399352401,
1363
+ "loss": 3.866,
1364
+ "step": 9100
1365
+ },
1366
+ {
1367
+ "epoch": 0.9865229110512129,
1368
+ "grad_norm": 0.583088219165802,
1369
+ "learning_rate": 0.0005414182406907717,
1370
+ "loss": 3.8415,
1371
+ "step": 9150
1372
+ },
1373
+ {
1374
+ "epoch": 0.9919137466307277,
1375
+ "grad_norm": 0.5796465277671814,
1376
+ "learning_rate": 0.0005410944414463032,
1377
+ "loss": 3.8376,
1378
+ "step": 9200
1379
+ },
1380
+ {
1381
+ "epoch": 0.9973045822102425,
1382
+ "grad_norm": 0.6316399574279785,
1383
+ "learning_rate": 0.0005407706422018348,
1384
+ "loss": 3.8524,
1385
+ "step": 9250
1386
+ },
1387
+ {
1388
+ "epoch": 1.0026954177897573,
1389
+ "grad_norm": 0.5727218389511108,
1390
+ "learning_rate": 0.0005404468429573664,
1391
+ "loss": 3.8268,
1392
+ "step": 9300
1393
+ },
1394
+ {
1395
+ "epoch": 1.0080862533692723,
1396
+ "grad_norm": 0.5713456869125366,
1397
+ "learning_rate": 0.000540123043712898,
1398
+ "loss": 3.7921,
1399
+ "step": 9350
1400
+ },
1401
+ {
1402
+ "epoch": 1.013477088948787,
1403
+ "grad_norm": 0.6198617815971375,
1404
+ "learning_rate": 0.0005397992444684295,
1405
+ "loss": 3.7876,
1406
+ "step": 9400
1407
+ },
1408
+ {
1409
+ "epoch": 1.0188679245283019,
1410
+ "grad_norm": 0.5770072937011719,
1411
+ "learning_rate": 0.0005394754452239611,
1412
+ "loss": 3.7856,
1413
+ "step": 9450
1414
+ },
1415
+ {
1416
+ "epoch": 1.0242587601078168,
1417
+ "grad_norm": 0.5920796990394592,
1418
+ "learning_rate": 0.0005391516459794927,
1419
+ "loss": 3.7855,
1420
+ "step": 9500
1421
+ },
1422
+ {
1423
+ "epoch": 1.0296495956873315,
1424
+ "grad_norm": 0.5636431574821472,
1425
+ "learning_rate": 0.0005388278467350242,
1426
+ "loss": 3.7857,
1427
+ "step": 9550
1428
+ },
1429
+ {
1430
+ "epoch": 1.0350404312668464,
1431
+ "grad_norm": 0.669791579246521,
1432
+ "learning_rate": 0.0005385040474905557,
1433
+ "loss": 3.7723,
1434
+ "step": 9600
1435
+ },
1436
+ {
1437
+ "epoch": 1.0404312668463611,
1438
+ "grad_norm": 0.6635991930961609,
1439
+ "learning_rate": 0.0005381802482460874,
1440
+ "loss": 3.8142,
1441
+ "step": 9650
1442
+ },
1443
+ {
1444
+ "epoch": 1.045822102425876,
1445
+ "grad_norm": 0.601437509059906,
1446
+ "learning_rate": 0.000537856449001619,
1447
+ "loss": 3.798,
1448
+ "step": 9700
1449
+ },
1450
+ {
1451
+ "epoch": 1.0512129380053907,
1452
+ "grad_norm": 0.5584002137184143,
1453
+ "learning_rate": 0.0005375326497571505,
1454
+ "loss": 3.7985,
1455
+ "step": 9750
1456
+ },
1457
+ {
1458
+ "epoch": 1.0566037735849056,
1459
+ "grad_norm": 0.5526735782623291,
1460
+ "learning_rate": 0.000537208850512682,
1461
+ "loss": 3.8074,
1462
+ "step": 9800
1463
+ },
1464
+ {
1465
+ "epoch": 1.0619946091644206,
1466
+ "grad_norm": 0.5977774858474731,
1467
+ "learning_rate": 0.0005368850512682137,
1468
+ "loss": 3.7989,
1469
+ "step": 9850
1470
+ },
1471
+ {
1472
+ "epoch": 1.0673854447439353,
1473
+ "grad_norm": 0.5950897932052612,
1474
+ "learning_rate": 0.0005365612520237453,
1475
+ "loss": 3.7924,
1476
+ "step": 9900
1477
+ },
1478
+ {
1479
+ "epoch": 1.0727762803234502,
1480
+ "grad_norm": 0.5768362283706665,
1481
+ "learning_rate": 0.0005362374527792768,
1482
+ "loss": 3.7855,
1483
+ "step": 9950
1484
+ },
1485
+ {
1486
+ "epoch": 1.0781671159029649,
1487
+ "grad_norm": 0.6516294479370117,
1488
+ "learning_rate": 0.0005359136535348084,
1489
+ "loss": 3.7811,
1490
+ "step": 10000
1491
+ },
1492
+ {
1493
+ "epoch": 1.0781671159029649,
1494
+ "eval_accuracy": 0.34365658907510427,
1495
+ "eval_loss": 3.762051582336426,
1496
+ "eval_runtime": 144.5563,
1497
+ "eval_samples_per_second": 124.595,
1498
+ "eval_steps_per_second": 7.789,
1499
+ "step": 10000
1500
+ }
1501
+ ],
1502
+ "logging_steps": 50,
1503
+ "max_steps": 92750,
1504
+ "num_input_tokens_seen": 0,
1505
+ "num_train_epochs": 10,
1506
+ "save_steps": 10000,
1507
+ "stateful_callbacks": {
1508
+ "TrainerControl": {
1509
+ "args": {
1510
+ "should_epoch_stop": false,
1511
+ "should_evaluate": false,
1512
+ "should_log": false,
1513
+ "should_save": true,
1514
+ "should_training_stop": false
1515
+ },
1516
+ "attributes": {}
1517
+ }
1518
+ },
1519
+ "total_flos": 8.36069179392e+16,
1520
+ "train_batch_size": 32,
1521
+ "trial_name": null,
1522
+ "trial_params": null
1523
+ }
checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55f163471d892c378f64db98f5c5595dd2d777d1501861716698287d58ed0c89
3
+ size 5304