ChiefTheLord commited on
Commit
d4c716f
·
verified ·
1 Parent(s): 586c6f2

Delete checkpoints

Browse files
checkpoints/checkpoint-10240/ema.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7f0949c4744e72fe73db782124fc4f290b639828c0631c12aa3d5f212ca9978
3
- size 634904
 
 
 
 
checkpoints/checkpoint-10240/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e05f24acc8e4f4d75f94ac6f49a49b505fdda3c67b618e6220ee132cd57534c
3
- size 19272752
 
 
 
 
checkpoints/checkpoint-10240/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0421234bf12d55e2308b14119e0395687cb7ffe91ba2f48abbe1d75de8ff6d62
3
- size 1337227
 
 
 
 
checkpoints/checkpoint-10240/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2c04be13edf4b232ba08eac3996218459dd3d050909829aaa71859abddd8737
3
- size 14645
 
 
 
 
checkpoints/checkpoint-10240/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:94ddcf074384d3a34ade0cdda457b71df0096ecbfea07f5653bd35b87f62e3e9
3
- size 1383
 
 
 
 
checkpoints/checkpoint-10240/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5eab6a06d702c463fa681f5c59aa4c76b4a7cb4dc7af81563f2aa2d201d5e87
3
- size 1465
 
 
 
 
checkpoints/checkpoint-10240/trainer_state.json DELETED
@@ -1,464 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.472957369174634,
6
- "eval_steps": 1024,
7
- "global_step": 10240,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.011823934229365849,
14
- "grad_norm": 0.11696221679449081,
15
- "learning_rate": 9.418282548476454e-05,
16
- "loss": 1.568,
17
- "step": 256
18
- },
19
- {
20
- "epoch": 0.023647868458731697,
21
- "grad_norm": 0.14284025132656097,
22
- "learning_rate": 0.00018873499538319484,
23
- "loss": 1.4627,
24
- "step": 512
25
- },
26
- {
27
- "epoch": 0.03547180268809755,
28
- "grad_norm": 0.14623360335826874,
29
- "learning_rate": 0.00028328716528162515,
30
- "loss": 1.3763,
31
- "step": 768
32
- },
33
- {
34
- "epoch": 0.047295736917463395,
35
- "grad_norm": 0.1999262273311615,
36
- "learning_rate": 0.0003778393351800554,
37
- "loss": 1.325,
38
- "step": 1024
39
- },
40
- {
41
- "epoch": 0.047295736917463395,
42
- "eval_loss": 1.2694775628172643,
43
- "eval_mse_loss": 1.2694775628172643,
44
- "step": 1024
45
- },
46
- {
47
- "epoch": 0.047295736917463395,
48
- "eval_loss": 1.2694775628172643,
49
- "eval_mse_loss": 1.2694775628172643,
50
- "eval_runtime": 99.7615,
51
- "eval_samples_per_second": 280.599,
52
- "eval_steps_per_second": 4.39,
53
- "step": 1024
54
- },
55
- {
56
- "epoch": 0.05911967114682925,
57
- "grad_norm": 0.17195427417755127,
58
- "learning_rate": 0.000399910381984159,
59
- "loss": 1.2775,
60
- "step": 1280
61
- },
62
- {
63
- "epoch": 0.0709436053761951,
64
- "grad_norm": 0.24726267158985138,
65
- "learning_rate": 0.0003995235470960099,
66
- "loss": 1.2335,
67
- "step": 1536
68
- },
69
- {
70
- "epoch": 0.08276753960556095,
71
- "grad_norm": 0.29991263151168823,
72
- "learning_rate": 0.000398831687956263,
73
- "loss": 1.2043,
74
- "step": 1792
75
- },
76
- {
77
- "epoch": 0.09459147383492679,
78
- "grad_norm": 0.2526465356349945,
79
- "learning_rate": 0.00039783586225369426,
80
- "loss": 1.1758,
81
- "step": 2048
82
- },
83
- {
84
- "epoch": 0.09459147383492679,
85
- "eval_loss": 1.1193939001048536,
86
- "eval_mse_loss": 1.1193939001048536,
87
- "step": 2048
88
- },
89
- {
90
- "epoch": 0.09459147383492679,
91
- "eval_loss": 1.1193939001048536,
92
- "eval_mse_loss": 1.1193939001048536,
93
- "eval_runtime": 99.592,
94
- "eval_samples_per_second": 281.077,
95
- "eval_steps_per_second": 4.398,
96
- "step": 2048
97
- },
98
- {
99
- "epoch": 0.10641540806429264,
100
- "grad_norm": 0.23426194489002228,
101
- "learning_rate": 0.0003965375923699678,
102
- "loss": 1.1514,
103
- "step": 2304
104
- },
105
- {
106
- "epoch": 0.1182393422936585,
107
- "grad_norm": 0.2490096390247345,
108
- "learning_rate": 0.0003949388630522747,
109
- "loss": 1.1287,
110
- "step": 2560
111
- },
112
- {
113
- "epoch": 0.13006327652302435,
114
- "grad_norm": 0.3001377582550049,
115
- "learning_rate": 0.0003930421183791246,
116
- "loss": 1.1064,
117
- "step": 2816
118
- },
119
- {
120
- "epoch": 0.1418872107523902,
121
- "grad_norm": 0.31271305680274963,
122
- "learning_rate": 0.00039085025802392876,
123
- "loss": 1.0907,
124
- "step": 3072
125
- },
126
- {
127
- "epoch": 0.1418872107523902,
128
- "eval_loss": 1.0367718265481192,
129
- "eval_mse_loss": 1.0367718265481192,
130
- "step": 3072
131
- },
132
- {
133
- "epoch": 0.1418872107523902,
134
- "eval_loss": 1.0367718265481192,
135
- "eval_mse_loss": 1.0367718265481192,
136
- "eval_runtime": 99.0318,
137
- "eval_samples_per_second": 282.667,
138
- "eval_steps_per_second": 4.423,
139
- "step": 3072
140
- },
141
- {
142
- "epoch": 0.15371114498175603,
143
- "grad_norm": 0.31617286801338196,
144
- "learning_rate": 0.0003883666328220855,
145
- "loss": 1.0795,
146
- "step": 3328
147
- },
148
- {
149
- "epoch": 0.1655350792111219,
150
- "grad_norm": 0.3006609082221985,
151
- "learning_rate": 0.00038559503964834713,
152
- "loss": 1.072,
153
- "step": 3584
154
- },
155
- {
156
- "epoch": 0.17735901344048774,
157
- "grad_norm": 0.3017251491546631,
158
- "learning_rate": 0.00038253971561229733,
159
- "loss": 1.0632,
160
- "step": 3840
161
- },
162
- {
163
- "epoch": 0.18918294766985358,
164
- "grad_norm": 0.29505959153175354,
165
- "learning_rate": 0.00037920533158081383,
166
- "loss": 1.0583,
167
- "step": 4096
168
- },
169
- {
170
- "epoch": 0.18918294766985358,
171
- "eval_loss": 1.0053560841301261,
172
- "eval_mse_loss": 1.0053560841301261,
173
- "step": 4096
174
- },
175
- {
176
- "epoch": 0.18918294766985358,
177
- "eval_loss": 1.0053560841301261,
178
- "eval_mse_loss": 1.0053560841301261,
179
- "eval_runtime": 99.7606,
180
- "eval_samples_per_second": 280.602,
181
- "eval_steps_per_second": 4.391,
182
- "step": 4096
183
- },
184
- {
185
- "epoch": 0.20100688189921945,
186
- "grad_norm": 0.327700138092041,
187
- "learning_rate": 0.00037559698503741954,
188
- "loss": 1.0539,
189
- "step": 4352
190
- },
191
- {
192
- "epoch": 0.2128308161285853,
193
- "grad_norm": 0.3910951316356659,
194
- "learning_rate": 0.0003717201922894372,
195
- "loss": 1.0525,
196
- "step": 4608
197
- },
198
- {
199
- "epoch": 0.22465475035795113,
200
- "grad_norm": 0.3237537443637848,
201
- "learning_rate": 0.0003675808800348619,
202
- "loss": 1.0461,
203
- "step": 4864
204
- },
205
- {
206
- "epoch": 0.236478684587317,
207
- "grad_norm": 0.4316788911819458,
208
- "learning_rate": 0.0003631853763018435,
209
- "loss": 1.042,
210
- "step": 5120
211
- },
212
- {
213
- "epoch": 0.236478684587317,
214
- "eval_loss": 0.9950277422933274,
215
- "eval_mse_loss": 0.9950277422933274,
216
- "step": 5120
217
- },
218
- {
219
- "epoch": 0.236478684587317,
220
- "eval_loss": 0.9950277422933274,
221
- "eval_mse_loss": 0.9950277422933274,
222
- "eval_runtime": 99.7527,
223
- "eval_samples_per_second": 280.624,
224
- "eval_steps_per_second": 4.391,
225
- "step": 5120
226
- },
227
- {
228
- "epoch": 0.24830261881668284,
229
- "grad_norm": 0.32822364568710327,
230
- "learning_rate": 0.0003585404007746302,
231
- "loss": 1.0385,
232
- "step": 5376
233
- },
234
- {
235
- "epoch": 0.2601265530460487,
236
- "grad_norm": 0.40738987922668457,
237
- "learning_rate": 0.0003536530545207627,
238
- "loss": 1.0378,
239
- "step": 5632
240
- },
241
- {
242
- "epoch": 0.27195048727541454,
243
- "grad_norm": 0.318866103887558,
244
- "learning_rate": 0.0003485308091352234,
245
- "loss": 1.0375,
246
- "step": 5888
247
- },
248
- {
249
- "epoch": 0.2837744215047804,
250
- "grad_norm": 0.34148862957954407,
251
- "learning_rate": 0.0003431814953181376,
252
- "loss": 1.0351,
253
- "step": 6144
254
- },
255
- {
256
- "epoch": 0.2837744215047804,
257
- "eval_loss": 0.9900110875634842,
258
- "eval_mse_loss": 0.9900110875634842,
259
- "step": 6144
260
- },
261
- {
262
- "epoch": 0.2837744215047804,
263
- "eval_loss": 0.9900110875634842,
264
- "eval_mse_loss": 0.9900110875634842,
265
- "eval_runtime": 99.6906,
266
- "eval_samples_per_second": 280.799,
267
- "eval_steps_per_second": 4.394,
268
- "step": 6144
269
- },
270
- {
271
- "epoch": 0.2955983557341462,
272
- "grad_norm": 0.2927286624908447,
273
- "learning_rate": 0.0003376132909034871,
274
- "loss": 1.0342,
275
- "step": 6400
276
- },
277
- {
278
- "epoch": 0.30742228996351206,
279
- "grad_norm": 0.3355765640735626,
280
- "learning_rate": 0.000331834708357139,
281
- "loss": 1.0334,
282
- "step": 6656
283
- },
284
- {
285
- "epoch": 0.3192462241928779,
286
- "grad_norm": 0.3341493308544159,
287
- "learning_rate": 0.00032585458176330104,
288
- "loss": 1.0315,
289
- "step": 6912
290
- },
291
- {
292
- "epoch": 0.3310701584222438,
293
- "grad_norm": 0.3863889276981354,
294
- "learning_rate": 0.0003196820533192994,
295
- "loss": 1.0316,
296
- "step": 7168
297
- },
298
- {
299
- "epoch": 0.3310701584222438,
300
- "eval_loss": 0.9869283731669596,
301
- "eval_mse_loss": 0.9869283731669596,
302
- "step": 7168
303
- },
304
- {
305
- "epoch": 0.3310701584222438,
306
- "eval_loss": 0.9869283731669596,
307
- "eval_mse_loss": 0.9869283731669596,
308
- "eval_runtime": 99.9137,
309
- "eval_samples_per_second": 280.172,
310
- "eval_steps_per_second": 4.384,
311
- "step": 7168
312
- },
313
- {
314
- "epoch": 0.34289409265160964,
315
- "grad_norm": 0.37314340472221375,
316
- "learning_rate": 0.00031332655935932356,
317
- "loss": 1.0304,
318
- "step": 7424
319
- },
320
- {
321
- "epoch": 0.3547180268809755,
322
- "grad_norm": 0.31663990020751953,
323
- "learning_rate": 0.0003067978159285067,
324
- "loss": 1.0291,
325
- "step": 7680
326
- },
327
- {
328
- "epoch": 0.3665419611103413,
329
- "grad_norm": 0.3562430143356323,
330
- "learning_rate": 0.0003001058039293929,
331
- "loss": 1.0282,
332
- "step": 7936
333
- },
334
- {
335
- "epoch": 0.37836589533970716,
336
- "grad_norm": 0.32352393865585327,
337
- "learning_rate": 0.0002932607538635016,
338
- "loss": 1.0289,
339
- "step": 8192
340
- },
341
- {
342
- "epoch": 0.37836589533970716,
343
- "eval_loss": 0.9830868584380302,
344
- "eval_mse_loss": 0.9830868584380302,
345
- "step": 8192
346
- },
347
- {
348
- "epoch": 0.37836589533970716,
349
- "eval_loss": 0.9830868584380302,
350
- "eval_mse_loss": 0.9830868584380302,
351
- "eval_runtime": 100.3923,
352
- "eval_samples_per_second": 278.836,
353
- "eval_steps_per_second": 4.363,
354
- "step": 8192
355
- },
356
- {
357
- "epoch": 0.390189829569073,
358
- "grad_norm": 0.3661426603794098,
359
- "learning_rate": 0.0002862731301913133,
360
- "loss": 1.0279,
361
- "step": 8448
362
- },
363
- {
364
- "epoch": 0.4020137637984389,
365
- "grad_norm": 0.3046676218509674,
366
- "learning_rate": 0.0002791536153345875,
367
- "loss": 1.0275,
368
- "step": 8704
369
- },
370
- {
371
- "epoch": 0.41383769802780473,
372
- "grad_norm": 0.3376181721687317,
373
- "learning_rate": 0.0002719130933454701,
374
- "loss": 1.0282,
375
- "step": 8960
376
- },
377
- {
378
- "epoch": 0.4256616322571706,
379
- "grad_norm": 0.3915064334869385,
380
- "learning_rate": 0.00026456263326735493,
381
- "loss": 1.0307,
382
- "step": 9216
383
- },
384
- {
385
- "epoch": 0.4256616322571706,
386
- "eval_loss": 0.984883288406346,
387
- "eval_mse_loss": 0.984883288406346,
388
- "step": 9216
389
- },
390
- {
391
- "epoch": 0.4256616322571706,
392
- "eval_loss": 0.984883288406346,
393
- "eval_mse_loss": 0.984883288406346,
394
- "eval_runtime": 100.2629,
395
- "eval_samples_per_second": 279.196,
396
- "eval_steps_per_second": 4.369,
397
- "step": 9216
398
- },
399
- {
400
- "epoch": 0.4374855664865364,
401
- "grad_norm": 0.33798637986183167,
402
- "learning_rate": 0.0002571134722129381,
403
- "loss": 1.0297,
404
- "step": 9472
405
- },
406
- {
407
- "epoch": 0.44930950071590225,
408
- "grad_norm": 0.2875025272369385,
409
- "learning_rate": 0.0002495769981853336,
410
- "loss": 1.0315,
411
- "step": 9728
412
- },
413
- {
414
- "epoch": 0.4611334349452681,
415
- "grad_norm": 0.33755892515182495,
416
- "learning_rate": 0.0002419647326685136,
417
- "loss": 1.031,
418
- "step": 9984
419
- },
420
- {
421
- "epoch": 0.472957369174634,
422
- "grad_norm": 0.2760786712169647,
423
- "learning_rate": 0.00023431840867685062,
424
- "loss": 1.0319,
425
- "step": 10240
426
- },
427
- {
428
- "epoch": 0.472957369174634,
429
- "eval_loss": 0.9867455217664101,
430
- "eval_mse_loss": 0.9867455217664101,
431
- "step": 10240
432
- },
433
- {
434
- "epoch": 0.472957369174634,
435
- "eval_loss": 0.9867455217664101,
436
- "eval_mse_loss": 0.9867455217664101,
437
- "eval_runtime": 100.0615,
438
- "eval_samples_per_second": 279.758,
439
- "eval_steps_per_second": 4.377,
440
- "step": 10240
441
- }
442
- ],
443
- "logging_steps": 256,
444
- "max_steps": 21651,
445
- "num_input_tokens_seen": 0,
446
- "num_train_epochs": 1,
447
- "save_steps": 1024,
448
- "stateful_callbacks": {
449
- "TrainerControl": {
450
- "args": {
451
- "should_epoch_stop": false,
452
- "should_evaluate": false,
453
- "should_log": false,
454
- "should_save": true,
455
- "should_training_stop": false
456
- },
457
- "attributes": {}
458
- }
459
- },
460
- "total_flos": 0.0,
461
- "train_batch_size": 64,
462
- "trial_name": null,
463
- "trial_params": null
464
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-10240/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cb9674560e8f032729757b059398ec449fd02c0ccb60413a5d94de5b6442629
3
- size 5777
 
 
 
 
checkpoints/checkpoint-21651/ema.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:feaea8def4267d7feb8cdf3fa3b5e5783c1d9ec500b941ac06d1d3ba82f8388d
3
- size 634904
 
 
 
 
checkpoints/checkpoint-21651/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:93d2c6522a514560e4df605c64e92cb78dac8fe345eba6a8eca9ecd6f292870a
3
- size 19272752
 
 
 
 
checkpoints/checkpoint-21651/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fa293976ee742dd2f3feb412e7fc165c5c4169ba85540aafda7ad68ce306239
3
- size 1337227
 
 
 
 
checkpoints/checkpoint-21651/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cbe65e3891168fa6d91b79ffd0ce826f20fd596a538e5897552ab55ecbe73a0
3
- size 14645
 
 
 
 
checkpoints/checkpoint-21651/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b4840253f689f85b53507bf8bfcf7219c70eaf98d96eb849f5c45b1fe52ac00
3
- size 1383
 
 
 
 
checkpoints/checkpoint-21651/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ff448be4b4d90a62cca3d12f4b0e7a0f6acf476b040d09aac40be4d55ce181c
3
- size 1465
 
 
 
 
checkpoints/checkpoint-21651/trainer_state.json DELETED
@@ -1,937 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
- "eval_steps": 1024,
7
- "global_step": 21651,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.011823934229365849,
14
- "grad_norm": 0.11696221679449081,
15
- "learning_rate": 9.418282548476454e-05,
16
- "loss": 1.568,
17
- "step": 256
18
- },
19
- {
20
- "epoch": 0.023647868458731697,
21
- "grad_norm": 0.14284025132656097,
22
- "learning_rate": 0.00018873499538319484,
23
- "loss": 1.4627,
24
- "step": 512
25
- },
26
- {
27
- "epoch": 0.03547180268809755,
28
- "grad_norm": 0.14623360335826874,
29
- "learning_rate": 0.00028328716528162515,
30
- "loss": 1.3763,
31
- "step": 768
32
- },
33
- {
34
- "epoch": 0.047295736917463395,
35
- "grad_norm": 0.1999262273311615,
36
- "learning_rate": 0.0003778393351800554,
37
- "loss": 1.325,
38
- "step": 1024
39
- },
40
- {
41
- "epoch": 0.047295736917463395,
42
- "eval_loss": 1.2694775628172643,
43
- "eval_mse_loss": 1.2694775628172643,
44
- "step": 1024
45
- },
46
- {
47
- "epoch": 0.047295736917463395,
48
- "eval_loss": 1.2694775628172643,
49
- "eval_mse_loss": 1.2694775628172643,
50
- "eval_runtime": 99.7615,
51
- "eval_samples_per_second": 280.599,
52
- "eval_steps_per_second": 4.39,
53
- "step": 1024
54
- },
55
- {
56
- "epoch": 0.05911967114682925,
57
- "grad_norm": 0.17195427417755127,
58
- "learning_rate": 0.000399910381984159,
59
- "loss": 1.2775,
60
- "step": 1280
61
- },
62
- {
63
- "epoch": 0.0709436053761951,
64
- "grad_norm": 0.24726267158985138,
65
- "learning_rate": 0.0003995235470960099,
66
- "loss": 1.2335,
67
- "step": 1536
68
- },
69
- {
70
- "epoch": 0.08276753960556095,
71
- "grad_norm": 0.29991263151168823,
72
- "learning_rate": 0.000398831687956263,
73
- "loss": 1.2043,
74
- "step": 1792
75
- },
76
- {
77
- "epoch": 0.09459147383492679,
78
- "grad_norm": 0.2526465356349945,
79
- "learning_rate": 0.00039783586225369426,
80
- "loss": 1.1758,
81
- "step": 2048
82
- },
83
- {
84
- "epoch": 0.09459147383492679,
85
- "eval_loss": 1.1193939001048536,
86
- "eval_mse_loss": 1.1193939001048536,
87
- "step": 2048
88
- },
89
- {
90
- "epoch": 0.09459147383492679,
91
- "eval_loss": 1.1193939001048536,
92
- "eval_mse_loss": 1.1193939001048536,
93
- "eval_runtime": 99.592,
94
- "eval_samples_per_second": 281.077,
95
- "eval_steps_per_second": 4.398,
96
- "step": 2048
97
- },
98
- {
99
- "epoch": 0.10641540806429264,
100
- "grad_norm": 0.23426194489002228,
101
- "learning_rate": 0.0003965375923699678,
102
- "loss": 1.1514,
103
- "step": 2304
104
- },
105
- {
106
- "epoch": 0.1182393422936585,
107
- "grad_norm": 0.2490096390247345,
108
- "learning_rate": 0.0003949388630522747,
109
- "loss": 1.1287,
110
- "step": 2560
111
- },
112
- {
113
- "epoch": 0.13006327652302435,
114
- "grad_norm": 0.3001377582550049,
115
- "learning_rate": 0.0003930421183791246,
116
- "loss": 1.1064,
117
- "step": 2816
118
- },
119
- {
120
- "epoch": 0.1418872107523902,
121
- "grad_norm": 0.31271305680274963,
122
- "learning_rate": 0.00039085025802392876,
123
- "loss": 1.0907,
124
- "step": 3072
125
- },
126
- {
127
- "epoch": 0.1418872107523902,
128
- "eval_loss": 1.0367718265481192,
129
- "eval_mse_loss": 1.0367718265481192,
130
- "step": 3072
131
- },
132
- {
133
- "epoch": 0.1418872107523902,
134
- "eval_loss": 1.0367718265481192,
135
- "eval_mse_loss": 1.0367718265481192,
136
- "eval_runtime": 99.0318,
137
- "eval_samples_per_second": 282.667,
138
- "eval_steps_per_second": 4.423,
139
- "step": 3072
140
- },
141
- {
142
- "epoch": 0.15371114498175603,
143
- "grad_norm": 0.31617286801338196,
144
- "learning_rate": 0.0003883666328220855,
145
- "loss": 1.0795,
146
- "step": 3328
147
- },
148
- {
149
- "epoch": 0.1655350792111219,
150
- "grad_norm": 0.3006609082221985,
151
- "learning_rate": 0.00038559503964834713,
152
- "loss": 1.072,
153
- "step": 3584
154
- },
155
- {
156
- "epoch": 0.17735901344048774,
157
- "grad_norm": 0.3017251491546631,
158
- "learning_rate": 0.00038253971561229733,
159
- "loss": 1.0632,
160
- "step": 3840
161
- },
162
- {
163
- "epoch": 0.18918294766985358,
164
- "grad_norm": 0.29505959153175354,
165
- "learning_rate": 0.00037920533158081383,
166
- "loss": 1.0583,
167
- "step": 4096
168
- },
169
- {
170
- "epoch": 0.18918294766985358,
171
- "eval_loss": 1.0053560841301261,
172
- "eval_mse_loss": 1.0053560841301261,
173
- "step": 4096
174
- },
175
- {
176
- "epoch": 0.18918294766985358,
177
- "eval_loss": 1.0053560841301261,
178
- "eval_mse_loss": 1.0053560841301261,
179
- "eval_runtime": 99.7606,
180
- "eval_samples_per_second": 280.602,
181
- "eval_steps_per_second": 4.391,
182
- "step": 4096
183
- },
184
- {
185
- "epoch": 0.20100688189921945,
186
- "grad_norm": 0.327700138092041,
187
- "learning_rate": 0.00037559698503741954,
188
- "loss": 1.0539,
189
- "step": 4352
190
- },
191
- {
192
- "epoch": 0.2128308161285853,
193
- "grad_norm": 0.3910951316356659,
194
- "learning_rate": 0.0003717201922894372,
195
- "loss": 1.0525,
196
- "step": 4608
197
- },
198
- {
199
- "epoch": 0.22465475035795113,
200
- "grad_norm": 0.3237537443637848,
201
- "learning_rate": 0.0003675808800348619,
202
- "loss": 1.0461,
203
- "step": 4864
204
- },
205
- {
206
- "epoch": 0.236478684587317,
207
- "grad_norm": 0.4316788911819458,
208
- "learning_rate": 0.0003631853763018435,
209
- "loss": 1.042,
210
- "step": 5120
211
- },
212
- {
213
- "epoch": 0.236478684587317,
214
- "eval_loss": 0.9950277422933274,
215
- "eval_mse_loss": 0.9950277422933274,
216
- "step": 5120
217
- },
218
- {
219
- "epoch": 0.236478684587317,
220
- "eval_loss": 0.9950277422933274,
221
- "eval_mse_loss": 0.9950277422933274,
222
- "eval_runtime": 99.7527,
223
- "eval_samples_per_second": 280.624,
224
- "eval_steps_per_second": 4.391,
225
- "step": 5120
226
- },
227
- {
228
- "epoch": 0.24830261881668284,
229
- "grad_norm": 0.32822364568710327,
230
- "learning_rate": 0.0003585404007746302,
231
- "loss": 1.0385,
232
- "step": 5376
233
- },
234
- {
235
- "epoch": 0.2601265530460487,
236
- "grad_norm": 0.40738987922668457,
237
- "learning_rate": 0.0003536530545207627,
238
- "loss": 1.0378,
239
- "step": 5632
240
- },
241
- {
242
- "epoch": 0.27195048727541454,
243
- "grad_norm": 0.318866103887558,
244
- "learning_rate": 0.0003485308091352234,
245
- "loss": 1.0375,
246
- "step": 5888
247
- },
248
- {
249
- "epoch": 0.2837744215047804,
250
- "grad_norm": 0.34148862957954407,
251
- "learning_rate": 0.0003431814953181376,
252
- "loss": 1.0351,
253
- "step": 6144
254
- },
255
- {
256
- "epoch": 0.2837744215047804,
257
- "eval_loss": 0.9900110875634842,
258
- "eval_mse_loss": 0.9900110875634842,
259
- "step": 6144
260
- },
261
- {
262
- "epoch": 0.2837744215047804,
263
- "eval_loss": 0.9900110875634842,
264
- "eval_mse_loss": 0.9900110875634842,
265
- "eval_runtime": 99.6906,
266
- "eval_samples_per_second": 280.799,
267
- "eval_steps_per_second": 4.394,
268
- "step": 6144
269
- },
270
- {
271
- "epoch": 0.2955983557341462,
272
- "grad_norm": 0.2927286624908447,
273
- "learning_rate": 0.0003376132909034871,
274
- "loss": 1.0342,
275
- "step": 6400
276
- },
277
- {
278
- "epoch": 0.30742228996351206,
279
- "grad_norm": 0.3355765640735626,
280
- "learning_rate": 0.000331834708357139,
281
- "loss": 1.0334,
282
- "step": 6656
283
- },
284
- {
285
- "epoch": 0.3192462241928779,
286
- "grad_norm": 0.3341493308544159,
287
- "learning_rate": 0.00032585458176330104,
288
- "loss": 1.0315,
289
- "step": 6912
290
- },
291
- {
292
- "epoch": 0.3310701584222438,
293
- "grad_norm": 0.3863889276981354,
294
- "learning_rate": 0.0003196820533192994,
295
- "loss": 1.0316,
296
- "step": 7168
297
- },
298
- {
299
- "epoch": 0.3310701584222438,
300
- "eval_loss": 0.9869283731669596,
301
- "eval_mse_loss": 0.9869283731669596,
302
- "step": 7168
303
- },
304
- {
305
- "epoch": 0.3310701584222438,
306
- "eval_loss": 0.9869283731669596,
307
- "eval_mse_loss": 0.9869283731669596,
308
- "eval_runtime": 99.9137,
309
- "eval_samples_per_second": 280.172,
310
- "eval_steps_per_second": 4.384,
311
- "step": 7168
312
- },
313
- {
314
- "epoch": 0.34289409265160964,
315
- "grad_norm": 0.37314340472221375,
316
- "learning_rate": 0.00031332655935932356,
317
- "loss": 1.0304,
318
- "step": 7424
319
- },
320
- {
321
- "epoch": 0.3547180268809755,
322
- "grad_norm": 0.31663990020751953,
323
- "learning_rate": 0.0003067978159285067,
324
- "loss": 1.0291,
325
- "step": 7680
326
- },
327
- {
328
- "epoch": 0.3665419611103413,
329
- "grad_norm": 0.3562430143356323,
330
- "learning_rate": 0.0003001058039293929,
331
- "loss": 1.0282,
332
- "step": 7936
333
- },
334
- {
335
- "epoch": 0.37836589533970716,
336
- "grad_norm": 0.32352393865585327,
337
- "learning_rate": 0.0002932607538635016,
338
- "loss": 1.0289,
339
- "step": 8192
340
- },
341
- {
342
- "epoch": 0.37836589533970716,
343
- "eval_loss": 0.9830868584380302,
344
- "eval_mse_loss": 0.9830868584380302,
345
- "step": 8192
346
- },
347
- {
348
- "epoch": 0.37836589533970716,
349
- "eval_loss": 0.9830868584380302,
350
- "eval_mse_loss": 0.9830868584380302,
351
- "eval_runtime": 100.3923,
352
- "eval_samples_per_second": 278.836,
353
- "eval_steps_per_second": 4.363,
354
- "step": 8192
355
- },
356
- {
357
- "epoch": 0.390189829569073,
358
- "grad_norm": 0.3661426603794098,
359
- "learning_rate": 0.0002862731301913133,
360
- "loss": 1.0279,
361
- "step": 8448
362
- },
363
- {
364
- "epoch": 0.4020137637984389,
365
- "grad_norm": 0.3046676218509674,
366
- "learning_rate": 0.0002791536153345875,
367
- "loss": 1.0275,
368
- "step": 8704
369
- },
370
- {
371
- "epoch": 0.41383769802780473,
372
- "grad_norm": 0.3376181721687317,
373
- "learning_rate": 0.0002719130933454701,
374
- "loss": 1.0282,
375
- "step": 8960
376
- },
377
- {
378
- "epoch": 0.4256616322571706,
379
- "grad_norm": 0.3915064334869385,
380
- "learning_rate": 0.00026456263326735493,
381
- "loss": 1.0307,
382
- "step": 9216
383
- },
384
- {
385
- "epoch": 0.4256616322571706,
386
- "eval_loss": 0.984883288406346,
387
- "eval_mse_loss": 0.984883288406346,
388
- "step": 9216
389
- },
390
- {
391
- "epoch": 0.4256616322571706,
392
- "eval_loss": 0.984883288406346,
393
- "eval_mse_loss": 0.984883288406346,
394
- "eval_runtime": 100.2629,
395
- "eval_samples_per_second": 279.196,
396
- "eval_steps_per_second": 4.369,
397
- "step": 9216
398
- },
399
- {
400
- "epoch": 0.4374855664865364,
401
- "grad_norm": 0.33798637986183167,
402
- "learning_rate": 0.0002571134722129381,
403
- "loss": 1.0297,
404
- "step": 9472
405
- },
406
- {
407
- "epoch": 0.44930950071590225,
408
- "grad_norm": 0.2875025272369385,
409
- "learning_rate": 0.0002495769981853336,
410
- "loss": 1.0315,
411
- "step": 9728
412
- },
413
- {
414
- "epoch": 0.4611334349452681,
415
- "grad_norm": 0.33755892515182495,
416
- "learning_rate": 0.0002419647326685136,
417
- "loss": 1.031,
418
- "step": 9984
419
- },
420
- {
421
- "epoch": 0.472957369174634,
422
- "grad_norm": 0.2760786712169647,
423
- "learning_rate": 0.00023431840867685062,
424
- "loss": 1.0319,
425
- "step": 10240
426
- },
427
- {
428
- "epoch": 0.472957369174634,
429
- "eval_loss": 0.9867455217664101,
430
- "eval_mse_loss": 0.9867455217664101,
431
- "step": 10240
432
- },
433
- {
434
- "epoch": 0.472957369174634,
435
- "eval_loss": 0.9867455217664101,
436
- "eval_mse_loss": 0.9867455217664101,
437
- "eval_runtime": 100.0615,
438
- "eval_samples_per_second": 279.758,
439
- "eval_steps_per_second": 4.377,
440
- "step": 10240
441
- },
442
- {
443
- "epoch": 0.48478130340399983,
444
- "grad_norm": 0.3291190266609192,
445
- "learning_rate": 0.00022658975213149085,
446
- "loss": 1.0324,
447
- "step": 10496
448
- },
449
- {
450
- "epoch": 0.49660523763336567,
451
- "grad_norm": 0.3030407130718231,
452
- "learning_rate": 0.00021882044615220646,
453
- "loss": 1.0305,
454
- "step": 10752
455
- },
456
- {
457
- "epoch": 0.5084291718627315,
458
- "grad_norm": 0.2691882848739624,
459
- "learning_rate": 0.00021105286996472446,
460
- "loss": 1.0362,
461
- "step": 11008
462
- },
463
- {
464
- "epoch": 0.5202531060920974,
465
- "grad_norm": 0.2871313989162445,
466
- "learning_rate": 0.00020323798398011488,
467
- "loss": 1.0375,
468
- "step": 11264
469
- },
470
- {
471
- "epoch": 0.5202531060920974,
472
- "eval_loss": 0.9919488762067333,
473
- "eval_mse_loss": 0.9919488762067333,
474
- "step": 11264
475
- },
476
- {
477
- "epoch": 0.5202531060920974,
478
- "eval_loss": 0.9919488762067333,
479
- "eval_mse_loss": 0.9919488762067333,
480
- "eval_runtime": 99.6352,
481
- "eval_samples_per_second": 280.955,
482
- "eval_steps_per_second": 4.396,
483
- "step": 11264
484
- },
485
- {
486
- "epoch": 0.5320770403214632,
487
- "grad_norm": 0.34249189496040344,
488
- "learning_rate": 0.0001954181478848312,
489
- "loss": 1.035,
490
- "step": 11520
491
- },
492
- {
493
- "epoch": 0.5439009745508291,
494
- "grad_norm": 0.30227038264274597,
495
- "learning_rate": 0.00018760531635634027,
496
- "loss": 1.0401,
497
- "step": 11776
498
- },
499
- {
500
- "epoch": 0.5557249087801949,
501
- "grad_norm": 0.3367672860622406,
502
- "learning_rate": 0.0001798114333637852,
503
- "loss": 1.0416,
504
- "step": 12032
505
- },
506
- {
507
- "epoch": 0.5675488430095608,
508
- "grad_norm": 0.35392817854881287,
509
- "learning_rate": 0.00017204841390848513,
510
- "loss": 1.0407,
511
- "step": 12288
512
- },
513
- {
514
- "epoch": 0.5675488430095608,
515
- "eval_loss": 1.0043473209692464,
516
- "eval_mse_loss": 1.0043473209692464,
517
- "step": 12288
518
- },
519
- {
520
- "epoch": 0.5675488430095608,
521
- "eval_loss": 1.0043473209692464,
522
- "eval_mse_loss": 1.0043473209692464,
523
- "eval_runtime": 99.386,
524
- "eval_samples_per_second": 281.659,
525
- "eval_steps_per_second": 4.407,
526
- "step": 12288
527
- },
528
- {
529
- "epoch": 0.5793727772389267,
530
- "grad_norm": 0.2517242431640625,
531
- "learning_rate": 0.00016432812580871982,
532
- "loss": 1.0431,
533
- "step": 12544
534
- },
535
- {
536
- "epoch": 0.5911967114682924,
537
- "grad_norm": 0.31317463517189026,
538
- "learning_rate": 0.00015666237155664652,
539
- "loss": 1.0451,
540
- "step": 12800
541
- },
542
- {
543
- "epoch": 0.6030206456976583,
544
- "grad_norm": 0.27259454131126404,
545
- "learning_rate": 0.00014906287027508398,
546
- "loss": 1.0491,
547
- "step": 13056
548
- },
549
- {
550
- "epoch": 0.6148445799270241,
551
- "grad_norm": 0.2674678862094879,
552
- "learning_rate": 0.00014154123980174944,
553
- "loss": 1.0467,
554
- "step": 13312
555
- },
556
- {
557
- "epoch": 0.6148445799270241,
558
- "eval_loss": 1.0088394922480735,
559
- "eval_mse_loss": 1.0088394922480735,
560
- "step": 13312
561
- },
562
- {
563
- "epoch": 0.6148445799270241,
564
- "eval_loss": 1.0088394922480735,
565
- "eval_mse_loss": 1.0088394922480735,
566
- "eval_runtime": 100.1788,
567
- "eval_samples_per_second": 279.43,
568
- "eval_steps_per_second": 4.372,
569
- "step": 13312
570
- },
571
- {
572
- "epoch": 0.62666851415639,
573
- "grad_norm": 0.3362578749656677,
574
- "learning_rate": 0.00013410897892833596,
575
- "loss": 1.049,
576
- "step": 13568
577
- },
578
- {
579
- "epoch": 0.6384924483857558,
580
- "grad_norm": 0.26616010069847107,
581
- "learning_rate": 0.00012677744982158334,
582
- "loss": 1.0532,
583
- "step": 13824
584
- },
585
- {
586
- "epoch": 0.6503163826151217,
587
- "grad_norm": 0.2684532403945923,
588
- "learning_rate": 0.00011955786065321544,
589
- "loss": 1.0556,
590
- "step": 14080
591
- },
592
- {
593
- "epoch": 0.6621403168444876,
594
- "grad_norm": 0.24931219220161438,
595
- "learning_rate": 0.00011246124846530075,
596
- "loss": 1.0575,
597
- "step": 14336
598
- },
599
- {
600
- "epoch": 0.6621403168444876,
601
- "eval_loss": 1.0203365499842656,
602
- "eval_mse_loss": 1.0203365499842656,
603
- "step": 14336
604
- },
605
- {
606
- "epoch": 0.6621403168444876,
607
- "eval_loss": 1.0203365499842656,
608
- "eval_mse_loss": 1.0203365499842656,
609
- "eval_runtime": 99.6009,
610
- "eval_samples_per_second": 281.052,
611
- "eval_steps_per_second": 4.398,
612
- "step": 14336
613
- },
614
- {
615
- "epoch": 0.6739642510738534,
616
- "grad_norm": 0.2652958929538727,
617
- "learning_rate": 0.00010549846229722837,
618
- "loss": 1.0648,
619
- "step": 14592
620
- },
621
- {
622
- "epoch": 0.6857881853032193,
623
- "grad_norm": 0.2742628753185272,
624
- "learning_rate": 9.868014660009637e-05,
625
- "loss": 1.0651,
626
- "step": 14848
627
- },
628
- {
629
- "epoch": 0.6976121195325851,
630
- "grad_norm": 0.2334495335817337,
631
- "learning_rate": 9.204243934249532e-05,
632
- "loss": 1.0699,
633
- "step": 15104
634
- },
635
- {
636
- "epoch": 0.709436053761951,
637
- "grad_norm": 0.2503329813480377,
638
- "learning_rate": 8.554343421704003e-05,
639
- "loss": 1.0706,
640
- "step": 15360
641
- },
642
- {
643
- "epoch": 0.709436053761951,
644
- "eval_loss": 1.030193983828096,
645
- "eval_mse_loss": 1.030193983828096,
646
- "step": 15360
647
- },
648
- {
649
- "epoch": 0.709436053761951,
650
- "eval_loss": 1.030193983828096,
651
- "eval_mse_loss": 1.030193983828096,
652
- "eval_runtime": 100.1105,
653
- "eval_samples_per_second": 279.621,
654
- "eval_steps_per_second": 4.375,
655
- "step": 15360
656
- },
657
- {
658
- "epoch": 0.7212599879913169,
659
- "grad_norm": 0.2387477457523346,
660
- "learning_rate": 7.921940607463536e-05,
661
- "loss": 1.0736,
662
- "step": 15616
663
- },
664
- {
665
- "epoch": 0.7330839222206826,
666
- "grad_norm": 0.2524156868457794,
667
- "learning_rate": 7.308002285663097e-05,
668
- "loss": 1.0767,
669
- "step": 15872
670
- },
671
- {
672
- "epoch": 0.7449078564500485,
673
- "grad_norm": 0.23690827190876007,
674
- "learning_rate": 6.713467022601562e-05,
675
- "loss": 1.0788,
676
- "step": 16128
677
- },
678
- {
679
- "epoch": 0.7567317906794143,
680
- "grad_norm": 0.25175976753234863,
681
- "learning_rate": 6.139243721896071e-05,
682
- "loss": 1.0853,
683
- "step": 16384
684
- },
685
- {
686
- "epoch": 0.7567317906794143,
687
- "eval_loss": 1.038661680809439,
688
- "eval_mse_loss": 1.038661680809439,
689
- "step": 16384
690
- },
691
- {
692
- "epoch": 0.7567317906794143,
693
- "eval_loss": 1.038661680809439,
694
- "eval_mse_loss": 1.038661680809439,
695
- "eval_runtime": 100.1169,
696
- "eval_samples_per_second": 279.603,
697
- "eval_steps_per_second": 4.375,
698
- "step": 16384
699
- },
700
- {
701
- "epoch": 0.7685557249087802,
702
- "grad_norm": 0.21369080245494843,
703
- "learning_rate": 5.586210234983709e-05,
704
- "loss": 1.0881,
705
- "step": 16640
706
- },
707
- {
708
- "epoch": 0.780379659138146,
709
- "grad_norm": 0.20162087678909302,
710
- "learning_rate": 5.057242299355618e-05,
711
- "loss": 1.0924,
712
- "step": 16896
713
- },
714
- {
715
- "epoch": 0.7922035933675119,
716
- "grad_norm": 0.24671520292758942,
717
- "learning_rate": 4.549000344276444e-05,
718
- "loss": 1.0936,
719
- "step": 17152
720
- },
721
- {
722
- "epoch": 0.8040275275968778,
723
- "grad_norm": 0.21685843169689178,
724
- "learning_rate": 4.0643793085073956e-05,
725
- "loss": 1.0967,
726
- "step": 17408
727
- },
728
- {
729
- "epoch": 0.8040275275968778,
730
- "eval_loss": 1.0570238397273843,
731
- "eval_mse_loss": 1.0570238397273843,
732
- "step": 17408
733
- },
734
- {
735
- "epoch": 0.8040275275968778,
736
- "eval_loss": 1.0570238397273843,
737
- "eval_mse_loss": 1.0570238397273843,
738
- "eval_runtime": 100.299,
739
- "eval_samples_per_second": 279.095,
740
- "eval_steps_per_second": 4.367,
741
- "step": 17408
742
- },
743
- {
744
- "epoch": 0.8158514618262436,
745
- "grad_norm": 0.22125214338302612,
746
- "learning_rate": 3.604120062842402e-05,
747
- "loss": 1.1033,
748
- "step": 17664
749
- },
750
- {
751
- "epoch": 0.8276753960556095,
752
- "grad_norm": 0.2175501137971878,
753
- "learning_rate": 3.168926234667766e-05,
754
- "loss": 1.1094,
755
- "step": 17920
756
- },
757
- {
758
- "epoch": 0.8394993302849753,
759
- "grad_norm": 0.2011047899723053,
760
- "learning_rate": 2.759463132282547e-05,
761
- "loss": 1.1108,
762
- "step": 18176
763
- },
764
- {
765
- "epoch": 0.8513232645143411,
766
- "grad_norm": 0.2260669618844986,
767
- "learning_rate": 2.3763567277996935e-05,
768
- "loss": 1.1152,
769
- "step": 18432
770
- },
771
- {
772
- "epoch": 0.8513232645143411,
773
- "eval_loss": 1.0761699030116267,
774
- "eval_mse_loss": 1.0761699030116267,
775
- "step": 18432
776
- },
777
- {
778
- "epoch": 0.8513232645143411,
779
- "eval_loss": 1.0761699030116267,
780
- "eval_mse_loss": 1.0761699030116267,
781
- "eval_runtime": 99.8777,
782
- "eval_samples_per_second": 280.273,
783
- "eval_steps_per_second": 4.385,
784
- "step": 18432
785
- },
786
- {
787
- "epoch": 0.8631471987437069,
788
- "grad_norm": 0.18276292085647583,
789
- "learning_rate": 2.0201927001829836e-05,
790
- "loss": 1.1203,
791
- "step": 18688
792
- },
793
- {
794
- "epoch": 0.8749711329730728,
795
- "grad_norm": 0.172455832362175,
796
- "learning_rate": 1.6915155398825423e-05,
797
- "loss": 1.124,
798
- "step": 18944
799
- },
800
- {
801
- "epoch": 0.8867950672024387,
802
- "grad_norm": 0.20908404886722565,
803
- "learning_rate": 1.3919472160342173e-05,
804
- "loss": 1.1311,
805
- "step": 19200
806
- },
807
- {
808
- "epoch": 0.8986190014318045,
809
- "grad_norm": 0.19443683326244354,
810
- "learning_rate": 1.1195964427203121e-05,
811
- "loss": 1.1341,
812
- "step": 19456
813
- },
814
- {
815
- "epoch": 0.8986190014318045,
816
- "eval_loss": 1.0952530190280583,
817
- "eval_mse_loss": 1.0952530190280583,
818
- "step": 19456
819
- },
820
- {
821
- "epoch": 0.8986190014318045,
822
- "eval_loss": 1.0952530190280583,
823
- "eval_mse_loss": 1.0952530190280583,
824
- "eval_runtime": 100.0076,
825
- "eval_samples_per_second": 279.909,
826
- "eval_steps_per_second": 4.38,
827
- "step": 19456
828
- },
829
- {
830
- "epoch": 0.9104429356611704,
831
- "grad_norm": 0.16602899134159088,
832
- "learning_rate": 8.761093351177674e-06,
833
- "loss": 1.1367,
834
- "step": 19712
835
- },
836
- {
837
- "epoch": 0.9222668698905362,
838
- "grad_norm": 0.1926073431968689,
839
- "learning_rate": 6.618581273505786e-06,
840
- "loss": 1.145,
841
- "step": 19968
842
- },
843
- {
844
- "epoch": 0.9340908041199021,
845
- "grad_norm": 0.19851690530776978,
846
- "learning_rate": 4.771703587756516e-06,
847
- "loss": 1.1507,
848
- "step": 20224
849
- },
850
- {
851
- "epoch": 0.945914738349268,
852
- "grad_norm": 0.19018347561359406,
853
- "learning_rate": 3.223283732526894e-06,
854
- "loss": 1.1569,
855
- "step": 20480
856
- },
857
- {
858
- "epoch": 0.945914738349268,
859
- "eval_loss": 1.1172848334323326,
860
- "eval_mse_loss": 1.1172848334323326,
861
- "step": 20480
862
- },
863
- {
864
- "epoch": 0.945914738349268,
865
- "eval_loss": 1.1172848334323326,
866
- "eval_mse_loss": 1.1172848334323326,
867
- "eval_runtime": 101.775,
868
- "eval_samples_per_second": 275.048,
869
- "eval_steps_per_second": 4.304,
870
- "step": 20480
871
- },
872
- {
873
- "epoch": 0.9577386725786338,
874
- "grad_norm": 0.1803227663040161,
875
- "learning_rate": 1.9756888750729786e-06,
876
- "loss": 1.1608,
877
- "step": 20736
878
- },
879
- {
880
- "epoch": 0.9695626068079997,
881
- "grad_norm": 0.1926874816417694,
882
- "learning_rate": 1.0308262924717538e-06,
883
- "loss": 1.1687,
884
- "step": 20992
885
- },
886
- {
887
- "epoch": 0.9813865410373654,
888
- "grad_norm": 0.21967464685440063,
889
- "learning_rate": 3.90140455846133e-07,
890
- "loss": 1.175,
891
- "step": 21248
892
- },
893
- {
894
- "epoch": 0.9932104752667313,
895
- "grad_norm": 0.16599184274673462,
896
- "learning_rate": 5.532698894212818e-08,
897
- "loss": 1.1762,
898
- "step": 21504
899
- },
900
- {
901
- "epoch": 0.9932104752667313,
902
- "eval_loss": 1.1381378846081425,
903
- "eval_mse_loss": 1.1381378846081425,
904
- "step": 21504
905
- },
906
- {
907
- "epoch": 0.9932104752667313,
908
- "eval_loss": 1.1381378846081425,
909
- "eval_mse_loss": 1.1381378846081425,
910
- "eval_runtime": 99.4896,
911
- "eval_samples_per_second": 281.366,
912
- "eval_steps_per_second": 4.402,
913
- "step": 21504
914
- }
915
- ],
916
- "logging_steps": 256,
917
- "max_steps": 21651,
918
- "num_input_tokens_seen": 0,
919
- "num_train_epochs": 1,
920
- "save_steps": 1024,
921
- "stateful_callbacks": {
922
- "TrainerControl": {
923
- "args": {
924
- "should_epoch_stop": false,
925
- "should_evaluate": false,
926
- "should_log": false,
927
- "should_save": true,
928
- "should_training_stop": true
929
- },
930
- "attributes": {}
931
- }
932
- },
933
- "total_flos": 0.0,
934
- "train_batch_size": 64,
935
- "trial_name": null,
936
- "trial_params": null
937
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/checkpoint-21651/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c9b601f2d9cc5910b4d51f85ab22a2e835b407c81b98749f8408466eb47ba16
3
- size 5777