mgh6 commited on
Commit
94eb086
·
verified ·
1 Parent(s): a1b6c15

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95e876461f6941ed752cf45d9ac9302d2fcd14f16fa213ea0d206586aa9f8ad0
3
- size 2695611744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66bf7ec1906085b9017dec52af7ee1d8cb6d38fa8ca3a526334a82ecc740dcbd
3
+ size 2708729576
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efc1f9e0d1f0ba25073b30251fc09e761cd12b900a86d5d2832f4e9ab7cf17ff
3
- size 26261260
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f48d93af9f829aae4cbfe51ee4fffab7189d58377b64586f31b892a561403e60
3
+ size 52499200
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ac7ad6975b02cb2fe1ae9b24f6d70c26049c580d43be5a2feb4f3aa6fc1aa47
3
  size 15006
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:838e024e0a852529a2257dc1dc52b7019dd292f53abc55b9fa61835eb1065232
3
  size 15006
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c42147d2487e975dcb8b015449194c61c9350b5c1b3a114ecd6e3942d3403969
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bca4dbe650e04bc8012dd3f1938dfb2a637329721abd75c3bd59d28a64007b54
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1804 +1,180 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.929233772571987,
5
  "eval_steps": 10,
6
- "global_step": 1280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.07808687164470474,
13
- "grad_norm": 498170.15625,
14
  "learning_rate": 9.921875000000001e-05,
15
- "loss": 24141.5234,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.07808687164470474,
20
- "eval_runtime": 19.5232,
21
- "eval_samples_per_second": 22.128,
22
- "eval_steps_per_second": 5.532,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.15617374328940947,
27
- "grad_norm": 281270.96875,
28
  "learning_rate": 9.84375e-05,
29
- "loss": 11375.4516,
30
  "step": 20
31
  },
32
  {
33
  "epoch": 0.15617374328940947,
34
- "eval_runtime": 19.5682,
35
- "eval_samples_per_second": 22.077,
36
- "eval_steps_per_second": 5.519,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.2342606149341142,
41
- "grad_norm": 567554.3125,
42
  "learning_rate": 9.765625e-05,
43
- "loss": 8222.268,
44
  "step": 30
45
  },
46
  {
47
  "epoch": 0.2342606149341142,
48
- "eval_runtime": 19.5935,
49
- "eval_samples_per_second": 22.048,
50
- "eval_steps_per_second": 5.512,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.31234748657881894,
55
- "grad_norm": 256184.6875,
56
  "learning_rate": 9.687500000000001e-05,
57
- "loss": 6837.6664,
58
  "step": 40
59
  },
60
  {
61
  "epoch": 0.31234748657881894,
62
- "eval_runtime": 19.5946,
63
- "eval_samples_per_second": 22.047,
64
- "eval_steps_per_second": 5.512,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.3904343582235237,
69
- "grad_norm": 365265.625,
70
  "learning_rate": 9.609375e-05,
71
- "loss": 5512.5262,
72
  "step": 50
73
  },
74
  {
75
  "epoch": 0.3904343582235237,
76
- "eval_runtime": 19.603,
77
- "eval_samples_per_second": 22.037,
78
- "eval_steps_per_second": 5.509,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.4685212298682284,
83
- "grad_norm": 318942.46875,
84
  "learning_rate": 9.53125e-05,
85
- "loss": 4918.2848,
86
  "step": 60
87
  },
88
  {
89
  "epoch": 0.4685212298682284,
90
- "eval_runtime": 19.625,
91
- "eval_samples_per_second": 22.013,
92
- "eval_steps_per_second": 5.503,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.5466081015129332,
97
- "grad_norm": 508354.03125,
98
  "learning_rate": 9.453125000000001e-05,
99
- "loss": 4611.2082,
100
  "step": 70
101
  },
102
  {
103
  "epoch": 0.5466081015129332,
104
- "eval_runtime": 19.629,
105
- "eval_samples_per_second": 22.008,
106
- "eval_steps_per_second": 5.502,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.6246949731576379,
111
- "grad_norm": 372461.53125,
112
  "learning_rate": 9.375e-05,
113
- "loss": 4174.407,
114
  "step": 80
115
  },
116
  {
117
  "epoch": 0.6246949731576379,
118
- "eval_runtime": 19.625,
119
- "eval_samples_per_second": 22.013,
120
- "eval_steps_per_second": 5.503,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.7027818448023426,
125
- "grad_norm": 196101.109375,
126
  "learning_rate": 9.296875e-05,
127
- "loss": 3694.2605,
128
  "step": 90
129
  },
130
  {
131
  "epoch": 0.7027818448023426,
132
- "eval_runtime": 19.6048,
133
- "eval_samples_per_second": 22.035,
134
- "eval_steps_per_second": 5.509,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.7808687164470474,
139
- "grad_norm": 324380.46875,
140
  "learning_rate": 9.21875e-05,
141
- "loss": 3561.6039,
142
  "step": 100
143
  },
144
  {
145
  "epoch": 0.7808687164470474,
146
- "eval_runtime": 19.6105,
147
- "eval_samples_per_second": 22.029,
148
- "eval_steps_per_second": 5.507,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.8589555880917521,
153
- "grad_norm": 253753.125,
154
  "learning_rate": 9.140625e-05,
155
- "loss": 3622.6652,
156
  "step": 110
157
  },
158
  {
159
  "epoch": 0.8589555880917521,
160
- "eval_runtime": 19.6179,
161
- "eval_samples_per_second": 22.021,
162
- "eval_steps_per_second": 5.505,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 0.9370424597364568,
167
- "grad_norm": 471811.90625,
168
  "learning_rate": 9.062500000000001e-05,
169
- "loss": 3647.8051,
170
  "step": 120
171
  },
172
  {
173
  "epoch": 0.9370424597364568,
174
- "eval_runtime": 19.628,
175
- "eval_samples_per_second": 22.009,
176
- "eval_steps_per_second": 5.502,
177
  "step": 120
178
- },
179
- {
180
- "epoch": 1.0078086871644705,
181
- "grad_norm": 688836.3125,
182
- "learning_rate": 8.984375e-05,
183
- "loss": 4461.798,
184
- "step": 130
185
- },
186
- {
187
- "epoch": 1.0078086871644705,
188
- "eval_runtime": 19.4608,
189
- "eval_samples_per_second": 22.198,
190
- "eval_steps_per_second": 5.55,
191
- "step": 130
192
- },
193
- {
194
- "epoch": 1.0858955588091752,
195
- "grad_norm": 564319.6875,
196
- "learning_rate": 8.90625e-05,
197
- "loss": 3695.1797,
198
- "step": 140
199
- },
200
- {
201
- "epoch": 1.0858955588091752,
202
- "eval_runtime": 19.5702,
203
- "eval_samples_per_second": 22.074,
204
- "eval_steps_per_second": 5.519,
205
- "step": 140
206
- },
207
- {
208
- "epoch": 1.16398243045388,
209
- "grad_norm": 297341.28125,
210
- "learning_rate": 8.828125000000001e-05,
211
- "loss": 3063.5557,
212
- "step": 150
213
- },
214
- {
215
- "epoch": 1.16398243045388,
216
- "eval_runtime": 19.6107,
217
- "eval_samples_per_second": 22.029,
218
- "eval_steps_per_second": 5.507,
219
- "step": 150
220
- },
221
- {
222
- "epoch": 1.2420693020985847,
223
- "grad_norm": 294460.375,
224
- "learning_rate": 8.75e-05,
225
- "loss": 3861.7359,
226
- "step": 160
227
- },
228
- {
229
- "epoch": 1.2420693020985847,
230
- "eval_runtime": 19.6236,
231
- "eval_samples_per_second": 22.014,
232
- "eval_steps_per_second": 5.504,
233
- "step": 160
234
- },
235
- {
236
- "epoch": 1.3201561737432894,
237
- "grad_norm": 446928.125,
238
- "learning_rate": 8.671875e-05,
239
- "loss": 3945.3711,
240
- "step": 170
241
- },
242
- {
243
- "epoch": 1.3201561737432894,
244
- "eval_runtime": 19.6104,
245
- "eval_samples_per_second": 22.029,
246
- "eval_steps_per_second": 5.507,
247
- "step": 170
248
- },
249
- {
250
- "epoch": 1.3982430453879942,
251
- "grad_norm": 477259.65625,
252
- "learning_rate": 8.593750000000001e-05,
253
- "loss": 3992.1383,
254
- "step": 180
255
- },
256
- {
257
- "epoch": 1.3982430453879942,
258
- "eval_runtime": 19.6165,
259
- "eval_samples_per_second": 22.022,
260
- "eval_steps_per_second": 5.506,
261
- "step": 180
262
- },
263
- {
264
- "epoch": 1.476329917032699,
265
- "grad_norm": 416177.8125,
266
- "learning_rate": 8.515625e-05,
267
- "loss": 3208.7566,
268
- "step": 190
269
- },
270
- {
271
- "epoch": 1.476329917032699,
272
- "eval_runtime": 19.6029,
273
- "eval_samples_per_second": 22.038,
274
- "eval_steps_per_second": 5.509,
275
- "step": 190
276
- },
277
- {
278
- "epoch": 1.5544167886774036,
279
- "grad_norm": 585219.5,
280
- "learning_rate": 8.4375e-05,
281
- "loss": 3544.4445,
282
- "step": 200
283
- },
284
- {
285
- "epoch": 1.5544167886774036,
286
- "eval_runtime": 19.6024,
287
- "eval_samples_per_second": 22.038,
288
- "eval_steps_per_second": 5.51,
289
- "step": 200
290
- },
291
- {
292
- "epoch": 1.6325036603221084,
293
- "grad_norm": 122144.359375,
294
- "learning_rate": 8.359375000000001e-05,
295
- "loss": 3495.2234,
296
- "step": 210
297
- },
298
- {
299
- "epoch": 1.6325036603221084,
300
- "eval_runtime": 19.6241,
301
- "eval_samples_per_second": 22.014,
302
- "eval_steps_per_second": 5.503,
303
- "step": 210
304
- },
305
- {
306
- "epoch": 1.710590531966813,
307
- "grad_norm": 517473.9375,
308
- "learning_rate": 8.28125e-05,
309
- "loss": 3106.2523,
310
- "step": 220
311
- },
312
- {
313
- "epoch": 1.710590531966813,
314
- "eval_runtime": 19.6219,
315
- "eval_samples_per_second": 22.016,
316
- "eval_steps_per_second": 5.504,
317
- "step": 220
318
- },
319
- {
320
- "epoch": 1.7886774036115178,
321
- "grad_norm": 279783.34375,
322
- "learning_rate": 8.203125e-05,
323
- "loss": 2969.224,
324
- "step": 230
325
- },
326
- {
327
- "epoch": 1.7886774036115178,
328
- "eval_runtime": 19.6274,
329
- "eval_samples_per_second": 22.01,
330
- "eval_steps_per_second": 5.503,
331
- "step": 230
332
- },
333
- {
334
- "epoch": 1.8667642752562226,
335
- "grad_norm": 500301.21875,
336
- "learning_rate": 8.125000000000001e-05,
337
- "loss": 2546.8709,
338
- "step": 240
339
- },
340
- {
341
- "epoch": 1.8667642752562226,
342
- "eval_runtime": 19.6224,
343
- "eval_samples_per_second": 22.016,
344
- "eval_steps_per_second": 5.504,
345
- "step": 240
346
- },
347
- {
348
- "epoch": 1.9448511469009273,
349
- "grad_norm": 420029.9375,
350
- "learning_rate": 8.046875e-05,
351
- "loss": 3124.2377,
352
- "step": 250
353
- },
354
- {
355
- "epoch": 1.9448511469009273,
356
- "eval_runtime": 19.62,
357
- "eval_samples_per_second": 22.018,
358
- "eval_steps_per_second": 5.505,
359
- "step": 250
360
- },
361
- {
362
- "epoch": 2.015617374328941,
363
- "grad_norm": 287152.59375,
364
- "learning_rate": 7.96875e-05,
365
- "loss": 2647.9721,
366
- "step": 260
367
- },
368
- {
369
- "epoch": 2.015617374328941,
370
- "eval_runtime": 19.5051,
371
- "eval_samples_per_second": 22.148,
372
- "eval_steps_per_second": 5.537,
373
- "step": 260
374
- },
375
- {
376
- "epoch": 2.0937042459736457,
377
- "grad_norm": 227855.6875,
378
- "learning_rate": 7.890625000000001e-05,
379
- "loss": 2429.9967,
380
- "step": 270
381
- },
382
- {
383
- "epoch": 2.0937042459736457,
384
- "eval_runtime": 19.5753,
385
- "eval_samples_per_second": 22.069,
386
- "eval_steps_per_second": 5.517,
387
- "step": 270
388
- },
389
- {
390
- "epoch": 2.1717911176183504,
391
- "grad_norm": 180075.734375,
392
- "learning_rate": 7.8125e-05,
393
- "loss": 2353.5262,
394
- "step": 280
395
- },
396
- {
397
- "epoch": 2.1717911176183504,
398
- "eval_runtime": 19.6192,
399
- "eval_samples_per_second": 22.019,
400
- "eval_steps_per_second": 5.505,
401
- "step": 280
402
- },
403
- {
404
- "epoch": 2.249877989263055,
405
- "grad_norm": 374717.40625,
406
- "learning_rate": 7.734375e-05,
407
- "loss": 2223.9002,
408
- "step": 290
409
- },
410
- {
411
- "epoch": 2.249877989263055,
412
- "eval_runtime": 19.6086,
413
- "eval_samples_per_second": 22.031,
414
- "eval_steps_per_second": 5.508,
415
- "step": 290
416
- },
417
- {
418
- "epoch": 2.32796486090776,
419
- "grad_norm": 302520.9375,
420
- "learning_rate": 7.65625e-05,
421
- "loss": 1899.2279,
422
- "step": 300
423
- },
424
- {
425
- "epoch": 2.32796486090776,
426
- "eval_runtime": 19.6274,
427
- "eval_samples_per_second": 22.01,
428
- "eval_steps_per_second": 5.503,
429
- "step": 300
430
- },
431
- {
432
- "epoch": 2.4060517325524646,
433
- "grad_norm": 178822.03125,
434
- "learning_rate": 7.578125e-05,
435
- "loss": 1682.3375,
436
- "step": 310
437
- },
438
- {
439
- "epoch": 2.4060517325524646,
440
- "eval_runtime": 19.6223,
441
- "eval_samples_per_second": 22.016,
442
- "eval_steps_per_second": 5.504,
443
- "step": 310
444
- },
445
- {
446
- "epoch": 2.4841386041971694,
447
- "grad_norm": 495346.4375,
448
- "learning_rate": 7.500000000000001e-05,
449
- "loss": 2398.752,
450
- "step": 320
451
- },
452
- {
453
- "epoch": 2.4841386041971694,
454
- "eval_runtime": 19.5967,
455
- "eval_samples_per_second": 22.044,
456
- "eval_steps_per_second": 5.511,
457
- "step": 320
458
- },
459
- {
460
- "epoch": 2.562225475841874,
461
- "grad_norm": 389064.15625,
462
- "learning_rate": 7.421875e-05,
463
- "loss": 3152.5979,
464
- "step": 330
465
- },
466
- {
467
- "epoch": 2.562225475841874,
468
- "eval_runtime": 19.613,
469
- "eval_samples_per_second": 22.026,
470
- "eval_steps_per_second": 5.507,
471
- "step": 330
472
- },
473
- {
474
- "epoch": 2.640312347486579,
475
- "grad_norm": 364833.375,
476
- "learning_rate": 7.34375e-05,
477
- "loss": 2780.8895,
478
- "step": 340
479
- },
480
- {
481
- "epoch": 2.640312347486579,
482
- "eval_runtime": 19.6058,
483
- "eval_samples_per_second": 22.034,
484
- "eval_steps_per_second": 5.509,
485
- "step": 340
486
- },
487
- {
488
- "epoch": 2.7183992191312836,
489
- "grad_norm": 250624.234375,
490
- "learning_rate": 7.265625000000001e-05,
491
- "loss": 2240.6416,
492
- "step": 350
493
- },
494
- {
495
- "epoch": 2.7183992191312836,
496
- "eval_runtime": 19.6306,
497
- "eval_samples_per_second": 22.006,
498
- "eval_steps_per_second": 5.502,
499
- "step": 350
500
- },
501
- {
502
- "epoch": 2.7964860907759883,
503
- "grad_norm": 355084.1875,
504
- "learning_rate": 7.1875e-05,
505
- "loss": 1683.0152,
506
- "step": 360
507
- },
508
- {
509
- "epoch": 2.7964860907759883,
510
- "eval_runtime": 19.6274,
511
- "eval_samples_per_second": 22.01,
512
- "eval_steps_per_second": 5.503,
513
- "step": 360
514
- },
515
- {
516
- "epoch": 2.874572962420693,
517
- "grad_norm": 408760.53125,
518
- "learning_rate": 7.109375e-05,
519
- "loss": 1970.6135,
520
- "step": 370
521
- },
522
- {
523
- "epoch": 2.874572962420693,
524
- "eval_runtime": 19.6037,
525
- "eval_samples_per_second": 22.037,
526
- "eval_steps_per_second": 5.509,
527
- "step": 370
528
- },
529
- {
530
- "epoch": 2.952659834065398,
531
- "grad_norm": 253820.046875,
532
- "learning_rate": 7.031250000000001e-05,
533
- "loss": 2003.0234,
534
- "step": 380
535
- },
536
- {
537
- "epoch": 2.952659834065398,
538
- "eval_runtime": 19.6328,
539
- "eval_samples_per_second": 22.004,
540
- "eval_steps_per_second": 5.501,
541
- "step": 380
542
- },
543
- {
544
- "epoch": 3.0234260614934114,
545
- "grad_norm": 322866.875,
546
- "learning_rate": 6.953125e-05,
547
- "loss": 1676.3783,
548
- "step": 390
549
- },
550
- {
551
- "epoch": 3.0234260614934114,
552
- "eval_runtime": 19.5185,
553
- "eval_samples_per_second": 22.133,
554
- "eval_steps_per_second": 5.533,
555
- "step": 390
556
- },
557
- {
558
- "epoch": 3.101512933138116,
559
- "grad_norm": 238511.0625,
560
- "learning_rate": 6.875e-05,
561
- "loss": 1455.4051,
562
- "step": 400
563
- },
564
- {
565
- "epoch": 3.101512933138116,
566
- "eval_runtime": 19.6242,
567
- "eval_samples_per_second": 22.014,
568
- "eval_steps_per_second": 5.503,
569
- "step": 400
570
- },
571
- {
572
- "epoch": 3.179599804782821,
573
- "grad_norm": 91918.703125,
574
- "learning_rate": 6.796875000000001e-05,
575
- "loss": 989.8731,
576
- "step": 410
577
- },
578
- {
579
- "epoch": 3.179599804782821,
580
- "eval_runtime": 19.636,
581
- "eval_samples_per_second": 22.0,
582
- "eval_steps_per_second": 5.5,
583
- "step": 410
584
- },
585
- {
586
- "epoch": 3.2576866764275256,
587
- "grad_norm": 362047.0,
588
- "learning_rate": 6.71875e-05,
589
- "loss": 1274.4269,
590
- "step": 420
591
- },
592
- {
593
- "epoch": 3.2576866764275256,
594
- "eval_runtime": 19.6293,
595
- "eval_samples_per_second": 22.008,
596
- "eval_steps_per_second": 5.502,
597
- "step": 420
598
- },
599
- {
600
- "epoch": 3.3357735480722304,
601
- "grad_norm": 230756.203125,
602
- "learning_rate": 6.640625e-05,
603
- "loss": 1462.9975,
604
- "step": 430
605
- },
606
- {
607
- "epoch": 3.3357735480722304,
608
- "eval_runtime": 19.6142,
609
- "eval_samples_per_second": 22.025,
610
- "eval_steps_per_second": 5.506,
611
- "step": 430
612
- },
613
- {
614
- "epoch": 3.413860419716935,
615
- "grad_norm": 551364.8125,
616
- "learning_rate": 6.562500000000001e-05,
617
- "loss": 1467.0002,
618
- "step": 440
619
- },
620
- {
621
- "epoch": 3.413860419716935,
622
- "eval_runtime": 19.6339,
623
- "eval_samples_per_second": 22.003,
624
- "eval_steps_per_second": 5.501,
625
- "step": 440
626
- },
627
- {
628
- "epoch": 3.49194729136164,
629
- "grad_norm": 238589.6875,
630
- "learning_rate": 6.484375e-05,
631
- "loss": 1181.1227,
632
- "step": 450
633
- },
634
- {
635
- "epoch": 3.49194729136164,
636
- "eval_runtime": 19.6121,
637
- "eval_samples_per_second": 22.027,
638
- "eval_steps_per_second": 5.507,
639
- "step": 450
640
- },
641
- {
642
- "epoch": 3.5700341630063446,
643
- "grad_norm": 200804.40625,
644
- "learning_rate": 6.40625e-05,
645
- "loss": 1450.5544,
646
- "step": 460
647
- },
648
- {
649
- "epoch": 3.5700341630063446,
650
- "eval_runtime": 19.6183,
651
- "eval_samples_per_second": 22.02,
652
- "eval_steps_per_second": 5.505,
653
- "step": 460
654
- },
655
- {
656
- "epoch": 3.6481210346510493,
657
- "grad_norm": 148718.125,
658
- "learning_rate": 6.328125e-05,
659
- "loss": 1220.6021,
660
- "step": 470
661
- },
662
- {
663
- "epoch": 3.6481210346510493,
664
- "eval_runtime": 19.5968,
665
- "eval_samples_per_second": 22.044,
666
- "eval_steps_per_second": 5.511,
667
- "step": 470
668
- },
669
- {
670
- "epoch": 3.726207906295754,
671
- "grad_norm": 154837.71875,
672
- "learning_rate": 6.25e-05,
673
- "loss": 1027.0058,
674
- "step": 480
675
- },
676
- {
677
- "epoch": 3.726207906295754,
678
- "eval_runtime": 19.6342,
679
- "eval_samples_per_second": 22.002,
680
- "eval_steps_per_second": 5.501,
681
- "step": 480
682
- },
683
- {
684
- "epoch": 3.804294777940459,
685
- "grad_norm": 388905.5625,
686
- "learning_rate": 6.171875e-05,
687
- "loss": 1102.8801,
688
- "step": 490
689
- },
690
- {
691
- "epoch": 3.804294777940459,
692
- "eval_runtime": 19.6156,
693
- "eval_samples_per_second": 22.023,
694
- "eval_steps_per_second": 5.506,
695
- "step": 490
696
- },
697
- {
698
- "epoch": 3.8823816495851635,
699
- "grad_norm": 251027.203125,
700
- "learning_rate": 6.0937500000000004e-05,
701
- "loss": 955.4972,
702
- "step": 500
703
- },
704
- {
705
- "epoch": 3.8823816495851635,
706
- "eval_runtime": 19.6156,
707
- "eval_samples_per_second": 22.023,
708
- "eval_steps_per_second": 5.506,
709
- "step": 500
710
- },
711
- {
712
- "epoch": 3.9604685212298683,
713
- "grad_norm": 446547.46875,
714
- "learning_rate": 6.015625e-05,
715
- "loss": 1092.9813,
716
- "step": 510
717
- },
718
- {
719
- "epoch": 3.9604685212298683,
720
- "eval_runtime": 19.6311,
721
- "eval_samples_per_second": 22.006,
722
- "eval_steps_per_second": 5.501,
723
- "step": 510
724
- },
725
- {
726
- "epoch": 4.031234748657882,
727
- "grad_norm": 255615.1875,
728
- "learning_rate": 5.9375e-05,
729
- "loss": 900.3142,
730
- "step": 520
731
- },
732
- {
733
- "epoch": 4.031234748657882,
734
- "eval_runtime": 19.5145,
735
- "eval_samples_per_second": 22.137,
736
- "eval_steps_per_second": 5.534,
737
- "step": 520
738
- },
739
- {
740
- "epoch": 4.109321620302587,
741
- "grad_norm": 318956.46875,
742
- "learning_rate": 5.8593750000000005e-05,
743
- "loss": 998.6581,
744
- "step": 530
745
- },
746
- {
747
- "epoch": 4.109321620302587,
748
- "eval_runtime": 19.5902,
749
- "eval_samples_per_second": 22.052,
750
- "eval_steps_per_second": 5.513,
751
- "step": 530
752
- },
753
- {
754
- "epoch": 4.187408491947291,
755
- "grad_norm": 291643.28125,
756
- "learning_rate": 5.78125e-05,
757
- "loss": 1226.5887,
758
- "step": 540
759
- },
760
- {
761
- "epoch": 4.187408491947291,
762
- "eval_runtime": 19.6079,
763
- "eval_samples_per_second": 22.032,
764
- "eval_steps_per_second": 5.508,
765
- "step": 540
766
- },
767
- {
768
- "epoch": 4.265495363591996,
769
- "grad_norm": 283195.4375,
770
- "learning_rate": 5.703125e-05,
771
- "loss": 1072.7449,
772
- "step": 550
773
- },
774
- {
775
- "epoch": 4.265495363591996,
776
- "eval_runtime": 19.6116,
777
- "eval_samples_per_second": 22.028,
778
- "eval_steps_per_second": 5.507,
779
- "step": 550
780
- },
781
- {
782
- "epoch": 4.343582235236701,
783
- "grad_norm": 335369.625,
784
- "learning_rate": 5.6250000000000005e-05,
785
- "loss": 801.423,
786
- "step": 560
787
- },
788
- {
789
- "epoch": 4.343582235236701,
790
- "eval_runtime": 19.6203,
791
- "eval_samples_per_second": 22.018,
792
- "eval_steps_per_second": 5.505,
793
- "step": 560
794
- },
795
- {
796
- "epoch": 4.421669106881406,
797
- "grad_norm": 363015.75,
798
- "learning_rate": 5.546875e-05,
799
- "loss": 1544.8841,
800
- "step": 570
801
- },
802
- {
803
- "epoch": 4.421669106881406,
804
- "eval_runtime": 19.633,
805
- "eval_samples_per_second": 22.004,
806
- "eval_steps_per_second": 5.501,
807
- "step": 570
808
- },
809
- {
810
- "epoch": 4.49975597852611,
811
- "grad_norm": 314545.34375,
812
- "learning_rate": 5.46875e-05,
813
- "loss": 1318.7243,
814
- "step": 580
815
- },
816
- {
817
- "epoch": 4.49975597852611,
818
- "eval_runtime": 19.6216,
819
- "eval_samples_per_second": 22.017,
820
- "eval_steps_per_second": 5.504,
821
- "step": 580
822
- },
823
- {
824
- "epoch": 4.577842850170815,
825
- "grad_norm": 436113.09375,
826
- "learning_rate": 5.3906250000000006e-05,
827
- "loss": 1256.4427,
828
- "step": 590
829
- },
830
- {
831
- "epoch": 4.577842850170815,
832
- "eval_runtime": 19.6143,
833
- "eval_samples_per_second": 22.025,
834
- "eval_steps_per_second": 5.506,
835
- "step": 590
836
- },
837
- {
838
- "epoch": 4.65592972181552,
839
- "grad_norm": 307997.78125,
840
- "learning_rate": 5.3125000000000004e-05,
841
- "loss": 1325.2429,
842
- "step": 600
843
- },
844
- {
845
- "epoch": 4.65592972181552,
846
- "eval_runtime": 19.6167,
847
- "eval_samples_per_second": 22.022,
848
- "eval_steps_per_second": 5.506,
849
- "step": 600
850
- },
851
- {
852
- "epoch": 4.7340165934602245,
853
- "grad_norm": 383733.25,
854
- "learning_rate": 5.234375e-05,
855
- "loss": 1247.956,
856
- "step": 610
857
- },
858
- {
859
- "epoch": 4.7340165934602245,
860
- "eval_runtime": 19.6214,
861
- "eval_samples_per_second": 22.017,
862
- "eval_steps_per_second": 5.504,
863
- "step": 610
864
- },
865
- {
866
- "epoch": 4.812103465104929,
867
- "grad_norm": 434974.0,
868
- "learning_rate": 5.15625e-05,
869
- "loss": 1198.3052,
870
- "step": 620
871
- },
872
- {
873
- "epoch": 4.812103465104929,
874
- "eval_runtime": 19.6132,
875
- "eval_samples_per_second": 22.026,
876
- "eval_steps_per_second": 5.506,
877
- "step": 620
878
- },
879
- {
880
- "epoch": 4.890190336749634,
881
- "grad_norm": 182716.515625,
882
- "learning_rate": 5.0781250000000004e-05,
883
- "loss": 853.0629,
884
- "step": 630
885
- },
886
- {
887
- "epoch": 4.890190336749634,
888
- "eval_runtime": 19.618,
889
- "eval_samples_per_second": 22.021,
890
- "eval_steps_per_second": 5.505,
891
- "step": 630
892
- },
893
- {
894
- "epoch": 4.968277208394339,
895
- "grad_norm": 364455.15625,
896
- "learning_rate": 5e-05,
897
- "loss": 798.3182,
898
- "step": 640
899
- },
900
- {
901
- "epoch": 4.968277208394339,
902
- "eval_runtime": 19.6019,
903
- "eval_samples_per_second": 22.039,
904
- "eval_steps_per_second": 5.51,
905
- "step": 640
906
- },
907
- {
908
- "epoch": 5.039043435822352,
909
- "grad_norm": 185726.75,
910
- "learning_rate": 4.921875e-05,
911
- "loss": 753.334,
912
- "step": 650
913
- },
914
- {
915
- "epoch": 5.039043435822352,
916
- "eval_runtime": 19.544,
917
- "eval_samples_per_second": 22.104,
918
- "eval_steps_per_second": 5.526,
919
- "step": 650
920
- },
921
- {
922
- "epoch": 5.1171303074670575,
923
- "grad_norm": 219226.359375,
924
- "learning_rate": 4.8437500000000005e-05,
925
- "loss": 1082.2985,
926
- "step": 660
927
- },
928
- {
929
- "epoch": 5.1171303074670575,
930
- "eval_runtime": 19.5817,
931
- "eval_samples_per_second": 22.061,
932
- "eval_steps_per_second": 5.515,
933
- "step": 660
934
- },
935
- {
936
- "epoch": 5.195217179111761,
937
- "grad_norm": 207455.5625,
938
- "learning_rate": 4.765625e-05,
939
- "loss": 879.2348,
940
- "step": 670
941
- },
942
- {
943
- "epoch": 5.195217179111761,
944
- "eval_runtime": 19.5896,
945
- "eval_samples_per_second": 22.053,
946
- "eval_steps_per_second": 5.513,
947
- "step": 670
948
- },
949
- {
950
- "epoch": 5.273304050756467,
951
- "grad_norm": 379599.875,
952
- "learning_rate": 4.6875e-05,
953
- "loss": 896.7317,
954
- "step": 680
955
- },
956
- {
957
- "epoch": 5.273304050756467,
958
- "eval_runtime": 19.5928,
959
- "eval_samples_per_second": 22.049,
960
- "eval_steps_per_second": 5.512,
961
- "step": 680
962
- },
963
- {
964
- "epoch": 5.351390922401171,
965
- "grad_norm": 303185.625,
966
- "learning_rate": 4.609375e-05,
967
- "loss": 754.6702,
968
- "step": 690
969
- },
970
- {
971
- "epoch": 5.351390922401171,
972
- "eval_runtime": 19.5979,
973
- "eval_samples_per_second": 22.043,
974
- "eval_steps_per_second": 5.511,
975
- "step": 690
976
- },
977
- {
978
- "epoch": 5.4294777940458765,
979
- "grad_norm": 405251.96875,
980
- "learning_rate": 4.5312500000000004e-05,
981
- "loss": 715.7025,
982
- "step": 700
983
- },
984
- {
985
- "epoch": 5.4294777940458765,
986
- "eval_runtime": 19.6118,
987
- "eval_samples_per_second": 22.028,
988
- "eval_steps_per_second": 5.507,
989
- "step": 700
990
- },
991
- {
992
- "epoch": 5.50756466569058,
993
- "grad_norm": 204986.03125,
994
- "learning_rate": 4.453125e-05,
995
- "loss": 728.391,
996
- "step": 710
997
- },
998
- {
999
- "epoch": 5.50756466569058,
1000
- "eval_runtime": 19.6098,
1001
- "eval_samples_per_second": 22.03,
1002
- "eval_steps_per_second": 5.507,
1003
- "step": 710
1004
- },
1005
- {
1006
- "epoch": 5.585651537335286,
1007
- "grad_norm": 321820.75,
1008
- "learning_rate": 4.375e-05,
1009
- "loss": 699.9287,
1010
- "step": 720
1011
- },
1012
- {
1013
- "epoch": 5.585651537335286,
1014
- "eval_runtime": 19.618,
1015
- "eval_samples_per_second": 22.021,
1016
- "eval_steps_per_second": 5.505,
1017
- "step": 720
1018
- },
1019
- {
1020
- "epoch": 5.66373840897999,
1021
- "grad_norm": 168577.65625,
1022
- "learning_rate": 4.2968750000000004e-05,
1023
- "loss": 586.6525,
1024
- "step": 730
1025
- },
1026
- {
1027
- "epoch": 5.66373840897999,
1028
- "eval_runtime": 19.6206,
1029
- "eval_samples_per_second": 22.018,
1030
- "eval_steps_per_second": 5.504,
1031
- "step": 730
1032
- },
1033
- {
1034
- "epoch": 5.741825280624695,
1035
- "grad_norm": 244243.9375,
1036
- "learning_rate": 4.21875e-05,
1037
- "loss": 625.2781,
1038
- "step": 740
1039
- },
1040
- {
1041
- "epoch": 5.741825280624695,
1042
- "eval_runtime": 19.6218,
1043
- "eval_samples_per_second": 22.016,
1044
- "eval_steps_per_second": 5.504,
1045
- "step": 740
1046
- },
1047
- {
1048
- "epoch": 5.819912152269399,
1049
- "grad_norm": 192660.640625,
1050
- "learning_rate": 4.140625e-05,
1051
- "loss": 719.4099,
1052
- "step": 750
1053
- },
1054
- {
1055
- "epoch": 5.819912152269399,
1056
- "eval_runtime": 19.6162,
1057
- "eval_samples_per_second": 22.023,
1058
- "eval_steps_per_second": 5.506,
1059
- "step": 750
1060
- },
1061
- {
1062
- "epoch": 5.897999023914105,
1063
- "grad_norm": 146700.25,
1064
- "learning_rate": 4.0625000000000005e-05,
1065
- "loss": 577.4713,
1066
- "step": 760
1067
- },
1068
- {
1069
- "epoch": 5.897999023914105,
1070
- "eval_runtime": 19.6213,
1071
- "eval_samples_per_second": 22.017,
1072
- "eval_steps_per_second": 5.504,
1073
- "step": 760
1074
- },
1075
- {
1076
- "epoch": 5.976085895558809,
1077
- "grad_norm": 209883.296875,
1078
- "learning_rate": 3.984375e-05,
1079
- "loss": 637.9117,
1080
- "step": 770
1081
- },
1082
- {
1083
- "epoch": 5.976085895558809,
1084
- "eval_runtime": 19.6267,
1085
- "eval_samples_per_second": 22.011,
1086
- "eval_steps_per_second": 5.503,
1087
- "step": 770
1088
- },
1089
- {
1090
- "epoch": 6.046852122986823,
1091
- "grad_norm": 468637.6875,
1092
- "learning_rate": 3.90625e-05,
1093
- "loss": 621.1816,
1094
- "step": 780
1095
- },
1096
- {
1097
- "epoch": 6.046852122986823,
1098
- "eval_runtime": 19.6074,
1099
- "eval_samples_per_second": 22.032,
1100
- "eval_steps_per_second": 5.508,
1101
- "step": 780
1102
- },
1103
- {
1104
- "epoch": 6.124938994631528,
1105
- "grad_norm": 374385.40625,
1106
- "learning_rate": 3.828125e-05,
1107
- "loss": 914.6565,
1108
- "step": 790
1109
- },
1110
- {
1111
- "epoch": 6.124938994631528,
1112
- "eval_runtime": 19.626,
1113
- "eval_samples_per_second": 22.012,
1114
- "eval_steps_per_second": 5.503,
1115
- "step": 790
1116
- },
1117
- {
1118
- "epoch": 6.203025866276232,
1119
- "grad_norm": 283100.3125,
1120
- "learning_rate": 3.7500000000000003e-05,
1121
- "loss": 830.0896,
1122
- "step": 800
1123
- },
1124
- {
1125
- "epoch": 6.203025866276232,
1126
- "eval_runtime": 19.6195,
1127
- "eval_samples_per_second": 22.019,
1128
- "eval_steps_per_second": 5.505,
1129
- "step": 800
1130
- },
1131
- {
1132
- "epoch": 6.281112737920937,
1133
- "grad_norm": 186444.921875,
1134
- "learning_rate": 3.671875e-05,
1135
- "loss": 822.068,
1136
- "step": 810
1137
- },
1138
- {
1139
- "epoch": 6.281112737920937,
1140
- "eval_runtime": 19.6377,
1141
- "eval_samples_per_second": 21.998,
1142
- "eval_steps_per_second": 5.5,
1143
- "step": 810
1144
- },
1145
- {
1146
- "epoch": 6.359199609565642,
1147
- "grad_norm": 431370.5,
1148
- "learning_rate": 3.59375e-05,
1149
- "loss": 690.5968,
1150
- "step": 820
1151
- },
1152
- {
1153
- "epoch": 6.359199609565642,
1154
- "eval_runtime": 19.6147,
1155
- "eval_samples_per_second": 22.024,
1156
- "eval_steps_per_second": 5.506,
1157
- "step": 820
1158
- },
1159
- {
1160
- "epoch": 6.4372864812103465,
1161
- "grad_norm": 435885.03125,
1162
- "learning_rate": 3.5156250000000004e-05,
1163
- "loss": 1016.4219,
1164
- "step": 830
1165
- },
1166
- {
1167
- "epoch": 6.4372864812103465,
1168
- "eval_runtime": 19.6089,
1169
- "eval_samples_per_second": 22.031,
1170
- "eval_steps_per_second": 5.508,
1171
- "step": 830
1172
- },
1173
- {
1174
- "epoch": 6.515373352855051,
1175
- "grad_norm": 473700.9375,
1176
- "learning_rate": 3.4375e-05,
1177
- "loss": 806.1664,
1178
- "step": 840
1179
- },
1180
- {
1181
- "epoch": 6.515373352855051,
1182
- "eval_runtime": 19.6159,
1183
- "eval_samples_per_second": 22.023,
1184
- "eval_steps_per_second": 5.506,
1185
- "step": 840
1186
- },
1187
- {
1188
- "epoch": 6.593460224499756,
1189
- "grad_norm": 398720.625,
1190
- "learning_rate": 3.359375e-05,
1191
- "loss": 668.8824,
1192
- "step": 850
1193
- },
1194
- {
1195
- "epoch": 6.593460224499756,
1196
- "eval_runtime": 19.6281,
1197
- "eval_samples_per_second": 22.009,
1198
- "eval_steps_per_second": 5.502,
1199
- "step": 850
1200
- },
1201
- {
1202
- "epoch": 6.671547096144461,
1203
- "grad_norm": 305606.28125,
1204
- "learning_rate": 3.2812500000000005e-05,
1205
- "loss": 522.7901,
1206
- "step": 860
1207
- },
1208
- {
1209
- "epoch": 6.671547096144461,
1210
- "eval_runtime": 19.6214,
1211
- "eval_samples_per_second": 22.017,
1212
- "eval_steps_per_second": 5.504,
1213
- "step": 860
1214
- },
1215
- {
1216
- "epoch": 6.7496339677891655,
1217
- "grad_norm": 263199.21875,
1218
- "learning_rate": 3.203125e-05,
1219
- "loss": 559.4416,
1220
- "step": 870
1221
- },
1222
- {
1223
- "epoch": 6.7496339677891655,
1224
- "eval_runtime": 19.6359,
1225
- "eval_samples_per_second": 22.0,
1226
- "eval_steps_per_second": 5.5,
1227
- "step": 870
1228
- },
1229
- {
1230
- "epoch": 6.82772083943387,
1231
- "grad_norm": 203541.921875,
1232
- "learning_rate": 3.125e-05,
1233
- "loss": 602.3201,
1234
- "step": 880
1235
- },
1236
- {
1237
- "epoch": 6.82772083943387,
1238
- "eval_runtime": 19.6292,
1239
- "eval_samples_per_second": 22.008,
1240
- "eval_steps_per_second": 5.502,
1241
- "step": 880
1242
- },
1243
- {
1244
- "epoch": 6.905807711078575,
1245
- "grad_norm": 109014.71875,
1246
- "learning_rate": 3.0468750000000002e-05,
1247
- "loss": 527.6462,
1248
- "step": 890
1249
- },
1250
- {
1251
- "epoch": 6.905807711078575,
1252
- "eval_runtime": 19.615,
1253
- "eval_samples_per_second": 22.024,
1254
- "eval_steps_per_second": 5.506,
1255
- "step": 890
1256
- },
1257
- {
1258
- "epoch": 6.98389458272328,
1259
- "grad_norm": 173269.515625,
1260
- "learning_rate": 2.96875e-05,
1261
- "loss": 509.9485,
1262
- "step": 900
1263
- },
1264
- {
1265
- "epoch": 6.98389458272328,
1266
- "eval_runtime": 19.6287,
1267
- "eval_samples_per_second": 22.009,
1268
- "eval_steps_per_second": 5.502,
1269
- "step": 900
1270
- },
1271
- {
1272
- "epoch": 7.054660810151294,
1273
- "grad_norm": 89940.78125,
1274
- "learning_rate": 2.890625e-05,
1275
- "loss": 514.7005,
1276
- "step": 910
1277
- },
1278
- {
1279
- "epoch": 7.054660810151294,
1280
- "eval_runtime": 19.6378,
1281
- "eval_samples_per_second": 21.998,
1282
- "eval_steps_per_second": 5.5,
1283
- "step": 910
1284
- },
1285
- {
1286
- "epoch": 7.132747681795998,
1287
- "grad_norm": 64957.203125,
1288
- "learning_rate": 2.8125000000000003e-05,
1289
- "loss": 516.4828,
1290
- "step": 920
1291
- },
1292
- {
1293
- "epoch": 7.132747681795998,
1294
- "eval_runtime": 19.6159,
1295
- "eval_samples_per_second": 22.023,
1296
- "eval_steps_per_second": 5.506,
1297
- "step": 920
1298
- },
1299
- {
1300
- "epoch": 7.210834553440703,
1301
- "grad_norm": 191568.875,
1302
- "learning_rate": 2.734375e-05,
1303
- "loss": 469.4625,
1304
- "step": 930
1305
- },
1306
- {
1307
- "epoch": 7.210834553440703,
1308
- "eval_runtime": 19.6149,
1309
- "eval_samples_per_second": 22.024,
1310
- "eval_steps_per_second": 5.506,
1311
- "step": 930
1312
- },
1313
- {
1314
- "epoch": 7.288921425085407,
1315
- "grad_norm": 180467.84375,
1316
- "learning_rate": 2.6562500000000002e-05,
1317
- "loss": 627.1263,
1318
- "step": 940
1319
- },
1320
- {
1321
- "epoch": 7.288921425085407,
1322
- "eval_runtime": 19.6273,
1323
- "eval_samples_per_second": 22.01,
1324
- "eval_steps_per_second": 5.503,
1325
- "step": 940
1326
- },
1327
- {
1328
- "epoch": 7.367008296730113,
1329
- "grad_norm": 175262.3125,
1330
- "learning_rate": 2.578125e-05,
1331
- "loss": 497.1456,
1332
- "step": 950
1333
- },
1334
- {
1335
- "epoch": 7.367008296730113,
1336
- "eval_runtime": 19.6298,
1337
- "eval_samples_per_second": 22.007,
1338
- "eval_steps_per_second": 5.502,
1339
- "step": 950
1340
- },
1341
- {
1342
- "epoch": 7.4450951683748166,
1343
- "grad_norm": 194304.703125,
1344
- "learning_rate": 2.5e-05,
1345
- "loss": 651.0766,
1346
- "step": 960
1347
- },
1348
- {
1349
- "epoch": 7.4450951683748166,
1350
- "eval_runtime": 19.6413,
1351
- "eval_samples_per_second": 21.994,
1352
- "eval_steps_per_second": 5.499,
1353
- "step": 960
1354
- },
1355
- {
1356
- "epoch": 7.523182040019522,
1357
- "grad_norm": 221815.171875,
1358
- "learning_rate": 2.4218750000000003e-05,
1359
- "loss": 419.7869,
1360
- "step": 970
1361
- },
1362
- {
1363
- "epoch": 7.523182040019522,
1364
- "eval_runtime": 19.6255,
1365
- "eval_samples_per_second": 22.012,
1366
- "eval_steps_per_second": 5.503,
1367
- "step": 970
1368
- },
1369
- {
1370
- "epoch": 7.601268911664226,
1371
- "grad_norm": 330153.84375,
1372
- "learning_rate": 2.34375e-05,
1373
- "loss": 606.2052,
1374
- "step": 980
1375
- },
1376
- {
1377
- "epoch": 7.601268911664226,
1378
- "eval_runtime": 19.6246,
1379
- "eval_samples_per_second": 22.013,
1380
- "eval_steps_per_second": 5.503,
1381
- "step": 980
1382
- },
1383
- {
1384
- "epoch": 7.679355783308932,
1385
- "grad_norm": 134138.4375,
1386
- "learning_rate": 2.2656250000000002e-05,
1387
- "loss": 636.721,
1388
- "step": 990
1389
- },
1390
- {
1391
- "epoch": 7.679355783308932,
1392
- "eval_runtime": 19.622,
1393
- "eval_samples_per_second": 22.016,
1394
- "eval_steps_per_second": 5.504,
1395
- "step": 990
1396
- },
1397
- {
1398
- "epoch": 7.7574426549536355,
1399
- "grad_norm": 86497.265625,
1400
- "learning_rate": 2.1875e-05,
1401
- "loss": 565.6015,
1402
- "step": 1000
1403
- },
1404
- {
1405
- "epoch": 7.7574426549536355,
1406
- "eval_runtime": 19.6167,
1407
- "eval_samples_per_second": 22.022,
1408
- "eval_steps_per_second": 5.506,
1409
- "step": 1000
1410
- },
1411
- {
1412
- "epoch": 7.835529526598341,
1413
- "grad_norm": 256321.390625,
1414
- "learning_rate": 2.109375e-05,
1415
- "loss": 383.8543,
1416
- "step": 1010
1417
- },
1418
- {
1419
- "epoch": 7.835529526598341,
1420
- "eval_runtime": 19.6312,
1421
- "eval_samples_per_second": 22.006,
1422
- "eval_steps_per_second": 5.501,
1423
- "step": 1010
1424
- },
1425
- {
1426
- "epoch": 7.913616398243045,
1427
- "grad_norm": 300360.125,
1428
- "learning_rate": 2.0312500000000002e-05,
1429
- "loss": 386.0633,
1430
- "step": 1020
1431
- },
1432
- {
1433
- "epoch": 7.913616398243045,
1434
- "eval_runtime": 19.6333,
1435
- "eval_samples_per_second": 22.003,
1436
- "eval_steps_per_second": 5.501,
1437
- "step": 1020
1438
- },
1439
- {
1440
- "epoch": 7.991703269887751,
1441
- "grad_norm": 65186.7578125,
1442
- "learning_rate": 1.953125e-05,
1443
- "loss": 379.998,
1444
- "step": 1030
1445
- },
1446
- {
1447
- "epoch": 7.991703269887751,
1448
- "eval_runtime": 19.6079,
1449
- "eval_samples_per_second": 22.032,
1450
- "eval_steps_per_second": 5.508,
1451
- "step": 1030
1452
- },
1453
- {
1454
- "epoch": 8.062469497315764,
1455
- "grad_norm": 306364.5,
1456
- "learning_rate": 1.8750000000000002e-05,
1457
- "loss": 458.2917,
1458
- "step": 1040
1459
- },
1460
- {
1461
- "epoch": 8.062469497315764,
1462
- "eval_runtime": 19.626,
1463
- "eval_samples_per_second": 22.012,
1464
- "eval_steps_per_second": 5.503,
1465
- "step": 1040
1466
- },
1467
- {
1468
- "epoch": 8.140556368960468,
1469
- "grad_norm": 287573.96875,
1470
- "learning_rate": 1.796875e-05,
1471
- "loss": 459.69,
1472
- "step": 1050
1473
- },
1474
- {
1475
- "epoch": 8.140556368960468,
1476
- "eval_runtime": 19.64,
1477
- "eval_samples_per_second": 21.996,
1478
- "eval_steps_per_second": 5.499,
1479
- "step": 1050
1480
- },
1481
- {
1482
- "epoch": 8.218643240605173,
1483
- "grad_norm": 87142.5625,
1484
- "learning_rate": 1.71875e-05,
1485
- "loss": 380.1467,
1486
- "step": 1060
1487
- },
1488
- {
1489
- "epoch": 8.218643240605173,
1490
- "eval_runtime": 19.6334,
1491
- "eval_samples_per_second": 22.003,
1492
- "eval_steps_per_second": 5.501,
1493
- "step": 1060
1494
- },
1495
- {
1496
- "epoch": 8.296730112249879,
1497
- "grad_norm": 301649.96875,
1498
- "learning_rate": 1.6406250000000002e-05,
1499
- "loss": 384.5057,
1500
- "step": 1070
1501
- },
1502
- {
1503
- "epoch": 8.296730112249879,
1504
- "eval_runtime": 19.6212,
1505
- "eval_samples_per_second": 22.017,
1506
- "eval_steps_per_second": 5.504,
1507
- "step": 1070
1508
- },
1509
- {
1510
- "epoch": 8.374816983894583,
1511
- "grad_norm": 236951.1875,
1512
- "learning_rate": 1.5625e-05,
1513
- "loss": 374.7868,
1514
- "step": 1080
1515
- },
1516
- {
1517
- "epoch": 8.374816983894583,
1518
- "eval_runtime": 19.6187,
1519
- "eval_samples_per_second": 22.02,
1520
- "eval_steps_per_second": 5.505,
1521
- "step": 1080
1522
- },
1523
- {
1524
- "epoch": 8.452903855539287,
1525
- "grad_norm": 76360.2734375,
1526
- "learning_rate": 1.484375e-05,
1527
- "loss": 312.2099,
1528
- "step": 1090
1529
- },
1530
- {
1531
- "epoch": 8.452903855539287,
1532
- "eval_runtime": 19.6209,
1533
- "eval_samples_per_second": 22.017,
1534
- "eval_steps_per_second": 5.504,
1535
- "step": 1090
1536
- },
1537
- {
1538
- "epoch": 8.530990727183992,
1539
- "grad_norm": 76876.0859375,
1540
- "learning_rate": 1.4062500000000001e-05,
1541
- "loss": 303.1329,
1542
- "step": 1100
1543
- },
1544
- {
1545
- "epoch": 8.530990727183992,
1546
- "eval_runtime": 19.6126,
1547
- "eval_samples_per_second": 22.027,
1548
- "eval_steps_per_second": 5.507,
1549
- "step": 1100
1550
- },
1551
- {
1552
- "epoch": 8.609077598828698,
1553
- "grad_norm": 80524.609375,
1554
- "learning_rate": 1.3281250000000001e-05,
1555
- "loss": 336.2521,
1556
- "step": 1110
1557
- },
1558
- {
1559
- "epoch": 8.609077598828698,
1560
- "eval_runtime": 19.5907,
1561
- "eval_samples_per_second": 22.051,
1562
- "eval_steps_per_second": 5.513,
1563
- "step": 1110
1564
- },
1565
- {
1566
- "epoch": 8.687164470473402,
1567
- "grad_norm": 114438.3828125,
1568
- "learning_rate": 1.25e-05,
1569
- "loss": 342.3281,
1570
- "step": 1120
1571
- },
1572
- {
1573
- "epoch": 8.687164470473402,
1574
- "eval_runtime": 19.614,
1575
- "eval_samples_per_second": 22.025,
1576
- "eval_steps_per_second": 5.506,
1577
- "step": 1120
1578
- },
1579
- {
1580
- "epoch": 8.765251342118106,
1581
- "grad_norm": 281197.375,
1582
- "learning_rate": 1.171875e-05,
1583
- "loss": 358.492,
1584
- "step": 1130
1585
- },
1586
- {
1587
- "epoch": 8.765251342118106,
1588
- "eval_runtime": 19.5986,
1589
- "eval_samples_per_second": 22.042,
1590
- "eval_steps_per_second": 5.511,
1591
- "step": 1130
1592
- },
1593
- {
1594
- "epoch": 8.843338213762811,
1595
- "grad_norm": 39132.8515625,
1596
- "learning_rate": 1.09375e-05,
1597
- "loss": 297.0417,
1598
- "step": 1140
1599
- },
1600
- {
1601
- "epoch": 8.843338213762811,
1602
- "eval_runtime": 19.6013,
1603
- "eval_samples_per_second": 22.039,
1604
- "eval_steps_per_second": 5.51,
1605
- "step": 1140
1606
- },
1607
- {
1608
- "epoch": 8.921425085407517,
1609
- "grad_norm": 270795.65625,
1610
- "learning_rate": 1.0156250000000001e-05,
1611
- "loss": 306.2402,
1612
- "step": 1150
1613
- },
1614
- {
1615
- "epoch": 8.921425085407517,
1616
- "eval_runtime": 19.6264,
1617
- "eval_samples_per_second": 22.011,
1618
- "eval_steps_per_second": 5.503,
1619
- "step": 1150
1620
- },
1621
- {
1622
- "epoch": 8.99951195705222,
1623
- "grad_norm": 124614.0390625,
1624
- "learning_rate": 9.375000000000001e-06,
1625
- "loss": 354.61,
1626
- "step": 1160
1627
- },
1628
- {
1629
- "epoch": 8.99951195705222,
1630
- "eval_runtime": 19.6258,
1631
- "eval_samples_per_second": 22.012,
1632
- "eval_steps_per_second": 5.503,
1633
- "step": 1160
1634
- },
1635
- {
1636
- "epoch": 9.070278184480234,
1637
- "grad_norm": 194401.953125,
1638
- "learning_rate": 8.59375e-06,
1639
- "loss": 306.9392,
1640
- "step": 1170
1641
- },
1642
- {
1643
- "epoch": 9.070278184480234,
1644
- "eval_runtime": 19.6327,
1645
- "eval_samples_per_second": 22.004,
1646
- "eval_steps_per_second": 5.501,
1647
- "step": 1170
1648
- },
1649
- {
1650
- "epoch": 9.14836505612494,
1651
- "grad_norm": 134935.84375,
1652
- "learning_rate": 7.8125e-06,
1653
- "loss": 309.7358,
1654
- "step": 1180
1655
- },
1656
- {
1657
- "epoch": 9.14836505612494,
1658
- "eval_runtime": 19.625,
1659
- "eval_samples_per_second": 22.013,
1660
- "eval_steps_per_second": 5.503,
1661
- "step": 1180
1662
- },
1663
- {
1664
- "epoch": 9.226451927769643,
1665
- "grad_norm": 169914.25,
1666
- "learning_rate": 7.031250000000001e-06,
1667
- "loss": 316.8677,
1668
- "step": 1190
1669
- },
1670
- {
1671
- "epoch": 9.226451927769643,
1672
- "eval_runtime": 19.6279,
1673
- "eval_samples_per_second": 22.01,
1674
- "eval_steps_per_second": 5.502,
1675
- "step": 1190
1676
- },
1677
- {
1678
- "epoch": 9.304538799414349,
1679
- "grad_norm": 145255.390625,
1680
- "learning_rate": 6.25e-06,
1681
- "loss": 281.0021,
1682
- "step": 1200
1683
- },
1684
- {
1685
- "epoch": 9.304538799414349,
1686
- "eval_runtime": 19.6322,
1687
- "eval_samples_per_second": 22.005,
1688
- "eval_steps_per_second": 5.501,
1689
- "step": 1200
1690
- },
1691
- {
1692
- "epoch": 9.382625671059053,
1693
- "grad_norm": 77945.0625,
1694
- "learning_rate": 5.46875e-06,
1695
- "loss": 225.1448,
1696
- "step": 1210
1697
- },
1698
- {
1699
- "epoch": 9.382625671059053,
1700
- "eval_runtime": 19.6173,
1701
- "eval_samples_per_second": 22.021,
1702
- "eval_steps_per_second": 5.505,
1703
- "step": 1210
1704
- },
1705
- {
1706
- "epoch": 9.460712542703758,
1707
- "grad_norm": 106036.7421875,
1708
- "learning_rate": 4.6875000000000004e-06,
1709
- "loss": 260.6584,
1710
- "step": 1220
1711
- },
1712
- {
1713
- "epoch": 9.460712542703758,
1714
- "eval_runtime": 19.6147,
1715
- "eval_samples_per_second": 22.024,
1716
- "eval_steps_per_second": 5.506,
1717
- "step": 1220
1718
- },
1719
- {
1720
- "epoch": 9.538799414348462,
1721
- "grad_norm": 103023.9296875,
1722
- "learning_rate": 3.90625e-06,
1723
- "loss": 246.4592,
1724
- "step": 1230
1725
- },
1726
- {
1727
- "epoch": 9.538799414348462,
1728
- "eval_runtime": 19.6287,
1729
- "eval_samples_per_second": 22.009,
1730
- "eval_steps_per_second": 5.502,
1731
- "step": 1230
1732
- },
1733
- {
1734
- "epoch": 9.616886285993168,
1735
- "grad_norm": 69943.5,
1736
- "learning_rate": 3.125e-06,
1737
- "loss": 254.6815,
1738
- "step": 1240
1739
- },
1740
- {
1741
- "epoch": 9.616886285993168,
1742
- "eval_runtime": 19.6092,
1743
- "eval_samples_per_second": 22.031,
1744
- "eval_steps_per_second": 5.508,
1745
- "step": 1240
1746
- },
1747
- {
1748
- "epoch": 9.694973157637872,
1749
- "grad_norm": 157203.640625,
1750
- "learning_rate": 2.3437500000000002e-06,
1751
- "loss": 247.872,
1752
- "step": 1250
1753
- },
1754
- {
1755
- "epoch": 9.694973157637872,
1756
- "eval_runtime": 19.6301,
1757
- "eval_samples_per_second": 22.007,
1758
- "eval_steps_per_second": 5.502,
1759
- "step": 1250
1760
- },
1761
- {
1762
- "epoch": 9.773060029282577,
1763
- "grad_norm": 40032.71484375,
1764
- "learning_rate": 1.5625e-06,
1765
- "loss": 239.199,
1766
- "step": 1260
1767
- },
1768
- {
1769
- "epoch": 9.773060029282577,
1770
- "eval_runtime": 19.6402,
1771
- "eval_samples_per_second": 21.996,
1772
- "eval_steps_per_second": 5.499,
1773
- "step": 1260
1774
- },
1775
- {
1776
- "epoch": 9.851146900927281,
1777
- "grad_norm": 77447.0546875,
1778
- "learning_rate": 7.8125e-07,
1779
- "loss": 222.1013,
1780
- "step": 1270
1781
- },
1782
- {
1783
- "epoch": 9.851146900927281,
1784
- "eval_runtime": 19.623,
1785
- "eval_samples_per_second": 22.015,
1786
- "eval_steps_per_second": 5.504,
1787
- "step": 1270
1788
- },
1789
- {
1790
- "epoch": 9.929233772571987,
1791
- "grad_norm": 100750.3046875,
1792
- "learning_rate": 0.0,
1793
- "loss": 234.0448,
1794
- "step": 1280
1795
- },
1796
- {
1797
- "epoch": 9.929233772571987,
1798
- "eval_runtime": 19.621,
1799
- "eval_samples_per_second": 22.017,
1800
- "eval_steps_per_second": 5.504,
1801
- "step": 1280
1802
  }
1803
  ],
1804
  "logging_steps": 10,
@@ -1813,7 +189,7 @@
1813
  "should_evaluate": false,
1814
  "should_log": false,
1815
  "should_save": true,
1816
- "should_training_stop": true
1817
  },
1818
  "attributes": {}
1819
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 10,
6
+ "global_step": 129,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.07808687164470474,
13
+ "grad_norm": 562671.25,
14
  "learning_rate": 9.921875000000001e-05,
15
+ "loss": 41909.175,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.07808687164470474,
20
+ "eval_runtime": 19.6603,
21
+ "eval_samples_per_second": 21.973,
22
+ "eval_steps_per_second": 5.493,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.15617374328940947,
27
+ "grad_norm": 344896.03125,
28
  "learning_rate": 9.84375e-05,
29
+ "loss": 13890.8641,
30
  "step": 20
31
  },
32
  {
33
  "epoch": 0.15617374328940947,
34
+ "eval_runtime": 19.7074,
35
+ "eval_samples_per_second": 21.921,
36
+ "eval_steps_per_second": 5.48,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.2342606149341142,
41
+ "grad_norm": 515524.21875,
42
  "learning_rate": 9.765625e-05,
43
+ "loss": 11401.4898,
44
  "step": 30
45
  },
46
  {
47
  "epoch": 0.2342606149341142,
48
+ "eval_runtime": 19.7344,
49
+ "eval_samples_per_second": 21.891,
50
+ "eval_steps_per_second": 5.473,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.31234748657881894,
55
+ "grad_norm": 388208.53125,
56
  "learning_rate": 9.687500000000001e-05,
57
+ "loss": 6596.7703,
58
  "step": 40
59
  },
60
  {
61
  "epoch": 0.31234748657881894,
62
+ "eval_runtime": 19.7515,
63
+ "eval_samples_per_second": 21.872,
64
+ "eval_steps_per_second": 5.468,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.3904343582235237,
69
+ "grad_norm": 236512.34375,
70
  "learning_rate": 9.609375e-05,
71
+ "loss": 3515.6215,
72
  "step": 50
73
  },
74
  {
75
  "epoch": 0.3904343582235237,
76
+ "eval_runtime": 19.738,
77
+ "eval_samples_per_second": 21.887,
78
+ "eval_steps_per_second": 5.472,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.4685212298682284,
83
+ "grad_norm": 694649.5625,
84
  "learning_rate": 9.53125e-05,
85
+ "loss": 4275.273,
86
  "step": 60
87
  },
88
  {
89
  "epoch": 0.4685212298682284,
90
+ "eval_runtime": 19.7487,
91
+ "eval_samples_per_second": 21.875,
92
+ "eval_steps_per_second": 5.469,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.5466081015129332,
97
+ "grad_norm": 375526.53125,
98
  "learning_rate": 9.453125000000001e-05,
99
+ "loss": 5022.9281,
100
  "step": 70
101
  },
102
  {
103
  "epoch": 0.5466081015129332,
104
+ "eval_runtime": 19.7307,
105
+ "eval_samples_per_second": 21.895,
106
+ "eval_steps_per_second": 5.474,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.6246949731576379,
111
+ "grad_norm": 603116.25,
112
  "learning_rate": 9.375e-05,
113
+ "loss": 3630.4668,
114
  "step": 80
115
  },
116
  {
117
  "epoch": 0.6246949731576379,
118
+ "eval_runtime": 19.7405,
119
+ "eval_samples_per_second": 21.884,
120
+ "eval_steps_per_second": 5.471,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.7027818448023426,
125
+ "grad_norm": 440137.71875,
126
  "learning_rate": 9.296875e-05,
127
+ "loss": 3183.9186,
128
  "step": 90
129
  },
130
  {
131
  "epoch": 0.7027818448023426,
132
+ "eval_runtime": 19.7964,
133
+ "eval_samples_per_second": 21.822,
134
+ "eval_steps_per_second": 5.456,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.7808687164470474,
139
+ "grad_norm": 574069.4375,
140
  "learning_rate": 9.21875e-05,
141
+ "loss": 3536.15,
142
  "step": 100
143
  },
144
  {
145
  "epoch": 0.7808687164470474,
146
+ "eval_runtime": 19.7759,
147
+ "eval_samples_per_second": 21.845,
148
+ "eval_steps_per_second": 5.461,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.8589555880917521,
153
+ "grad_norm": 639365.375,
154
  "learning_rate": 9.140625e-05,
155
+ "loss": 4252.8562,
156
  "step": 110
157
  },
158
  {
159
  "epoch": 0.8589555880917521,
160
+ "eval_runtime": 19.7724,
161
+ "eval_samples_per_second": 21.849,
162
+ "eval_steps_per_second": 5.462,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 0.9370424597364568,
167
+ "grad_norm": 398296.34375,
168
  "learning_rate": 9.062500000000001e-05,
169
+ "loss": 3483.168,
170
  "step": 120
171
  },
172
  {
173
  "epoch": 0.9370424597364568,
174
+ "eval_runtime": 19.7473,
175
+ "eval_samples_per_second": 21.876,
176
+ "eval_steps_per_second": 5.469,
177
  "step": 120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  }
179
  ],
180
  "logging_steps": 10,
 
189
  "should_evaluate": false,
190
  "should_log": false,
191
  "should_save": true,
192
+ "should_training_stop": false
193
  },
194
  "attributes": {}
195
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38c6a63ec4c4930b895b9b2c38482c3f00670f918edf566cda7d6c982c1fa789
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00e1815a75e87f99681c46c6d470e12a833e128941d3a562bbcb63c47c459934
3
  size 5368