Fanucci commited on
Commit
b74cbc2
·
verified ·
1 Parent(s): 5322f9f

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61839d9cde8235147437788fd1dc57b62d264040f11811ba7bece1d4a43194a3
3
  size 4995335576
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7851edd1043360267b2db8577a31064ca1621c04e7df4289b379b9f8a9793999
3
  size 4995335576
last-checkpoint/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bb657d86345e8f9e9e47568f554a968f17dc699b34d4a4ce21ef508e3ddd9bc
3
  size 1857639032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89a804dd9adf23eb7826763228d6ef8cbd45f3a451d852daf22e2c4f088aba1d
3
  size 1857639032
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4807a409fc6314d809b3b7c3f773f26fa381874359f3107093b715584c85d47
3
  size 13706103974
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36f0fb1c7e888518dcdfd67ad8c4327597ddef100f84eb6544a0f16da3ca1898
3
  size 13706103974
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9213080fe2b45399b87036ca9ff9164533abe6b368e5c828136ee184486749d4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cf9097d4513154245c48236b6ec5137b7ee2a21c9f58f2cba798ea275c6026f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:462c3770d14e466903ac3cbb8d02a07b05bb99c1e78e9ab65cd1a8165b933c02
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03bfcb5cd3899a02f7a89e2033f35f63eb1a6773ac4ce6695121020cac9264f0
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 1,
3
  "best_metric": 1.4945952892303467,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.04996876951905059,
6
  "eval_steps": 50,
7
- "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12,698 +12,53 @@
12
  {
13
  "epoch": 6.246096189881324e-05,
14
  "eval_loss": 1.4945952892303467,
15
- "eval_runtime": 43.5116,
16
- "eval_samples_per_second": 19.374,
17
- "eval_steps_per_second": 19.374,
18
  "step": 1
19
  },
20
  {
21
  "epoch": 0.0006246096189881324,
22
- "grad_norm": 90.5,
23
  "learning_rate": 0.045000000000000005,
24
- "loss": 54.0301,
25
  "step": 10
26
  },
27
  {
28
  "epoch": 0.0012492192379762648,
29
- "grad_norm": 138.0,
30
  "learning_rate": 0.04998980482070473,
31
- "loss": 261.8933,
32
  "step": 20
33
  },
34
  {
35
  "epoch": 0.0018738288569643974,
36
- "grad_norm": 143.0,
37
  "learning_rate": 0.049954572901111285,
38
- "loss": 327.693,
39
  "step": 30
40
  },
41
  {
42
  "epoch": 0.0024984384759525295,
43
- "grad_norm": 704.0,
44
  "learning_rate": 0.04989421384191499,
45
- "loss": 360.9668,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 0.003123048094940662,
50
- "grad_norm": 12352.0,
51
  "learning_rate": 0.04980878841957203,
52
- "loss": 321.0979,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.003123048094940662,
57
- "eval_loss": 220.1619415283203,
58
- "eval_runtime": 43.4736,
59
- "eval_samples_per_second": 19.391,
60
- "eval_steps_per_second": 19.391,
61
  "step": 50
62
- },
63
- {
64
- "epoch": 0.0037476577139287947,
65
- "grad_norm": 46.25,
66
- "learning_rate": 0.049698382650241506,
67
- "loss": 227.9017,
68
- "step": 60
69
- },
70
- {
71
- "epoch": 0.004372267332916927,
72
- "grad_norm": 129.0,
73
- "learning_rate": 0.04956310770317444,
74
- "loss": 91.6515,
75
- "step": 70
76
- },
77
- {
78
- "epoch": 0.004996876951905059,
79
- "grad_norm": 25.375,
80
- "learning_rate": 0.04940309978877575,
81
- "loss": 43.7426,
82
- "step": 80
83
- },
84
- {
85
- "epoch": 0.005621486570893191,
86
- "grad_norm": 26.125,
87
- "learning_rate": 0.04921852002145197,
88
- "loss": 35.9591,
89
- "step": 90
90
- },
91
- {
92
- "epoch": 0.006246096189881324,
93
- "grad_norm": 71.5,
94
- "learning_rate": 0.04900955425738262,
95
- "loss": 25.3901,
96
- "step": 100
97
- },
98
- {
99
- "epoch": 0.006246096189881324,
100
- "eval_loss": 21.986961364746094,
101
- "eval_runtime": 53.4891,
102
- "eval_samples_per_second": 15.76,
103
- "eval_steps_per_second": 15.76,
104
- "step": 100
105
- },
106
- {
107
- "epoch": 0.006870705808869456,
108
- "grad_norm": 29.75,
109
- "learning_rate": 0.048776412907378844,
110
- "loss": 16.5617,
111
- "step": 110
112
- },
113
- {
114
- "epoch": 0.007495315427857589,
115
- "grad_norm": 29.75,
116
- "learning_rate": 0.04851933072501756,
117
- "loss": 11.0683,
118
- "step": 120
119
- },
120
- {
121
- "epoch": 0.008119925046845722,
122
- "grad_norm": 41.5,
123
- "learning_rate": 0.048238566570264485,
124
- "loss": 9.7898,
125
- "step": 130
126
- },
127
- {
128
- "epoch": 0.008744534665833853,
129
- "grad_norm": 26.75,
130
- "learning_rate": 0.047934403148824085,
131
- "loss": 9.437,
132
- "step": 140
133
- },
134
- {
135
- "epoch": 0.009369144284821987,
136
- "grad_norm": 14.1875,
137
- "learning_rate": 0.047607146727478934,
138
- "loss": 8.8562,
139
- "step": 150
140
- },
141
- {
142
- "epoch": 0.009369144284821987,
143
- "eval_loss": 10.050978660583496,
144
- "eval_runtime": 54.8139,
145
- "eval_samples_per_second": 15.379,
146
- "eval_steps_per_second": 15.379,
147
- "step": 150
148
- },
149
- {
150
- "epoch": 0.009993753903810118,
151
- "grad_norm": 26.0,
152
- "learning_rate": 0.04725712682570498,
153
- "loss": 10.5338,
154
- "step": 160
155
- },
156
- {
157
- "epoch": 0.010618363522798251,
158
- "grad_norm": 28.125,
159
- "learning_rate": 0.046884695883873395,
160
- "loss": 9.3137,
161
- "step": 170
162
- },
163
- {
164
- "epoch": 0.011242973141786383,
165
- "grad_norm": 37.75,
166
- "learning_rate": 0.04649022890837298,
167
- "loss": 9.4597,
168
- "step": 180
169
- },
170
- {
171
- "epoch": 0.011867582760774516,
172
- "grad_norm": 46.75,
173
- "learning_rate": 0.046074123094010544,
174
- "loss": 9.584,
175
- "step": 190
176
- },
177
- {
178
- "epoch": 0.012492192379762648,
179
- "grad_norm": 48.25,
180
- "learning_rate": 0.04563679742406935,
181
- "loss": 9.6385,
182
- "step": 200
183
- },
184
- {
185
- "epoch": 0.012492192379762648,
186
- "eval_loss": 9.662774085998535,
187
- "eval_runtime": 53.0984,
188
- "eval_samples_per_second": 15.876,
189
- "eval_steps_per_second": 15.876,
190
- "step": 200
191
- },
192
- {
193
- "epoch": 0.01311680199875078,
194
- "grad_norm": 100.5,
195
- "learning_rate": 0.045178692248428534,
196
- "loss": 10.0078,
197
- "step": 210
198
- },
199
- {
200
- "epoch": 0.013741411617738912,
201
- "grad_norm": 51.5,
202
- "learning_rate": 0.04470026884016805,
203
- "loss": 9.4955,
204
- "step": 220
205
- },
206
- {
207
- "epoch": 0.014366021236727046,
208
- "grad_norm": 7.71875,
209
- "learning_rate": 0.0442020089311058,
210
- "loss": 8.3445,
211
- "step": 230
212
- },
213
- {
214
- "epoch": 0.014990630855715179,
215
- "grad_norm": 13.0,
216
- "learning_rate": 0.043684414226734525,
217
- "loss": 8.7676,
218
- "step": 240
219
- },
220
- {
221
- "epoch": 0.01561524047470331,
222
- "grad_norm": 15.625,
223
- "learning_rate": 0.04314800590104691,
224
- "loss": 9.141,
225
- "step": 250
226
- },
227
- {
228
- "epoch": 0.01561524047470331,
229
- "eval_loss": 8.216238975524902,
230
- "eval_runtime": 55.3897,
231
- "eval_samples_per_second": 15.219,
232
- "eval_steps_per_second": 15.219,
233
- "step": 250
234
- },
235
- {
236
- "epoch": 0.016239850093691444,
237
- "grad_norm": 19.0,
238
- "learning_rate": 0.04259332407175751,
239
- "loss": 8.5766,
240
- "step": 260
241
- },
242
- {
243
- "epoch": 0.016864459712679577,
244
- "grad_norm": 282.0,
245
- "learning_rate": 0.04202092725645009,
246
- "loss": 10.2655,
247
- "step": 270
248
- },
249
- {
250
- "epoch": 0.017489069331667707,
251
- "grad_norm": 7.53125,
252
- "learning_rate": 0.04143139181019764,
253
- "loss": 8.9695,
254
- "step": 280
255
- },
256
- {
257
- "epoch": 0.01811367895065584,
258
- "grad_norm": 31.5,
259
- "learning_rate": 0.040825311345221764,
260
- "loss": 8.7729,
261
- "step": 290
262
- },
263
- {
264
- "epoch": 0.018738288569643973,
265
- "grad_norm": 14.125,
266
- "learning_rate": 0.04020329613317545,
267
- "loss": 9.6525,
268
- "step": 300
269
- },
270
- {
271
- "epoch": 0.018738288569643973,
272
- "eval_loss": 8.672038078308105,
273
- "eval_runtime": 52.539,
274
- "eval_samples_per_second": 16.045,
275
- "eval_steps_per_second": 16.045,
276
- "step": 300
277
- },
278
- {
279
- "epoch": 0.019362898188632106,
280
- "grad_norm": 15.8125,
281
- "learning_rate": 0.03956597249065126,
282
- "loss": 8.3444,
283
- "step": 310
284
- },
285
- {
286
- "epoch": 0.019987507807620236,
287
- "grad_norm": 26.25,
288
- "learning_rate": 0.0389139821485336,
289
- "loss": 8.1302,
290
- "step": 320
291
- },
292
- {
293
- "epoch": 0.02061211742660837,
294
- "grad_norm": 32.25,
295
- "learning_rate": 0.03824798160583012,
296
- "loss": 8.5943,
297
- "step": 330
298
- },
299
- {
300
- "epoch": 0.021236727045596503,
301
- "grad_norm": 22.375,
302
- "learning_rate": 0.037568641468632896,
303
- "loss": 8.2872,
304
- "step": 340
305
- },
306
- {
307
- "epoch": 0.021861336664584636,
308
- "grad_norm": 35.5,
309
- "learning_rate": 0.03687664577487488,
310
- "loss": 9.0739,
311
- "step": 350
312
- },
313
- {
314
- "epoch": 0.021861336664584636,
315
- "eval_loss": 10.262110710144043,
316
- "eval_runtime": 53.54,
317
- "eval_samples_per_second": 15.745,
318
- "eval_steps_per_second": 15.745,
319
- "step": 350
320
- },
321
- {
322
- "epoch": 0.022485946283572766,
323
- "grad_norm": 356.0,
324
- "learning_rate": 0.03617269130556171,
325
- "loss": 10.7064,
326
- "step": 360
327
- },
328
- {
329
- "epoch": 0.0231105559025609,
330
- "grad_norm": 4512.0,
331
- "learning_rate": 0.035457486883172316,
332
- "loss": 8.8088,
333
- "step": 370
334
- },
335
- {
336
- "epoch": 0.023735165521549032,
337
- "grad_norm": 193536.0,
338
- "learning_rate": 0.03473175265793479,
339
- "loss": 10.325,
340
- "step": 380
341
- },
342
- {
343
- "epoch": 0.024359775140537165,
344
- "grad_norm": 173.0,
345
- "learning_rate": 0.033996219382696063,
346
- "loss": 9.4952,
347
- "step": 390
348
- },
349
- {
350
- "epoch": 0.024984384759525295,
351
- "grad_norm": 120.0,
352
- "learning_rate": 0.033251627677115835,
353
- "loss": 10.3895,
354
- "step": 400
355
- },
356
- {
357
- "epoch": 0.024984384759525295,
358
- "eval_loss": 9.044400215148926,
359
- "eval_runtime": 55.2547,
360
- "eval_samples_per_second": 15.257,
361
- "eval_steps_per_second": 15.257,
362
- "step": 400
363
- },
364
- {
365
- "epoch": 0.02560899437851343,
366
- "grad_norm": 64768.0,
367
- "learning_rate": 0.032498727281925266,
368
- "loss": 9.7613,
369
- "step": 410
370
- },
371
- {
372
- "epoch": 0.02623360399750156,
373
- "grad_norm": 2179072.0,
374
- "learning_rate": 0.0317382763040017,
375
- "loss": 9.7948,
376
- "step": 420
377
- },
378
- {
379
- "epoch": 0.026858213616489695,
380
- "grad_norm": 27136.0,
381
- "learning_rate": 0.030971040453019225,
382
- "loss": 9.0155,
383
- "step": 430
384
- },
385
- {
386
- "epoch": 0.027482823235477825,
387
- "grad_norm": 71168.0,
388
- "learning_rate": 0.03019779227044398,
389
- "loss": 11.3283,
390
- "step": 440
391
- },
392
- {
393
- "epoch": 0.028107432854465958,
394
- "grad_norm": 8768.0,
395
- "learning_rate": 0.029419310351650393,
396
- "loss": 8.5037,
397
- "step": 450
398
- },
399
- {
400
- "epoch": 0.028107432854465958,
401
- "eval_loss": 8.112160682678223,
402
- "eval_runtime": 53.9319,
403
- "eval_samples_per_second": 15.631,
404
- "eval_steps_per_second": 15.631,
405
- "step": 450
406
- },
407
- {
408
- "epoch": 0.02873204247345409,
409
- "grad_norm": 650117120.0,
410
- "learning_rate": 0.02863637856194159,
411
- "loss": 8.5855,
412
- "step": 460
413
- },
414
- {
415
- "epoch": 0.029356652092442224,
416
- "grad_norm": 11008.0,
417
- "learning_rate": 0.027849785247263517,
418
- "loss": 8.1377,
419
- "step": 470
420
- },
421
- {
422
- "epoch": 0.029981261711430358,
423
- "grad_norm": 2491081031680.0,
424
- "learning_rate": 0.02706032244040741,
425
- "loss": 8.1602,
426
- "step": 480
427
- },
428
- {
429
- "epoch": 0.030605871330418487,
430
- "grad_norm": 15269888.0,
431
- "learning_rate": 0.026268785063499858,
432
- "loss": 7.909,
433
- "step": 490
434
- },
435
- {
436
- "epoch": 0.03123048094940662,
437
- "grad_norm": 245366784.0,
438
- "learning_rate": 0.025475970127583666,
439
- "loss": 7.9959,
440
- "step": 500
441
- },
442
- {
443
- "epoch": 0.03123048094940662,
444
- "eval_loss": 8.136107444763184,
445
- "eval_runtime": 56.6292,
446
- "eval_samples_per_second": 14.886,
447
- "eval_steps_per_second": 14.886,
448
- "step": 500
449
- },
450
- {
451
- "epoch": 0.03185509056839475,
452
- "grad_norm": 958464.0,
453
- "learning_rate": 0.024682675930095266,
454
- "loss": 8.2639,
455
- "step": 510
456
- },
457
- {
458
- "epoch": 0.03247970018738289,
459
- "grad_norm": 2483027968.0,
460
- "learning_rate": 0.02388970125104685,
461
- "loss": 8.0817,
462
- "step": 520
463
- },
464
- {
465
- "epoch": 0.03310430980637102,
466
- "grad_norm": 2.386907802506363e+17,
467
- "learning_rate": 0.02309784454872262,
468
- "loss": 9.0499,
469
- "step": 530
470
- },
471
- {
472
- "epoch": 0.033728919425359154,
473
- "grad_norm": 225280.0,
474
- "learning_rate": 0.022307903155699027,
475
- "loss": 7.8657,
476
- "step": 540
477
- },
478
- {
479
- "epoch": 0.034353529044347283,
480
- "grad_norm": 1171456.0,
481
- "learning_rate": 0.02152067247599837,
482
- "loss": 7.8539,
483
- "step": 550
484
- },
485
- {
486
- "epoch": 0.034353529044347283,
487
- "eval_loss": 7.987682819366455,
488
- "eval_runtime": 56.3279,
489
- "eval_samples_per_second": 14.966,
490
- "eval_steps_per_second": 14.966,
491
- "step": 550
492
- },
493
- {
494
- "epoch": 0.03497813866333541,
495
- "grad_norm": 83361792.0,
496
- "learning_rate": 0.020736945184184407,
497
- "loss": 7.8864,
498
- "step": 560
499
- },
500
- {
501
- "epoch": 0.03560274828232355,
502
- "grad_norm": 47710208.0,
503
- "learning_rate": 0.019957510427206296,
504
- "loss": 9.0771,
505
- "step": 570
506
- },
507
- {
508
- "epoch": 0.03622735790131168,
509
- "grad_norm": 367001600.0,
510
- "learning_rate": 0.01918315302979444,
511
- "loss": 18.2783,
512
- "step": 580
513
- },
514
- {
515
- "epoch": 0.03685196752029981,
516
- "grad_norm": 19840.0,
517
- "learning_rate": 0.018414652704208584,
518
- "loss": 19.6834,
519
- "step": 590
520
- },
521
- {
522
- "epoch": 0.037476577139287946,
523
- "grad_norm": 11392.0,
524
- "learning_rate": 0.017652783265133608,
525
- "loss": 13.4558,
526
- "step": 600
527
- },
528
- {
529
- "epoch": 0.037476577139287946,
530
- "eval_loss": 10.698236465454102,
531
- "eval_runtime": 54.0772,
532
- "eval_samples_per_second": 15.589,
533
- "eval_steps_per_second": 15.589,
534
- "step": 600
535
- },
536
- {
537
- "epoch": 0.038101186758276076,
538
- "grad_norm": 772.0,
539
- "learning_rate": 0.01689831185051374,
540
- "loss": 10.3479,
541
- "step": 610
542
- },
543
- {
544
- "epoch": 0.03872579637726421,
545
- "grad_norm": 30.0,
546
- "learning_rate": 0.016151998149109708,
547
- "loss": 9.0419,
548
- "step": 620
549
- },
550
- {
551
- "epoch": 0.03935040599625234,
552
- "grad_norm": 36.0,
553
- "learning_rate": 0.015414593635556518,
554
- "loss": 8.9411,
555
- "step": 630
556
- },
557
- {
558
- "epoch": 0.03997501561524047,
559
- "grad_norm": 5216.0,
560
- "learning_rate": 0.014686840813692224,
561
- "loss": 8.2684,
562
- "step": 640
563
- },
564
- {
565
- "epoch": 0.04059962523422861,
566
- "grad_norm": 12288.0,
567
- "learning_rate": 0.013969472468919462,
568
- "loss": 8.5109,
569
- "step": 650
570
- },
571
- {
572
- "epoch": 0.04059962523422861,
573
- "eval_loss": 8.088459014892578,
574
- "eval_runtime": 53.555,
575
- "eval_samples_per_second": 15.741,
576
- "eval_steps_per_second": 15.741,
577
- "step": 650
578
- },
579
- {
580
- "epoch": 0.04122423485321674,
581
- "grad_norm": 2672.0,
582
- "learning_rate": 0.013263210930352737,
583
- "loss": 7.9423,
584
- "step": 660
585
- },
586
- {
587
- "epoch": 0.04184884447220487,
588
- "grad_norm": 47185920.0,
589
- "learning_rate": 0.01256876734349413,
590
- "loss": 8.6554,
591
- "step": 670
592
- },
593
- {
594
- "epoch": 0.042473454091193005,
595
- "grad_norm": 31.125,
596
- "learning_rate": 0.011886840954170141,
597
- "loss": 8.6544,
598
- "step": 680
599
- },
600
- {
601
- "epoch": 0.043098063710181135,
602
- "grad_norm": 916.0,
603
- "learning_rate": 0.011218118404450424,
604
- "loss": 8.0047,
605
- "step": 690
606
- },
607
- {
608
- "epoch": 0.04372267332916927,
609
- "grad_norm": 44.25,
610
- "learning_rate": 0.010563273041257332,
611
- "loss": 8.3487,
612
- "step": 700
613
- },
614
- {
615
- "epoch": 0.04372267332916927,
616
- "eval_loss": 8.62705135345459,
617
- "eval_runtime": 56.6085,
618
- "eval_samples_per_second": 14.892,
619
- "eval_steps_per_second": 14.892,
620
- "step": 700
621
- },
622
- {
623
- "epoch": 0.0443472829481574,
624
- "grad_norm": 47.5,
625
- "learning_rate": 0.009922964238362761,
626
- "loss": 8.9014,
627
- "step": 710
628
- },
629
- {
630
- "epoch": 0.04497189256714553,
631
- "grad_norm": 7840.0,
632
- "learning_rate": 0.009297836732454564,
633
- "loss": 7.7725,
634
- "step": 720
635
- },
636
- {
637
- "epoch": 0.04559650218613367,
638
- "grad_norm": 4194304.0,
639
- "learning_rate": 0.0086885199739414,
640
- "loss": 7.9401,
641
- "step": 730
642
- },
643
- {
644
- "epoch": 0.0462211118051218,
645
- "grad_norm": 16.625,
646
- "learning_rate": 0.00809562749314952,
647
- "loss": 7.5247,
648
- "step": 740
649
- },
650
- {
651
- "epoch": 0.046845721424109935,
652
- "grad_norm": 151.0,
653
- "learning_rate": 0.0075197562825497334,
654
- "loss": 8.3288,
655
- "step": 750
656
- },
657
- {
658
- "epoch": 0.046845721424109935,
659
- "eval_loss": 8.920095443725586,
660
- "eval_runtime": 56.4634,
661
- "eval_samples_per_second": 14.93,
662
- "eval_steps_per_second": 14.93,
663
- "step": 750
664
- },
665
- {
666
- "epoch": 0.047470331043098064,
667
- "grad_norm": 12.0625,
668
- "learning_rate": 0.006961486195636613,
669
- "loss": 8.1405,
670
- "step": 760
671
- },
672
- {
673
- "epoch": 0.048094940662086194,
674
- "grad_norm": 258.0,
675
- "learning_rate": 0.006421379363065142,
676
- "loss": 8.3261,
677
- "step": 770
678
- },
679
- {
680
- "epoch": 0.04871955028107433,
681
- "grad_norm": 1664.0,
682
- "learning_rate": 0.005899979626632835,
683
- "loss": 7.6633,
684
- "step": 780
685
- },
686
- {
687
- "epoch": 0.04934415990006246,
688
- "grad_norm": 12.25,
689
- "learning_rate": 0.005397811991677107,
690
- "loss": 7.9004,
691
- "step": 790
692
- },
693
- {
694
- "epoch": 0.04996876951905059,
695
- "grad_norm": 23.5,
696
- "learning_rate": 0.0049153820984394365,
697
- "loss": 7.9413,
698
- "step": 800
699
- },
700
- {
701
- "epoch": 0.04996876951905059,
702
- "eval_loss": 7.851413249969482,
703
- "eval_runtime": 56.9045,
704
- "eval_samples_per_second": 14.814,
705
- "eval_steps_per_second": 14.814,
706
- "step": 800
707
  }
708
  ],
709
  "logging_steps": 10,
@@ -718,7 +73,7 @@
718
  "early_stopping_threshold": 0.0
719
  },
720
  "attributes": {
721
- "early_stopping_patience_counter": 16
722
  }
723
  },
724
  "TrainerControl": {
@@ -732,7 +87,7 @@
732
  "attributes": {}
733
  }
734
  },
735
- "total_flos": 3.267697311744e+16,
736
  "train_batch_size": 1,
737
  "trial_name": null,
738
  "trial_params": null
 
2
  "best_global_step": 1,
3
  "best_metric": 1.4945952892303467,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.003123048094940662,
6
  "eval_steps": 50,
7
+ "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12
  {
13
  "epoch": 6.246096189881324e-05,
14
  "eval_loss": 1.4945952892303467,
15
+ "eval_runtime": 43.2738,
16
+ "eval_samples_per_second": 19.481,
17
+ "eval_steps_per_second": 19.481,
18
  "step": 1
19
  },
20
  {
21
  "epoch": 0.0006246096189881324,
22
+ "grad_norm": 116.0,
23
  "learning_rate": 0.045000000000000005,
24
+ "loss": 44.6564,
25
  "step": 10
26
  },
27
  {
28
  "epoch": 0.0012492192379762648,
29
+ "grad_norm": 81.5,
30
  "learning_rate": 0.04998980482070473,
31
+ "loss": 266.9058,
32
  "step": 20
33
  },
34
  {
35
  "epoch": 0.0018738288569643974,
36
+ "grad_norm": 126.0,
37
  "learning_rate": 0.049954572901111285,
38
+ "loss": 349.7276,
39
  "step": 30
40
  },
41
  {
42
  "epoch": 0.0024984384759525295,
43
+ "grad_norm": 47.25,
44
  "learning_rate": 0.04989421384191499,
45
+ "loss": 355.1523,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 0.003123048094940662,
50
+ "grad_norm": 21.625,
51
  "learning_rate": 0.04980878841957203,
52
+ "loss": 271.2299,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.003123048094940662,
57
+ "eval_loss": 138.12083435058594,
58
+ "eval_runtime": 43.2248,
59
+ "eval_samples_per_second": 19.503,
60
+ "eval_steps_per_second": 19.503,
61
  "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  }
63
  ],
64
  "logging_steps": 10,
 
73
  "early_stopping_threshold": 0.0
74
  },
75
  "attributes": {
76
+ "early_stopping_patience_counter": 1
77
  }
78
  },
79
  "TrainerControl": {
 
87
  "attributes": {}
88
  }
89
  },
90
+ "total_flos": 2042310819840000.0,
91
  "train_batch_size": 1,
92
  "trial_name": null,
93
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8703975468e7453f9b0532194f6849fa8472145a7cf0016c4bdd9844e783cf7
3
  size 7160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1e61a95b469ad587100b089121f3916eacaeca053d36c6b400373d16e858628
3
  size 7160