Ba2han commited on
Commit
2b0502a
·
verified ·
1 Parent(s): 348e7b0

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df7f3da2bd850f33f7298372ff869be8f81c2b3405227fe6c9bd7c6f2f71a131
3
  size 1008303016
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa0366c83d309f7521675441df462578bea7f619437923da130a73f4b3e1ef98
3
  size 1008303016
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc159d75d63a73fbd137e19fda0856ce4e3720ab62cac1e9b38b17fedbdf4bf8
3
  size 1086712487
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44564ad6d459ee8b6824ecc63fbcda48f4afb20f4f7137194943df892c91515f
3
  size 1086712487
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c800b778fa7e115e4c34de8529902de8b61c9a1b4bab3eb8295d06dafff030e
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9efd33af97ed562c15fc83318701d580bcf56272c251b44d09ee6d97b4cc32c1
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c3b6e6139f923a24202d50bcbca49b224309c9139ea19d316d6f4729bb3d183
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccc24e81b738d82c9183e6957980e5f32ed351a8d1b2f1f22e0d6adc4bee1861
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4280936454849498,
6
  "eval_steps": 50,
7
- "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -366,6 +366,364 @@
366
  "eval_samples_per_second": 11.856,
367
  "eval_steps_per_second": 2.969,
368
  "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  }
370
  ],
371
  "logging_steps": 1,
@@ -385,7 +743,7 @@
385
  "attributes": {}
386
  }
387
  },
388
- "total_flos": 7.292068282073088e+16,
389
  "train_batch_size": 4,
390
  "trial_name": null,
391
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8561872909698997,
6
  "eval_steps": 50,
7
+ "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
366
  "eval_samples_per_second": 11.856,
367
  "eval_steps_per_second": 2.969,
368
  "step": 50
369
+ },
370
+ {
371
+ "epoch": 0.43665551839464883,
372
+ "grad_norm": 0.490234375,
373
+ "learning_rate": 0.02154667547631338,
374
+ "loss": 6.548553466796875,
375
+ "step": 51
376
+ },
377
+ {
378
+ "epoch": 0.44521739130434784,
379
+ "grad_norm": 0.4296875,
380
+ "learning_rate": 0.02140867714223579,
381
+ "loss": 6.469072341918945,
382
+ "step": 52
383
+ },
384
+ {
385
+ "epoch": 0.4537792642140468,
386
+ "grad_norm": 0.369140625,
387
+ "learning_rate": 0.021268436096329016,
388
+ "loss": 6.5030717849731445,
389
+ "step": 53
390
+ },
391
+ {
392
+ "epoch": 0.4623411371237458,
393
+ "grad_norm": 0.3984375,
394
+ "learning_rate": 0.0211259876435264,
395
+ "loss": 6.464090347290039,
396
+ "step": 54
397
+ },
398
+ {
399
+ "epoch": 0.4709030100334448,
400
+ "grad_norm": 0.4140625,
401
+ "learning_rate": 0.020981367644464153,
402
+ "loss": 6.415444374084473,
403
+ "step": 55
404
+ },
405
+ {
406
+ "epoch": 0.4794648829431438,
407
+ "grad_norm": 0.400390625,
408
+ "learning_rate": 0.020834612506453645,
409
+ "loss": 6.36262845993042,
410
+ "step": 56
411
+ },
412
+ {
413
+ "epoch": 0.48802675585284283,
414
+ "grad_norm": 0.421875,
415
+ "learning_rate": 0.020685759174316067,
416
+ "loss": 6.345418930053711,
417
+ "step": 57
418
+ },
419
+ {
420
+ "epoch": 0.4965886287625418,
421
+ "grad_norm": 0.333984375,
422
+ "learning_rate": 0.02053484512108174,
423
+ "loss": 6.309205532073975,
424
+ "step": 58
425
+ },
426
+ {
427
+ "epoch": 0.5051505016722408,
428
+ "grad_norm": 0.41015625,
429
+ "learning_rate": 0.020381908338556534,
430
+ "loss": 6.3184638023376465,
431
+ "step": 59
432
+ },
433
+ {
434
+ "epoch": 0.5137123745819397,
435
+ "grad_norm": 0.390625,
436
+ "learning_rate": 0.020226987327757566,
437
+ "loss": 6.273001670837402,
438
+ "step": 60
439
+ },
440
+ {
441
+ "epoch": 0.5222742474916388,
442
+ "grad_norm": 0.40234375,
443
+ "learning_rate": 0.020070121089220835,
444
+ "loss": 6.3029961585998535,
445
+ "step": 61
446
+ },
447
+ {
448
+ "epoch": 0.5308361204013378,
449
+ "grad_norm": 0.365234375,
450
+ "learning_rate": 0.01991134911318301,
451
+ "loss": 6.250678062438965,
452
+ "step": 62
453
+ },
454
+ {
455
+ "epoch": 0.5393979933110368,
456
+ "grad_norm": 0.359375,
457
+ "learning_rate": 0.01975071136963998,
458
+ "loss": 6.183889865875244,
459
+ "step": 63
460
+ },
461
+ {
462
+ "epoch": 0.5479598662207358,
463
+ "grad_norm": 0.3515625,
464
+ "learning_rate": 0.019588248298284636,
465
+ "loss": 6.137822151184082,
466
+ "step": 64
467
+ },
468
+ {
469
+ "epoch": 0.5565217391304348,
470
+ "grad_norm": 0.3203125,
471
+ "learning_rate": 0.01942400079832638,
472
+ "loss": 6.1106767654418945,
473
+ "step": 65
474
+ },
475
+ {
476
+ "epoch": 0.5650836120401338,
477
+ "grad_norm": 0.333984375,
478
+ "learning_rate": 0.01925801021819497,
479
+ "loss": 6.08446741104126,
480
+ "step": 66
481
+ },
482
+ {
483
+ "epoch": 0.5736454849498328,
484
+ "grad_norm": 0.302734375,
485
+ "learning_rate": 0.01909031834513128,
486
+ "loss": 6.131768226623535,
487
+ "step": 67
488
+ },
489
+ {
490
+ "epoch": 0.5822073578595318,
491
+ "grad_norm": 0.326171875,
492
+ "learning_rate": 0.018920967394667584,
493
+ "loss": 6.065019607543945,
494
+ "step": 68
495
+ },
496
+ {
497
+ "epoch": 0.5907692307692308,
498
+ "grad_norm": 0.3046875,
499
+ "learning_rate": 0.018750000000000003,
500
+ "loss": 6.0482282638549805,
501
+ "step": 69
502
+ },
503
+ {
504
+ "epoch": 0.5993311036789297,
505
+ "grad_norm": 0.357421875,
506
+ "learning_rate": 0.01857745920125586,
507
+ "loss": 6.048603057861328,
508
+ "step": 70
509
+ },
510
+ {
511
+ "epoch": 0.6078929765886287,
512
+ "grad_norm": 0.365234375,
513
+ "learning_rate": 0.018403388434658535,
514
+ "loss": 5.998836517333984,
515
+ "step": 71
516
+ },
517
+ {
518
+ "epoch": 0.6164548494983277,
519
+ "grad_norm": 0.3515625,
520
+ "learning_rate": 0.01822783152159263,
521
+ "loss": 5.9749650955200195,
522
+ "step": 72
523
+ },
524
+ {
525
+ "epoch": 0.6250167224080267,
526
+ "grad_norm": 0.30859375,
527
+ "learning_rate": 0.018050832657572177,
528
+ "loss": 5.919043064117432,
529
+ "step": 73
530
+ },
531
+ {
532
+ "epoch": 0.6335785953177258,
533
+ "grad_norm": 0.314453125,
534
+ "learning_rate": 0.017872436401114647,
535
+ "loss": 5.9278059005737305,
536
+ "step": 74
537
+ },
538
+ {
539
+ "epoch": 0.6421404682274248,
540
+ "grad_norm": 0.3046875,
541
+ "learning_rate": 0.017692687662523583,
542
+ "loss": 5.9064860343933105,
543
+ "step": 75
544
+ },
545
+ {
546
+ "epoch": 0.6507023411371238,
547
+ "grad_norm": 0.275390625,
548
+ "learning_rate": 0.01751163169258267,
549
+ "loss": 5.921048164367676,
550
+ "step": 76
551
+ },
552
+ {
553
+ "epoch": 0.6592642140468228,
554
+ "grad_norm": 0.28125,
555
+ "learning_rate": 0.017329314071164108,
556
+ "loss": 5.8522515296936035,
557
+ "step": 77
558
+ },
559
+ {
560
+ "epoch": 0.6678260869565218,
561
+ "grad_norm": 0.29296875,
562
+ "learning_rate": 0.017145780695754093,
563
+ "loss": 5.846250057220459,
564
+ "step": 78
565
+ },
566
+ {
567
+ "epoch": 0.6763879598662207,
568
+ "grad_norm": 0.37109375,
569
+ "learning_rate": 0.016961077769898397,
570
+ "loss": 5.8790435791015625,
571
+ "step": 79
572
+ },
573
+ {
574
+ "epoch": 0.6849498327759197,
575
+ "grad_norm": 0.30078125,
576
+ "learning_rate": 0.016775251791570862,
577
+ "loss": 5.849973201751709,
578
+ "step": 80
579
+ },
580
+ {
581
+ "epoch": 0.6935117056856187,
582
+ "grad_norm": 0.3125,
583
+ "learning_rate": 0.016588349541467772,
584
+ "loss": 5.8167405128479,
585
+ "step": 81
586
+ },
587
+ {
588
+ "epoch": 0.7020735785953177,
589
+ "grad_norm": 0.27734375,
590
+ "learning_rate": 0.016400418071231087,
591
+ "loss": 5.796406269073486,
592
+ "step": 82
593
+ },
594
+ {
595
+ "epoch": 0.7106354515050167,
596
+ "grad_norm": 0.30859375,
597
+ "learning_rate": 0.01621150469160344,
598
+ "loss": 5.828444004058838,
599
+ "step": 83
600
+ },
601
+ {
602
+ "epoch": 0.7191973244147157,
603
+ "grad_norm": 0.373046875,
604
+ "learning_rate": 0.016021656960517872,
605
+ "loss": 5.763159275054932,
606
+ "step": 84
607
+ },
608
+ {
609
+ "epoch": 0.7277591973244147,
610
+ "grad_norm": 0.3359375,
611
+ "learning_rate": 0.015830922671125437,
612
+ "loss": 5.761376857757568,
613
+ "step": 85
614
+ },
615
+ {
616
+ "epoch": 0.7363210702341138,
617
+ "grad_norm": 0.3515625,
618
+ "learning_rate": 0.015639349839763488,
619
+ "loss": 5.724917411804199,
620
+ "step": 86
621
+ },
622
+ {
623
+ "epoch": 0.7448829431438128,
624
+ "grad_norm": 0.283203125,
625
+ "learning_rate": 0.015446986693867843,
626
+ "loss": 5.740510940551758,
627
+ "step": 87
628
+ },
629
+ {
630
+ "epoch": 0.7534448160535117,
631
+ "grad_norm": 0.275390625,
632
+ "learning_rate": 0.015253881659831759,
633
+ "loss": 5.739223480224609,
634
+ "step": 88
635
+ },
636
+ {
637
+ "epoch": 0.7620066889632107,
638
+ "grad_norm": 0.2890625,
639
+ "learning_rate": 0.015060083350814886,
640
+ "loss": 5.749945163726807,
641
+ "step": 89
642
+ },
643
+ {
644
+ "epoch": 0.7705685618729097,
645
+ "grad_norm": 0.294921875,
646
+ "learning_rate": 0.014865640554505129,
647
+ "loss": 5.627425670623779,
648
+ "step": 90
649
+ },
650
+ {
651
+ "epoch": 0.7791304347826087,
652
+ "grad_norm": 0.287109375,
653
+ "learning_rate": 0.014670602220836632,
654
+ "loss": 5.6766133308410645,
655
+ "step": 91
656
+ },
657
+ {
658
+ "epoch": 0.7876923076923077,
659
+ "grad_norm": 0.263671875,
660
+ "learning_rate": 0.014475017449666875,
661
+ "loss": 5.628288269042969,
662
+ "step": 92
663
+ },
664
+ {
665
+ "epoch": 0.7962541806020067,
666
+ "grad_norm": 0.287109375,
667
+ "learning_rate": 0.014278935478416067,
668
+ "loss": 5.660680294036865,
669
+ "step": 93
670
+ },
671
+ {
672
+ "epoch": 0.8048160535117057,
673
+ "grad_norm": 0.25390625,
674
+ "learning_rate": 0.014082405669671866,
675
+ "loss": 5.6099348068237305,
676
+ "step": 94
677
+ },
678
+ {
679
+ "epoch": 0.8133779264214047,
680
+ "grad_norm": 0.265625,
681
+ "learning_rate": 0.013885477498762639,
682
+ "loss": 5.595584869384766,
683
+ "step": 95
684
+ },
685
+ {
686
+ "epoch": 0.8219397993311037,
687
+ "grad_norm": 0.2431640625,
688
+ "learning_rate": 0.013688200541302282,
689
+ "loss": 5.5869598388671875,
690
+ "step": 96
691
+ },
692
+ {
693
+ "epoch": 0.8305016722408026,
694
+ "grad_norm": 0.259765625,
695
+ "learning_rate": 0.013490624460709855,
696
+ "loss": 5.562591552734375,
697
+ "step": 97
698
+ },
699
+ {
700
+ "epoch": 0.8390635451505016,
701
+ "grad_norm": 0.2734375,
702
+ "learning_rate": 0.013292798995707057,
703
+ "loss": 5.562747955322266,
704
+ "step": 98
705
+ },
706
+ {
707
+ "epoch": 0.8476254180602006,
708
+ "grad_norm": 0.271484375,
709
+ "learning_rate": 0.013094773947796783,
710
+ "loss": 5.608129501342773,
711
+ "step": 99
712
+ },
713
+ {
714
+ "epoch": 0.8561872909698997,
715
+ "grad_norm": 0.265625,
716
+ "learning_rate": 0.012896599168725847,
717
+ "loss": 5.565805912017822,
718
+ "step": 100
719
+ },
720
+ {
721
+ "epoch": 0.8561872909698997,
722
+ "eval_loss": 5.537259578704834,
723
+ "eval_runtime": 51.1142,
724
+ "eval_samples_per_second": 11.954,
725
+ "eval_steps_per_second": 2.993,
726
+ "step": 100
727
  }
728
  ],
729
  "logging_steps": 1,
 
743
  "attributes": {}
744
  }
745
  },
746
+ "total_flos": 1.4579790139891507e+17,
747
  "train_batch_size": 4,
748
  "trial_name": null,
749
  "trial_params": null