eageringdev commited on
Commit
2c2e2e0
·
verified ·
1 Parent(s): b1cb713

Training in progress, step 116, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f665f4d8bec07fb34d9167a74519000ca1c07ab4c7bda4abe394ba88ac21e636
3
  size 48679352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28a1ae218f1ff6eb872136adb7304db287346d66909b1eb743a4dd4fbcb2e6b7
3
  size 48679352
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eebb42e267e22b0dcd069ba21ca540bf94385bb45036cf004d0ea00a0a68f6f1
3
  size 25152500
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850d4377277d6b716ba444a1d4bd4d3ee5948814c02b5d64a4cb5cd9d04f1b5c
3
  size 25152500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e43153d8095dfe754b5facaefcfbc05a2190eef6bcf4b8e8d6a0d880bacca91f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a8ecfa01e7c54e7ebac11aad61255a493edc13101f10e4edddeff0bacdcd859
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:969c56c88a0cef1fee3d363c20e94622cc83295b65bf6f7189fcb71ab5f0d40d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db974519fd77fbcd1d1516436fa53f7a6999ec0d08fdab2b48306286e57ccd6
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.754880694143167,
5
  "eval_steps": 500,
6
- "global_step": 87,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -616,6 +616,217 @@
616
  "learning_rate": 1.5917335155023367e-05,
617
  "loss": 1.1773,
618
  "step": 87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
  }
620
  ],
621
  "logging_steps": 1,
@@ -630,12 +841,12 @@
630
  "should_evaluate": false,
631
  "should_log": false,
632
  "should_save": true,
633
- "should_training_stop": false
634
  },
635
  "attributes": {}
636
  }
637
  },
638
- "total_flos": 2.50460296445952e+16,
639
  "train_batch_size": 4,
640
  "trial_name": null,
641
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0065075921908895,
5
  "eval_steps": 500,
6
+ "global_step": 116,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
616
  "learning_rate": 1.5917335155023367e-05,
617
  "loss": 1.1773,
618
  "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.7635574837310195,
622
+ "grad_norm": 0.32429060339927673,
623
+ "learning_rate": 1.4895706208868875e-05,
624
+ "loss": 1.2633,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.7722342733188721,
629
+ "grad_norm": 0.31287485361099243,
630
+ "learning_rate": 1.3902195302273779e-05,
631
+ "loss": 1.3551,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.7809110629067245,
636
+ "grad_norm": 0.33581656217575073,
637
+ "learning_rate": 1.2937598223330005e-05,
638
+ "loss": 1.3166,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.789587852494577,
643
+ "grad_norm": 0.32596108317375183,
644
+ "learning_rate": 1.2002687600565137e-05,
645
+ "loss": 1.3277,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.7982646420824295,
650
+ "grad_norm": 0.3356931507587433,
651
+ "learning_rate": 1.1098212284078036e-05,
652
+ "loss": 1.2979,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.806941431670282,
657
+ "grad_norm": 0.3395873010158539,
658
+ "learning_rate": 1.0224896745720514e-05,
659
+ "loss": 1.2646,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.8156182212581344,
664
+ "grad_norm": 0.3901115953922272,
665
+ "learning_rate": 9.383440498805712e-06,
666
+ "loss": 1.2756,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.824295010845987,
671
+ "grad_norm": 0.4069584012031555,
672
+ "learning_rate": 8.574517537807897e-06,
673
+ "loss": 1.5386,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.8329718004338394,
678
+ "grad_norm": 0.37969765067100525,
679
+ "learning_rate": 7.798775798502483e-06,
680
+ "loss": 1.3174,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.841648590021692,
685
+ "grad_norm": 0.3992767930030823,
686
+ "learning_rate": 7.0568366389786975e-06,
687
+ "loss": 1.2494,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.8503253796095445,
692
+ "grad_norm": 0.4310814142227173,
693
+ "learning_rate": 6.349294341940593e-06,
694
+ "loss": 1.2746,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.8590021691973969,
699
+ "grad_norm": 0.4858376681804657,
700
+ "learning_rate": 5.676715638695063e-06,
701
+ "loss": 1.3332,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.8676789587852495,
706
+ "grad_norm": 0.4930087924003601,
707
+ "learning_rate": 5.0396392552081564e-06,
708
+ "loss": 1.1824,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.8763557483731019,
713
+ "grad_norm": 0.5185401439666748,
714
+ "learning_rate": 4.43857548059321e-06,
715
+ "loss": 1.4682,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.8850325379609545,
720
+ "grad_norm": 0.5266461968421936,
721
+ "learning_rate": 3.87400575837657e-06,
722
+ "loss": 1.503,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.8937093275488069,
727
+ "grad_norm": 0.5809972286224365,
728
+ "learning_rate": 3.346382300868134e-06,
729
+ "loss": 1.641,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.9023861171366594,
734
+ "grad_norm": 0.6158227920532227,
735
+ "learning_rate": 2.85612772694579e-06,
736
+ "loss": 1.4546,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.911062906724512,
741
+ "grad_norm": 0.6743486523628235,
742
+ "learning_rate": 2.403634723543674e-06,
743
+ "loss": 1.6859,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.9197396963123644,
748
+ "grad_norm": 0.7346360683441162,
749
+ "learning_rate": 1.9892657311155248e-06,
750
+ "loss": 1.1147,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.928416485900217,
755
+ "grad_norm": 0.7887254357337952,
756
+ "learning_rate": 1.6133526533250565e-06,
757
+ "loss": 1.6007,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.9370932754880694,
762
+ "grad_norm": 0.9641562700271606,
763
+ "learning_rate": 1.2761965911958384e-06,
764
+ "loss": 1.6112,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.9457700650759219,
769
+ "grad_norm": 0.9735156893730164,
770
+ "learning_rate": 9.780676019336631e-07,
771
+ "loss": 1.7186,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.9544468546637744,
776
+ "grad_norm": 1.1226974725723267,
777
+ "learning_rate": 7.192044826145771e-07,
778
+ "loss": 1.5448,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.9631236442516269,
783
+ "grad_norm": 1.3849451541900635,
784
+ "learning_rate": 4.998145789118114e-07,
785
+ "loss": 1.4752,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.9718004338394793,
790
+ "grad_norm": 2.066307783126831,
791
+ "learning_rate": 3.2007361901485455e-07,
792
+ "loss": 1.7514,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.9804772234273319,
797
+ "grad_norm": 0.38120055198669434,
798
+ "learning_rate": 1.8012557287367392e-07,
799
+ "loss": 1.3631,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.9891540130151844,
804
+ "grad_norm": 0.4110223650932312,
805
+ "learning_rate": 8.008253688084889e-08,
806
+ "loss": 1.2402,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.9978308026030369,
811
+ "grad_norm": 0.6343638896942139,
812
+ "learning_rate": 2.0024644083921352e-08,
813
+ "loss": 1.4255,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.9978308026030369,
818
+ "eval_loss": 1.3990576267242432,
819
+ "eval_runtime": 3.4704,
820
+ "eval_samples_per_second": 27.951,
821
+ "eval_steps_per_second": 7.204,
822
+ "step": 115
823
+ },
824
+ {
825
+ "epoch": 1.0065075921908895,
826
+ "grad_norm": 2.7238411903381348,
827
+ "learning_rate": 0.0,
828
+ "loss": 1.7311,
829
+ "step": 116
830
  }
831
  ],
832
  "logging_steps": 1,
 
841
  "should_evaluate": false,
842
  "should_log": false,
843
  "should_save": true,
844
+ "should_training_stop": true
845
  },
846
  "attributes": {}
847
  }
848
  },
849
+ "total_flos": 3.329034773594112e+16,
850
  "train_batch_size": 4,
851
  "trial_name": null,
852
  "trial_params": null