ljcamargo commited on
Commit
c858ad1
·
verified ·
1 Parent(s): 84f045c

Training in progress, step 1200, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1935637205cb627b948fe1329a80486b1da1feb7f14f8a0e15acab010a97b90c
3
  size 2558403928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ad9704bc5747fc17b3c1496ffdc9c4a82ee0c2ceb16f4e948b6593950765fc1
3
  size 2558403928
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:575872657bd8f5c69c8e9a049519a8b9a0d9795ca6890003a302f811f9d4108a
3
  size 1313638993
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ce1ad6cf71b73814b98c18ffc3d1dbf6b9d7b64f05e65823d86e41a0a2a0f4
3
  size 1313638993
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41d7cb8df90bbc1a1f334913d48d210d3a9a45cf39cb2aba7ed6759fa8b44c3a
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68e939733619667823ff09361a70b450356b35690c073061e24545321b21c4b0
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a0b4230f34cfc1b81dc2c15ef8d265bdd348193f5a746ca2018df11549c7ac0
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18783150ac09b6b81cea5af47876a10bfe5f36c3d76aca4ffce5382bdfaf7b28
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46dbc8a28dada13dfcd70ea962672a500c66aa01dc461c5d292f261a3ca3d0fc
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2c4ff631d77bc2fe5cad879e6c434ab3b6d8a7e0b9cce252cee47e42bdf838a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.72,
6
  "eval_steps": 500,
7
- "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -645,6 +645,216 @@
645
  "learning_rate": 0.0001504157768367901,
646
  "loss": 6.6899,
647
  "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  }
649
  ],
650
  "logging_steps": 10,
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.96,
6
  "eval_steps": 500,
7
+ "global_step": 1200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
645
  "learning_rate": 0.0001504157768367901,
646
  "loss": 6.6899,
647
  "step": 900
648
+ },
649
+ {
650
+ "epoch": 0.728,
651
+ "grad_norm": 5.094054222106934,
652
+ "learning_rate": 0.00014927912345112616,
653
+ "loss": 6.2008,
654
+ "step": 910
655
+ },
656
+ {
657
+ "epoch": 0.736,
658
+ "grad_norm": 6.107059955596924,
659
+ "learning_rate": 0.0001481339981254846,
660
+ "loss": 6.2149,
661
+ "step": 920
662
+ },
663
+ {
664
+ "epoch": 0.744,
665
+ "grad_norm": 6.355636119842529,
666
+ "learning_rate": 0.00014698059772686202,
667
+ "loss": 6.7521,
668
+ "step": 930
669
+ },
670
+ {
671
+ "epoch": 0.752,
672
+ "grad_norm": 7.230486869812012,
673
+ "learning_rate": 0.00014581912054488413,
674
+ "loss": 6.3823,
675
+ "step": 940
676
+ },
677
+ {
678
+ "epoch": 0.76,
679
+ "grad_norm": 4.960805416107178,
680
+ "learning_rate": 0.00014464976625771654,
681
+ "loss": 6.6149,
682
+ "step": 950
683
+ },
684
+ {
685
+ "epoch": 0.768,
686
+ "grad_norm": 5.269943714141846,
687
+ "learning_rate": 0.00014347273589773637,
688
+ "loss": 6.1001,
689
+ "step": 960
690
+ },
691
+ {
692
+ "epoch": 0.776,
693
+ "grad_norm": 6.840855598449707,
694
+ "learning_rate": 0.0001422882318169716,
695
+ "loss": 6.3677,
696
+ "step": 970
697
+ },
698
+ {
699
+ "epoch": 0.784,
700
+ "grad_norm": 5.1594038009643555,
701
+ "learning_rate": 0.00014109645765231278,
702
+ "loss": 6.3929,
703
+ "step": 980
704
+ },
705
+ {
706
+ "epoch": 0.792,
707
+ "grad_norm": 5.096086025238037,
708
+ "learning_rate": 0.00013989761829050475,
709
+ "loss": 6.1354,
710
+ "step": 990
711
+ },
712
+ {
713
+ "epoch": 0.8,
714
+ "grad_norm": 5.235525608062744,
715
+ "learning_rate": 0.00013869191983292283,
716
+ "loss": 6.4954,
717
+ "step": 1000
718
+ },
719
+ {
720
+ "epoch": 0.808,
721
+ "grad_norm": 5.518918991088867,
722
+ "learning_rate": 0.00013747956956014037,
723
+ "loss": 6.449,
724
+ "step": 1010
725
+ },
726
+ {
727
+ "epoch": 0.816,
728
+ "grad_norm": 4.848990440368652,
729
+ "learning_rate": 0.00013626077589629367,
730
+ "loss": 6.392,
731
+ "step": 1020
732
+ },
733
+ {
734
+ "epoch": 0.824,
735
+ "grad_norm": 7.234468460083008,
736
+ "learning_rate": 0.00013503574837325015,
737
+ "loss": 6.5465,
738
+ "step": 1030
739
+ },
740
+ {
741
+ "epoch": 0.832,
742
+ "grad_norm": 6.593731880187988,
743
+ "learning_rate": 0.00013380469759458643,
744
+ "loss": 6.574,
745
+ "step": 1040
746
+ },
747
+ {
748
+ "epoch": 0.84,
749
+ "grad_norm": 5.687368392944336,
750
+ "learning_rate": 0.00013256783519938154,
751
+ "loss": 6.1995,
752
+ "step": 1050
753
+ },
754
+ {
755
+ "epoch": 0.848,
756
+ "grad_norm": 4.857635498046875,
757
+ "learning_rate": 0.00013132537382583274,
758
+ "loss": 5.8422,
759
+ "step": 1060
760
+ },
761
+ {
762
+ "epoch": 0.856,
763
+ "grad_norm": 7.068734645843506,
764
+ "learning_rate": 0.00013007752707469924,
765
+ "loss": 6.0601,
766
+ "step": 1070
767
+ },
768
+ {
769
+ "epoch": 0.864,
770
+ "grad_norm": 4.396754741668701,
771
+ "learning_rate": 0.00012882450947258045,
772
+ "loss": 5.8387,
773
+ "step": 1080
774
+ },
775
+ {
776
+ "epoch": 0.872,
777
+ "grad_norm": 9.501909255981445,
778
+ "learning_rate": 0.0001275665364350352,
779
+ "loss": 5.9831,
780
+ "step": 1090
781
+ },
782
+ {
783
+ "epoch": 0.88,
784
+ "grad_norm": 6.957056522369385,
785
+ "learning_rate": 0.00012630382422954795,
786
+ "loss": 6.1359,
787
+ "step": 1100
788
+ },
789
+ {
790
+ "epoch": 0.888,
791
+ "grad_norm": 5.782343864440918,
792
+ "learning_rate": 0.00012503658993834885,
793
+ "loss": 6.0754,
794
+ "step": 1110
795
+ },
796
+ {
797
+ "epoch": 0.896,
798
+ "grad_norm": 5.452831268310547,
799
+ "learning_rate": 0.0001237650514210932,
800
+ "loss": 5.6186,
801
+ "step": 1120
802
+ },
803
+ {
804
+ "epoch": 0.904,
805
+ "grad_norm": 6.382038593292236,
806
+ "learning_rate": 0.00012248942727740783,
807
+ "loss": 5.7174,
808
+ "step": 1130
809
+ },
810
+ {
811
+ "epoch": 0.912,
812
+ "grad_norm": 6.288851261138916,
813
+ "learning_rate": 0.00012120993680931003,
814
+ "loss": 5.6529,
815
+ "step": 1140
816
+ },
817
+ {
818
+ "epoch": 0.92,
819
+ "grad_norm": 6.7387166023254395,
820
+ "learning_rate": 0.0001199267999835055,
821
+ "loss": 5.603,
822
+ "step": 1150
823
+ },
824
+ {
825
+ "epoch": 0.928,
826
+ "grad_norm": 5.694065093994141,
827
+ "learning_rate": 0.00011864023739357235,
828
+ "loss": 5.2627,
829
+ "step": 1160
830
+ },
831
+ {
832
+ "epoch": 0.936,
833
+ "grad_norm": 6.711731910705566,
834
+ "learning_rate": 0.00011735047022203741,
835
+ "loss": 5.4706,
836
+ "step": 1170
837
+ },
838
+ {
839
+ "epoch": 0.944,
840
+ "grad_norm": 5.517411708831787,
841
+ "learning_rate": 0.00011605772020235072,
842
+ "loss": 5.6277,
843
+ "step": 1180
844
+ },
845
+ {
846
+ "epoch": 0.952,
847
+ "grad_norm": 6.785055160522461,
848
+ "learning_rate": 0.00011476220958076607,
849
+ "loss": 5.9611,
850
+ "step": 1190
851
+ },
852
+ {
853
+ "epoch": 0.96,
854
+ "grad_norm": 5.702793121337891,
855
+ "learning_rate": 0.00011346416107813267,
856
+ "loss": 5.9226,
857
+ "step": 1200
858
  }
859
  ],
860
  "logging_steps": 10,