ljcamargo commited on
Commit
b8e5164
·
verified ·
1 Parent(s): ab6644a

Training in progress, step 1200, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea44be5f29e63d43296d9d83bd74000d9eec25472608a721883a3def330d0d51
3
  size 3237818848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f18f2b21d1eb9893ef7d432745ca210cc86cd300d6d237450504c29478453fb
3
  size 3237818848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcfc52b46b2bcbd19bdeae44612f8466c1fd2dddd02666025d9a6d924a564419
3
  size 2062251569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffd49387501c08473c006cb3983fe8e3572862f34ccc79a00ee2957719d3508e
3
  size 2062251569
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60c8632974dc900245d4dfbbcf87a13b532e38345500a34dea8a1b480b697112
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a602fcddae5166b23f64a1263af24cb60ac56e25cf7aa91c125f6b46213120d
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:504b7bc543b9e5f039f6559d07b099507a66c15c86836ff5981e4eee51792c02
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b04ef7af3a89dd0eb8778c7ed7d28aeab310d9f53593d47cc2bdc9458a253ac
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a838d3ba3633bb04603e3afbc02ea3103b4064d4c633a0639c7ced656d5b0c92
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:370edc1f7812cd81a8eae6fcade42c3407f4dcaf97659f9602f84f2549a0a41c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4,
6
  "eval_steps": 300,
7
- "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -650,6 +650,216 @@
650
  "learning_rate": 0.00013498887007137918,
651
  "loss": 1.0813,
652
  "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653
  }
654
  ],
655
  "logging_steps": 10,
@@ -669,7 +879,7 @@
669
  "attributes": {}
670
  }
671
  },
672
- "total_flos": 2.4551296598016e+19,
673
  "train_batch_size": 4,
674
  "trial_name": null,
675
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5333333333333333,
6
  "eval_steps": 300,
7
+ "global_step": 1200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
650
  "learning_rate": 0.00013498887007137918,
651
  "loss": 1.0813,
652
  "step": 900
653
+ },
654
+ {
655
+ "epoch": 0.40444444444444444,
656
+ "grad_norm": 7.365835189819336,
657
+ "learning_rate": 0.0001336512708132819,
658
+ "loss": 1.0193,
659
+ "step": 910
660
+ },
661
+ {
662
+ "epoch": 0.4088888888888889,
663
+ "grad_norm": 11.938828468322754,
664
+ "learning_rate": 0.00013230684686628744,
665
+ "loss": 1.1339,
666
+ "step": 920
667
+ },
668
+ {
669
+ "epoch": 0.41333333333333333,
670
+ "grad_norm": 12.521608352661133,
671
+ "learning_rate": 0.00013095587088800902,
672
+ "loss": 1.0743,
673
+ "step": 930
674
+ },
675
+ {
676
+ "epoch": 0.4177777777777778,
677
+ "grad_norm": 9.13135051727295,
678
+ "learning_rate": 0.00012959861686485304,
679
+ "loss": 1.0734,
680
+ "step": 940
681
+ },
682
+ {
683
+ "epoch": 0.4222222222222222,
684
+ "grad_norm": 16.696514129638672,
685
+ "learning_rate": 0.0001282353600564527,
686
+ "loss": 1.1145,
687
+ "step": 950
688
+ },
689
+ {
690
+ "epoch": 0.4266666666666667,
691
+ "grad_norm": 12.382914543151855,
692
+ "learning_rate": 0.00012686637693984384,
693
+ "loss": 0.9964,
694
+ "step": 960
695
+ },
696
+ {
697
+ "epoch": 0.4311111111111111,
698
+ "grad_norm": 10.711663246154785,
699
+ "learning_rate": 0.00012549194515339344,
700
+ "loss": 1.0572,
701
+ "step": 970
702
+ },
703
+ {
704
+ "epoch": 0.43555555555555553,
705
+ "grad_norm": 13.973264694213867,
706
+ "learning_rate": 0.00012411234344049293,
707
+ "loss": 1.0616,
708
+ "step": 980
709
+ },
710
+ {
711
+ "epoch": 0.44,
712
+ "grad_norm": 10.161416053771973,
713
+ "learning_rate": 0.0001227278515930273,
714
+ "loss": 1.0561,
715
+ "step": 990
716
+ },
717
+ {
718
+ "epoch": 0.4444444444444444,
719
+ "grad_norm": 27.74120330810547,
720
+ "learning_rate": 0.00012133875039463148,
721
+ "loss": 1.1011,
722
+ "step": 1000
723
+ },
724
+ {
725
+ "epoch": 0.4488888888888889,
726
+ "grad_norm": 9.277678489685059,
727
+ "learning_rate": 0.00011994532156374574,
728
+ "loss": 1.0957,
729
+ "step": 1010
730
+ },
731
+ {
732
+ "epoch": 0.4533333333333333,
733
+ "grad_norm": 9.599855422973633,
734
+ "learning_rate": 0.00011854784769648137,
735
+ "loss": 1.0394,
736
+ "step": 1020
737
+ },
738
+ {
739
+ "epoch": 0.4577777777777778,
740
+ "grad_norm": 9.483017921447754,
741
+ "learning_rate": 0.00011714661220930833,
742
+ "loss": 0.9773,
743
+ "step": 1030
744
+ },
745
+ {
746
+ "epoch": 0.4622222222222222,
747
+ "grad_norm": 11.467011451721191,
748
+ "learning_rate": 0.00011574189928157689,
749
+ "loss": 1.0346,
750
+ "step": 1040
751
+ },
752
+ {
753
+ "epoch": 0.4666666666666667,
754
+ "grad_norm": 8.953259468078613,
755
+ "learning_rate": 0.00011433399379788387,
756
+ "loss": 1.0622,
757
+ "step": 1050
758
+ },
759
+ {
760
+ "epoch": 0.4711111111111111,
761
+ "grad_norm": 6.459799289703369,
762
+ "learning_rate": 0.00011292318129029665,
763
+ "loss": 0.9814,
764
+ "step": 1060
765
+ },
766
+ {
767
+ "epoch": 0.47555555555555556,
768
+ "grad_norm": 8.728630065917969,
769
+ "learning_rate": 0.00011150974788044521,
770
+ "loss": 1.0526,
771
+ "step": 1070
772
+ },
773
+ {
774
+ "epoch": 0.48,
775
+ "grad_norm": 7.729814529418945,
776
+ "learning_rate": 0.00011009398022149495,
777
+ "loss": 0.9997,
778
+ "step": 1080
779
+ },
780
+ {
781
+ "epoch": 0.48444444444444446,
782
+ "grad_norm": 9.42880916595459,
783
+ "learning_rate": 0.00010867616544001164,
784
+ "loss": 0.9999,
785
+ "step": 1090
786
+ },
787
+ {
788
+ "epoch": 0.4888888888888889,
789
+ "grad_norm": 8.457280158996582,
790
+ "learning_rate": 0.00010725659107773045,
791
+ "loss": 1.0464,
792
+ "step": 1100
793
+ },
794
+ {
795
+ "epoch": 0.49333333333333335,
796
+ "grad_norm": 8.715860366821289,
797
+ "learning_rate": 0.00010583554503324044,
798
+ "loss": 1.0088,
799
+ "step": 1110
800
+ },
801
+ {
802
+ "epoch": 0.49777777777777776,
803
+ "grad_norm": 6.529873847961426,
804
+ "learning_rate": 0.00010441331550359712,
805
+ "loss": 1.0749,
806
+ "step": 1120
807
+ },
808
+ {
809
+ "epoch": 0.5022222222222222,
810
+ "grad_norm": 7.668039798736572,
811
+ "learning_rate": 0.0001029901909258742,
812
+ "loss": 1.026,
813
+ "step": 1130
814
+ },
815
+ {
816
+ "epoch": 0.5066666666666667,
817
+ "grad_norm": 8.81876277923584,
818
+ "learning_rate": 0.00010156645991866677,
819
+ "loss": 1.0293,
820
+ "step": 1140
821
+ },
822
+ {
823
+ "epoch": 0.5111111111111111,
824
+ "grad_norm": 8.55112075805664,
825
+ "learning_rate": 0.00010014241122355762,
826
+ "loss": 1.0282,
827
+ "step": 1150
828
+ },
829
+ {
830
+ "epoch": 0.5155555555555555,
831
+ "grad_norm": 9.484146118164062,
832
+ "learning_rate": 9.871833364655865e-05,
833
+ "loss": 0.9964,
834
+ "step": 1160
835
+ },
836
+ {
837
+ "epoch": 0.52,
838
+ "grad_norm": 10.939757347106934,
839
+ "learning_rate": 9.729451599953917e-05,
840
+ "loss": 1.0519,
841
+ "step": 1170
842
+ },
843
+ {
844
+ "epoch": 0.5244444444444445,
845
+ "grad_norm": 9.229081153869629,
846
+ "learning_rate": 9.587124704165302e-05,
847
+ "loss": 1.0511,
848
+ "step": 1180
849
+ },
850
+ {
851
+ "epoch": 0.5288888888888889,
852
+ "grad_norm": 12.294286727905273,
853
+ "learning_rate": 9.44488154207766e-05,
854
+ "loss": 1.0302,
855
+ "step": 1190
856
+ },
857
+ {
858
+ "epoch": 0.5333333333333333,
859
+ "grad_norm": 11.527563095092773,
860
+ "learning_rate": 9.302750961496888e-05,
861
+ "loss": 1.0333,
862
+ "step": 1200
863
  }
864
  ],
865
  "logging_steps": 10,
 
879
  "attributes": {}
880
  }
881
  },
882
+ "total_flos": 3.2735062130688e+19,
883
  "train_batch_size": 4,
884
  "trial_name": null,
885
  "trial_params": null