Wilsonwin commited on
Commit
98a1005
·
verified ·
1 Parent(s): cb1f5fc

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6e3a61755ce1ee753f0e8d104d120e50755452bd8cbf6e05ff0a62e9faa8d42
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88dd721178de7cb9a4bacf024fc5633ddcfb9a0c3e0c623a628ad9477d487830
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91744d22093227f73ff956561d21826bda5cf09b77b7d63e333247291b5a2ff3
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aaf7eff4e98edf3615725ee84d901bec88b6934e6cc793a70cccc1ba139f1b1
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:577f0b7cee8114520712167e366c7fd1da45e6cc96d21bb243dccf2193c0d60f
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5c11efa1814b5739819e47e5bc390045b533d07baee31a6d67f2f2c2f772d60
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:641712abd35039f810da46b5ecace55e8c31f5b5a7d2cfa0aaa8182597f8aad6
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2219b103874c49a564cb9902ed8bfe290939ff6276f6750739e5f7ca5ec6aba7
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.16894745734076702,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -731,6 +731,364 @@
731
  "eval_samples_per_second": 272.774,
732
  "eval_steps_per_second": 5.728,
733
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
  }
735
  ],
736
  "logging_steps": 10,
@@ -750,7 +1108,7 @@
750
  "attributes": {}
751
  }
752
  },
753
- "total_flos": 3.3445682085888e+16,
754
  "train_batch_size": 48,
755
  "trial_name": null,
756
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.25342118601115055,
6
  "eval_steps": 500,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
731
  "eval_samples_per_second": 272.774,
732
  "eval_steps_per_second": 5.728,
733
  "step": 1000
734
+ },
735
+ {
736
+ "epoch": 0.1706369319141747,
737
+ "grad_norm": 1.1271182298660278,
738
+ "learning_rate": 0.00015134999999999997,
739
+ "loss": 6.613565063476562,
740
+ "step": 1010
741
+ },
742
+ {
743
+ "epoch": 0.17232640648758235,
744
+ "grad_norm": 0.986314058303833,
745
+ "learning_rate": 0.00015284999999999997,
746
+ "loss": 6.617231750488282,
747
+ "step": 1020
748
+ },
749
+ {
750
+ "epoch": 0.17401588106099003,
751
+ "grad_norm": 0.9544497132301331,
752
+ "learning_rate": 0.00015434999999999998,
753
+ "loss": 6.588863372802734,
754
+ "step": 1030
755
+ },
756
+ {
757
+ "epoch": 0.1757053556343977,
758
+ "grad_norm": 0.9558138251304626,
759
+ "learning_rate": 0.00015584999999999997,
760
+ "loss": 6.589711761474609,
761
+ "step": 1040
762
+ },
763
+ {
764
+ "epoch": 0.17739483020780536,
765
+ "grad_norm": 0.7995399236679077,
766
+ "learning_rate": 0.00015734999999999998,
767
+ "loss": 6.585386657714844,
768
+ "step": 1050
769
+ },
770
+ {
771
+ "epoch": 0.17908430478121304,
772
+ "grad_norm": 1.052297830581665,
773
+ "learning_rate": 0.00015884999999999999,
774
+ "loss": 6.567500305175781,
775
+ "step": 1060
776
+ },
777
+ {
778
+ "epoch": 0.18077377935462072,
779
+ "grad_norm": 1.0039197206497192,
780
+ "learning_rate": 0.00016034999999999997,
781
+ "loss": 6.520351409912109,
782
+ "step": 1070
783
+ },
784
+ {
785
+ "epoch": 0.1824632539280284,
786
+ "grad_norm": 1.1218979358673096,
787
+ "learning_rate": 0.00016184999999999998,
788
+ "loss": 6.544094848632812,
789
+ "step": 1080
790
+ },
791
+ {
792
+ "epoch": 0.18415272850143605,
793
+ "grad_norm": 0.9787916541099548,
794
+ "learning_rate": 0.00016334999999999999,
795
+ "loss": 6.525911712646485,
796
+ "step": 1090
797
+ },
798
+ {
799
+ "epoch": 0.18584220307484373,
800
+ "grad_norm": 1.0371917486190796,
801
+ "learning_rate": 0.00016485,
802
+ "loss": 6.497917175292969,
803
+ "step": 1100
804
+ },
805
+ {
806
+ "epoch": 0.1875316776482514,
807
+ "grad_norm": 0.9600743055343628,
808
+ "learning_rate": 0.00016634999999999998,
809
+ "loss": 6.508014678955078,
810
+ "step": 1110
811
+ },
812
+ {
813
+ "epoch": 0.18922115222165906,
814
+ "grad_norm": 1.0070927143096924,
815
+ "learning_rate": 0.00016785,
816
+ "loss": 6.495113372802734,
817
+ "step": 1120
818
+ },
819
+ {
820
+ "epoch": 0.19091062679506673,
821
+ "grad_norm": 1.0297925472259521,
822
+ "learning_rate": 0.00016935,
823
+ "loss": 6.476868438720703,
824
+ "step": 1130
825
+ },
826
+ {
827
+ "epoch": 0.1926001013684744,
828
+ "grad_norm": 1.2697237730026245,
829
+ "learning_rate": 0.00017084999999999998,
830
+ "loss": 6.457928466796875,
831
+ "step": 1140
832
+ },
833
+ {
834
+ "epoch": 0.19428957594188206,
835
+ "grad_norm": 1.0294984579086304,
836
+ "learning_rate": 0.00017235,
837
+ "loss": 6.44539794921875,
838
+ "step": 1150
839
+ },
840
+ {
841
+ "epoch": 0.19597905051528974,
842
+ "grad_norm": 0.9561747908592224,
843
+ "learning_rate": 0.00017385,
844
+ "loss": 6.428233337402344,
845
+ "step": 1160
846
+ },
847
+ {
848
+ "epoch": 0.19766852508869742,
849
+ "grad_norm": 1.0781433582305908,
850
+ "learning_rate": 0.00017534999999999998,
851
+ "loss": 6.419010925292969,
852
+ "step": 1170
853
+ },
854
+ {
855
+ "epoch": 0.19935799966210507,
856
+ "grad_norm": 1.1035155057907104,
857
+ "learning_rate": 0.00017685,
858
+ "loss": 6.415547180175781,
859
+ "step": 1180
860
+ },
861
+ {
862
+ "epoch": 0.20104747423551275,
863
+ "grad_norm": 1.3168368339538574,
864
+ "learning_rate": 0.00017835,
865
+ "loss": 6.426744842529297,
866
+ "step": 1190
867
+ },
868
+ {
869
+ "epoch": 0.20273694880892043,
870
+ "grad_norm": 1.0347086191177368,
871
+ "learning_rate": 0.00017984999999999998,
872
+ "loss": 6.390957641601562,
873
+ "step": 1200
874
+ },
875
+ {
876
+ "epoch": 0.2044264233823281,
877
+ "grad_norm": 1.1279572248458862,
878
+ "learning_rate": 0.00018135,
879
+ "loss": 6.382788848876953,
880
+ "step": 1210
881
+ },
882
+ {
883
+ "epoch": 0.20611589795573576,
884
+ "grad_norm": 1.072667121887207,
885
+ "learning_rate": 0.00018285,
886
+ "loss": 6.360079956054688,
887
+ "step": 1220
888
+ },
889
+ {
890
+ "epoch": 0.20780537252914344,
891
+ "grad_norm": 0.9861840605735779,
892
+ "learning_rate": 0.00018435,
893
+ "loss": 6.357261657714844,
894
+ "step": 1230
895
+ },
896
+ {
897
+ "epoch": 0.20949484710255112,
898
+ "grad_norm": 1.1492271423339844,
899
+ "learning_rate": 0.00018585,
900
+ "loss": 6.357462310791016,
901
+ "step": 1240
902
+ },
903
+ {
904
+ "epoch": 0.21118432167595877,
905
+ "grad_norm": 1.0800551176071167,
906
+ "learning_rate": 0.00018735,
907
+ "loss": 6.335888290405274,
908
+ "step": 1250
909
+ },
910
+ {
911
+ "epoch": 0.21287379624936645,
912
+ "grad_norm": 1.2155574560165405,
913
+ "learning_rate": 0.00018884999999999996,
914
+ "loss": 6.315219497680664,
915
+ "step": 1260
916
+ },
917
+ {
918
+ "epoch": 0.21456327082277413,
919
+ "grad_norm": 1.1489802598953247,
920
+ "learning_rate": 0.00019034999999999996,
921
+ "loss": 6.315352249145508,
922
+ "step": 1270
923
+ },
924
+ {
925
+ "epoch": 0.21625274539618178,
926
+ "grad_norm": 0.8910436034202576,
927
+ "learning_rate": 0.00019184999999999997,
928
+ "loss": 6.315536117553711,
929
+ "step": 1280
930
+ },
931
+ {
932
+ "epoch": 0.21794221996958946,
933
+ "grad_norm": 1.3812509775161743,
934
+ "learning_rate": 0.00019334999999999998,
935
+ "loss": 6.30749282836914,
936
+ "step": 1290
937
+ },
938
+ {
939
+ "epoch": 0.21963169454299714,
940
+ "grad_norm": 1.0909061431884766,
941
+ "learning_rate": 0.00019484999999999997,
942
+ "loss": 6.276274108886719,
943
+ "step": 1300
944
+ },
945
+ {
946
+ "epoch": 0.2213211691164048,
947
+ "grad_norm": 1.1523410081863403,
948
+ "learning_rate": 0.00019634999999999998,
949
+ "loss": 6.271069717407227,
950
+ "step": 1310
951
+ },
952
+ {
953
+ "epoch": 0.22301064368981247,
954
+ "grad_norm": 1.1942309141159058,
955
+ "learning_rate": 0.00019784999999999998,
956
+ "loss": 6.243044662475586,
957
+ "step": 1320
958
+ },
959
+ {
960
+ "epoch": 0.22470011826322014,
961
+ "grad_norm": 1.0811495780944824,
962
+ "learning_rate": 0.00019934999999999997,
963
+ "loss": 6.263803100585937,
964
+ "step": 1330
965
+ },
966
+ {
967
+ "epoch": 0.2263895928366278,
968
+ "grad_norm": 0.9704897999763489,
969
+ "learning_rate": 0.00020084999999999998,
970
+ "loss": 6.227846145629883,
971
+ "step": 1340
972
+ },
973
+ {
974
+ "epoch": 0.22807906741003547,
975
+ "grad_norm": 1.1559011936187744,
976
+ "learning_rate": 0.00020234999999999999,
977
+ "loss": 6.219324111938477,
978
+ "step": 1350
979
+ },
980
+ {
981
+ "epoch": 0.22976854198344315,
982
+ "grad_norm": 1.4220818281173706,
983
+ "learning_rate": 0.00020384999999999997,
984
+ "loss": 6.193642807006836,
985
+ "step": 1360
986
+ },
987
+ {
988
+ "epoch": 0.23145801655685083,
989
+ "grad_norm": 1.257460594177246,
990
+ "learning_rate": 0.00020534999999999998,
991
+ "loss": 6.204745483398438,
992
+ "step": 1370
993
+ },
994
+ {
995
+ "epoch": 0.23314749113025848,
996
+ "grad_norm": 0.998426079750061,
997
+ "learning_rate": 0.00020684999999999999,
998
+ "loss": 6.19123649597168,
999
+ "step": 1380
1000
+ },
1001
+ {
1002
+ "epoch": 0.23483696570366616,
1003
+ "grad_norm": 1.0262788534164429,
1004
+ "learning_rate": 0.00020835,
1005
+ "loss": 6.168861389160156,
1006
+ "step": 1390
1007
+ },
1008
+ {
1009
+ "epoch": 0.23652644027707384,
1010
+ "grad_norm": 1.2289084196090698,
1011
+ "learning_rate": 0.00020984999999999998,
1012
+ "loss": 6.1512306213378904,
1013
+ "step": 1400
1014
+ },
1015
+ {
1016
+ "epoch": 0.2382159148504815,
1017
+ "grad_norm": 0.981163740158081,
1018
+ "learning_rate": 0.00021135,
1019
+ "loss": 6.15844497680664,
1020
+ "step": 1410
1021
+ },
1022
+ {
1023
+ "epoch": 0.23990538942388917,
1024
+ "grad_norm": 1.2685691118240356,
1025
+ "learning_rate": 0.00021285,
1026
+ "loss": 6.120241165161133,
1027
+ "step": 1420
1028
+ },
1029
+ {
1030
+ "epoch": 0.24159486399729685,
1031
+ "grad_norm": 1.2658777236938477,
1032
+ "learning_rate": 0.00021434999999999998,
1033
+ "loss": 6.137707901000977,
1034
+ "step": 1430
1035
+ },
1036
+ {
1037
+ "epoch": 0.2432843385707045,
1038
+ "grad_norm": 1.0119268894195557,
1039
+ "learning_rate": 0.00021585,
1040
+ "loss": 6.122516250610351,
1041
+ "step": 1440
1042
+ },
1043
+ {
1044
+ "epoch": 0.24497381314411218,
1045
+ "grad_norm": 1.2874401807785034,
1046
+ "learning_rate": 0.00021735,
1047
+ "loss": 6.107418441772461,
1048
+ "step": 1450
1049
+ },
1050
+ {
1051
+ "epoch": 0.24666328771751986,
1052
+ "grad_norm": 1.0223300457000732,
1053
+ "learning_rate": 0.00021884999999999998,
1054
+ "loss": 6.101365280151367,
1055
+ "step": 1460
1056
+ },
1057
+ {
1058
+ "epoch": 0.2483527622909275,
1059
+ "grad_norm": 1.1164225339889526,
1060
+ "learning_rate": 0.00022035,
1061
+ "loss": 6.099144744873047,
1062
+ "step": 1470
1063
+ },
1064
+ {
1065
+ "epoch": 0.2500422368643352,
1066
+ "grad_norm": 1.2341437339782715,
1067
+ "learning_rate": 0.00022185,
1068
+ "loss": 6.07642822265625,
1069
+ "step": 1480
1070
+ },
1071
+ {
1072
+ "epoch": 0.25173171143774287,
1073
+ "grad_norm": 1.0508023500442505,
1074
+ "learning_rate": 0.00022335,
1075
+ "loss": 6.062232208251953,
1076
+ "step": 1490
1077
+ },
1078
+ {
1079
+ "epoch": 0.25342118601115055,
1080
+ "grad_norm": 1.7231544256210327,
1081
+ "learning_rate": 0.00022485,
1082
+ "loss": 6.033694458007813,
1083
+ "step": 1500
1084
+ },
1085
+ {
1086
+ "epoch": 0.25342118601115055,
1087
+ "eval_loss": 6.021761894226074,
1088
+ "eval_runtime": 3.6574,
1089
+ "eval_samples_per_second": 273.42,
1090
+ "eval_steps_per_second": 5.742,
1091
+ "step": 1500
1092
  }
1093
  ],
1094
  "logging_steps": 10,
 
1108
  "attributes": {}
1109
  }
1110
  },
1111
+ "total_flos": 5.0168523128832e+16,
1112
  "train_batch_size": 48,
1113
  "trial_name": null,
1114
  "trial_params": null