Wilsonwin commited on
Commit
fadf1a1
·
verified ·
1 Parent(s): a27f045

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6a6ed44cd2dfba870cb534237ba4896f3e7dc134dd8f5e7b12dba7ffa27c335
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13b39b25712b1700516628197082779e670c991b5446245f1b02d4d7584d5995
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3d65452f71865cff75ce7bd9061bd7c195d9f7790eb08651bef46b28c8cf5db
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fa04a29b27b1343d5fb5458eddeb0052332c0d610a5e2af9e8f8706e9e6b91a
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a141ddada80b12146ad2875b480471ca4604a84a507446df6ce95668765adaf4
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abce81d7290a22f9b260f2e004a835c5fd7f98ca8d48012d38a32b582885319d
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:641712abd35039f810da46b5ecace55e8c31f5b5a7d2cfa0aaa8182597f8aad6
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2219b103874c49a564cb9902ed8bfe290939ff6276f6750739e5f7ca5ec6aba7
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.16894745734076702,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -731,6 +731,364 @@
731
  "eval_samples_per_second": 272.845,
732
  "eval_steps_per_second": 5.73,
733
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
  }
735
  ],
736
  "logging_steps": 10,
@@ -750,7 +1108,7 @@
750
  "attributes": {}
751
  }
752
  },
753
- "total_flos": 3.3445682085888e+16,
754
  "train_batch_size": 48,
755
  "trial_name": null,
756
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.25342118601115055,
6
  "eval_steps": 500,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
731
  "eval_samples_per_second": 272.845,
732
  "eval_steps_per_second": 5.73,
733
  "step": 1000
734
+ },
735
+ {
736
+ "epoch": 0.1706369319141747,
737
+ "grad_norm": 0.9963025450706482,
738
+ "learning_rate": 0.00015134999999999997,
739
+ "loss": 6.613154602050781,
740
+ "step": 1010
741
+ },
742
+ {
743
+ "epoch": 0.17232640648758235,
744
+ "grad_norm": 0.872097909450531,
745
+ "learning_rate": 0.00015284999999999997,
746
+ "loss": 6.613529968261719,
747
+ "step": 1020
748
+ },
749
+ {
750
+ "epoch": 0.17401588106099003,
751
+ "grad_norm": 1.2607650756835938,
752
+ "learning_rate": 0.00015434999999999998,
753
+ "loss": 6.587220001220703,
754
+ "step": 1030
755
+ },
756
+ {
757
+ "epoch": 0.1757053556343977,
758
+ "grad_norm": 1.0194809436798096,
759
+ "learning_rate": 0.00015584999999999997,
760
+ "loss": 6.585498046875,
761
+ "step": 1040
762
+ },
763
+ {
764
+ "epoch": 0.17739483020780536,
765
+ "grad_norm": 0.9153720736503601,
766
+ "learning_rate": 0.00015734999999999998,
767
+ "loss": 6.5845489501953125,
768
+ "step": 1050
769
+ },
770
+ {
771
+ "epoch": 0.17908430478121304,
772
+ "grad_norm": 1.1903005838394165,
773
+ "learning_rate": 0.00015884999999999999,
774
+ "loss": 6.566903686523437,
775
+ "step": 1060
776
+ },
777
+ {
778
+ "epoch": 0.18077377935462072,
779
+ "grad_norm": 0.9262056350708008,
780
+ "learning_rate": 0.00016034999999999997,
781
+ "loss": 6.520059204101562,
782
+ "step": 1070
783
+ },
784
+ {
785
+ "epoch": 0.1824632539280284,
786
+ "grad_norm": 1.0881860256195068,
787
+ "learning_rate": 0.00016184999999999998,
788
+ "loss": 6.543362426757812,
789
+ "step": 1080
790
+ },
791
+ {
792
+ "epoch": 0.18415272850143605,
793
+ "grad_norm": 0.9753679633140564,
794
+ "learning_rate": 0.00016334999999999999,
795
+ "loss": 6.528910064697266,
796
+ "step": 1090
797
+ },
798
+ {
799
+ "epoch": 0.18584220307484373,
800
+ "grad_norm": 1.2809370756149292,
801
+ "learning_rate": 0.00016485,
802
+ "loss": 6.49705810546875,
803
+ "step": 1100
804
+ },
805
+ {
806
+ "epoch": 0.1875316776482514,
807
+ "grad_norm": 1.0647395849227905,
808
+ "learning_rate": 0.00016634999999999998,
809
+ "loss": 6.508152008056641,
810
+ "step": 1110
811
+ },
812
+ {
813
+ "epoch": 0.18922115222165906,
814
+ "grad_norm": 0.9427017569541931,
815
+ "learning_rate": 0.00016785,
816
+ "loss": 6.492857360839844,
817
+ "step": 1120
818
+ },
819
+ {
820
+ "epoch": 0.19091062679506673,
821
+ "grad_norm": 1.1307021379470825,
822
+ "learning_rate": 0.00016935,
823
+ "loss": 6.474656677246093,
824
+ "step": 1130
825
+ },
826
+ {
827
+ "epoch": 0.1926001013684744,
828
+ "grad_norm": 1.182411789894104,
829
+ "learning_rate": 0.00017084999999999998,
830
+ "loss": 6.457868194580078,
831
+ "step": 1140
832
+ },
833
+ {
834
+ "epoch": 0.19428957594188206,
835
+ "grad_norm": 1.1442158222198486,
836
+ "learning_rate": 0.00017235,
837
+ "loss": 6.443910217285156,
838
+ "step": 1150
839
+ },
840
+ {
841
+ "epoch": 0.19597905051528974,
842
+ "grad_norm": 1.2637932300567627,
843
+ "learning_rate": 0.00017385,
844
+ "loss": 6.428031158447266,
845
+ "step": 1160
846
+ },
847
+ {
848
+ "epoch": 0.19766852508869742,
849
+ "grad_norm": 1.334306001663208,
850
+ "learning_rate": 0.00017534999999999998,
851
+ "loss": 6.415740966796875,
852
+ "step": 1170
853
+ },
854
+ {
855
+ "epoch": 0.19935799966210507,
856
+ "grad_norm": 0.882560670375824,
857
+ "learning_rate": 0.00017685,
858
+ "loss": 6.413926696777343,
859
+ "step": 1180
860
+ },
861
+ {
862
+ "epoch": 0.20104747423551275,
863
+ "grad_norm": 0.9657256603240967,
864
+ "learning_rate": 0.00017835,
865
+ "loss": 6.425054931640625,
866
+ "step": 1190
867
+ },
868
+ {
869
+ "epoch": 0.20273694880892043,
870
+ "grad_norm": 1.0196014642715454,
871
+ "learning_rate": 0.00017984999999999998,
872
+ "loss": 6.391595077514649,
873
+ "step": 1200
874
+ },
875
+ {
876
+ "epoch": 0.2044264233823281,
877
+ "grad_norm": 1.297837257385254,
878
+ "learning_rate": 0.00018135,
879
+ "loss": 6.382472991943359,
880
+ "step": 1210
881
+ },
882
+ {
883
+ "epoch": 0.20611589795573576,
884
+ "grad_norm": 1.1288139820098877,
885
+ "learning_rate": 0.00018285,
886
+ "loss": 6.358099746704101,
887
+ "step": 1220
888
+ },
889
+ {
890
+ "epoch": 0.20780537252914344,
891
+ "grad_norm": 0.9396995306015015,
892
+ "learning_rate": 0.00018435,
893
+ "loss": 6.355449676513672,
894
+ "step": 1230
895
+ },
896
+ {
897
+ "epoch": 0.20949484710255112,
898
+ "grad_norm": 1.1936787366867065,
899
+ "learning_rate": 0.00018585,
900
+ "loss": 6.356659698486328,
901
+ "step": 1240
902
+ },
903
+ {
904
+ "epoch": 0.21118432167595877,
905
+ "grad_norm": 0.9550564289093018,
906
+ "learning_rate": 0.00018735,
907
+ "loss": 6.337493515014648,
908
+ "step": 1250
909
+ },
910
+ {
911
+ "epoch": 0.21287379624936645,
912
+ "grad_norm": 1.2012646198272705,
913
+ "learning_rate": 0.00018884999999999996,
914
+ "loss": 6.317781829833985,
915
+ "step": 1260
916
+ },
917
+ {
918
+ "epoch": 0.21456327082277413,
919
+ "grad_norm": 1.0816755294799805,
920
+ "learning_rate": 0.00019034999999999996,
921
+ "loss": 6.316750335693359,
922
+ "step": 1270
923
+ },
924
+ {
925
+ "epoch": 0.21625274539618178,
926
+ "grad_norm": 1.3777987957000732,
927
+ "learning_rate": 0.00019184999999999997,
928
+ "loss": 6.3194934844970705,
929
+ "step": 1280
930
+ },
931
+ {
932
+ "epoch": 0.21794221996958946,
933
+ "grad_norm": 1.187603235244751,
934
+ "learning_rate": 0.00019334999999999998,
935
+ "loss": 6.30432357788086,
936
+ "step": 1290
937
+ },
938
+ {
939
+ "epoch": 0.21963169454299714,
940
+ "grad_norm": 1.0069150924682617,
941
+ "learning_rate": 0.00019484999999999997,
942
+ "loss": 6.2757713317871096,
943
+ "step": 1300
944
+ },
945
+ {
946
+ "epoch": 0.2213211691164048,
947
+ "grad_norm": 1.2410210371017456,
948
+ "learning_rate": 0.00019634999999999998,
949
+ "loss": 6.2698211669921875,
950
+ "step": 1310
951
+ },
952
+ {
953
+ "epoch": 0.22301064368981247,
954
+ "grad_norm": 1.1892989873886108,
955
+ "learning_rate": 0.00019784999999999998,
956
+ "loss": 6.2431591033935545,
957
+ "step": 1320
958
+ },
959
+ {
960
+ "epoch": 0.22470011826322014,
961
+ "grad_norm": 1.1054743528366089,
962
+ "learning_rate": 0.00019934999999999997,
963
+ "loss": 6.26300163269043,
964
+ "step": 1330
965
+ },
966
+ {
967
+ "epoch": 0.2263895928366278,
968
+ "grad_norm": 1.145757794380188,
969
+ "learning_rate": 0.00020084999999999998,
970
+ "loss": 6.226350021362305,
971
+ "step": 1340
972
+ },
973
+ {
974
+ "epoch": 0.22807906741003547,
975
+ "grad_norm": 1.0067166090011597,
976
+ "learning_rate": 0.00020234999999999999,
977
+ "loss": 6.2175750732421875,
978
+ "step": 1350
979
+ },
980
+ {
981
+ "epoch": 0.22976854198344315,
982
+ "grad_norm": 1.5041327476501465,
983
+ "learning_rate": 0.00020384999999999997,
984
+ "loss": 6.191579055786133,
985
+ "step": 1360
986
+ },
987
+ {
988
+ "epoch": 0.23145801655685083,
989
+ "grad_norm": 1.2780109643936157,
990
+ "learning_rate": 0.00020534999999999998,
991
+ "loss": 6.204021835327149,
992
+ "step": 1370
993
+ },
994
+ {
995
+ "epoch": 0.23314749113025848,
996
+ "grad_norm": 1.1531580686569214,
997
+ "learning_rate": 0.00020684999999999999,
998
+ "loss": 6.191404342651367,
999
+ "step": 1380
1000
+ },
1001
+ {
1002
+ "epoch": 0.23483696570366616,
1003
+ "grad_norm": 1.056857705116272,
1004
+ "learning_rate": 0.00020835,
1005
+ "loss": 6.17081298828125,
1006
+ "step": 1390
1007
+ },
1008
+ {
1009
+ "epoch": 0.23652644027707384,
1010
+ "grad_norm": 1.1238850355148315,
1011
+ "learning_rate": 0.00020984999999999998,
1012
+ "loss": 6.153195190429687,
1013
+ "step": 1400
1014
+ },
1015
+ {
1016
+ "epoch": 0.2382159148504815,
1017
+ "grad_norm": 1.2115790843963623,
1018
+ "learning_rate": 0.00021135,
1019
+ "loss": 6.157797622680664,
1020
+ "step": 1410
1021
+ },
1022
+ {
1023
+ "epoch": 0.23990538942388917,
1024
+ "grad_norm": 1.1303883790969849,
1025
+ "learning_rate": 0.00021285,
1026
+ "loss": 6.119416809082031,
1027
+ "step": 1420
1028
+ },
1029
+ {
1030
+ "epoch": 0.24159486399729685,
1031
+ "grad_norm": 1.2523441314697266,
1032
+ "learning_rate": 0.00021434999999999998,
1033
+ "loss": 6.133832550048828,
1034
+ "step": 1430
1035
+ },
1036
+ {
1037
+ "epoch": 0.2432843385707045,
1038
+ "grad_norm": 1.1120916604995728,
1039
+ "learning_rate": 0.00021585,
1040
+ "loss": 6.122848129272461,
1041
+ "step": 1440
1042
+ },
1043
+ {
1044
+ "epoch": 0.24497381314411218,
1045
+ "grad_norm": 1.239675521850586,
1046
+ "learning_rate": 0.00021735,
1047
+ "loss": 6.106191253662109,
1048
+ "step": 1450
1049
+ },
1050
+ {
1051
+ "epoch": 0.24666328771751986,
1052
+ "grad_norm": 1.1382733583450317,
1053
+ "learning_rate": 0.00021884999999999998,
1054
+ "loss": 6.0912620544433596,
1055
+ "step": 1460
1056
+ },
1057
+ {
1058
+ "epoch": 0.2483527622909275,
1059
+ "grad_norm": 1.3199714422225952,
1060
+ "learning_rate": 0.00022035,
1061
+ "loss": 6.09831428527832,
1062
+ "step": 1470
1063
+ },
1064
+ {
1065
+ "epoch": 0.2500422368643352,
1066
+ "grad_norm": 1.2705349922180176,
1067
+ "learning_rate": 0.00022185,
1068
+ "loss": 6.078111267089843,
1069
+ "step": 1480
1070
+ },
1071
+ {
1072
+ "epoch": 0.25173171143774287,
1073
+ "grad_norm": 1.436306357383728,
1074
+ "learning_rate": 0.00022335,
1075
+ "loss": 6.058963012695313,
1076
+ "step": 1490
1077
+ },
1078
+ {
1079
+ "epoch": 0.25342118601115055,
1080
+ "grad_norm": 1.179898977279663,
1081
+ "learning_rate": 0.00022485,
1082
+ "loss": 6.029299545288086,
1083
+ "step": 1500
1084
+ },
1085
+ {
1086
+ "epoch": 0.25342118601115055,
1087
+ "eval_loss": 6.033608436584473,
1088
+ "eval_runtime": 3.6064,
1089
+ "eval_samples_per_second": 277.282,
1090
+ "eval_steps_per_second": 5.823,
1091
+ "step": 1500
1092
  }
1093
  ],
1094
  "logging_steps": 10,
 
1108
  "attributes": {}
1109
  }
1110
  },
1111
+ "total_flos": 5.0168523128832e+16,
1112
  "train_batch_size": 48,
1113
  "trial_name": null,
1114
  "trial_params": null