ljcamargo commited on
Commit
81aa8f0
·
verified ·
1 Parent(s): b6a6861

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afcab61d0f0cd6492620de0981d9e3af3b1d7bf197c5b9a30367af1e1384d769
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e35f58cfc186debe53f8ca77f3187fcc171f64260bf63f1275d8d0b0ab69bede
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4bf418fd1d2214f3b6dd2acb610220e68b528633af9c01c8a0638ef623a8e37
3
  size 2062272049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef2fb6b56d26498118cb8b387fcedcb4debb46e3fe9c3c47660644efe86198ea
3
  size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63dea8701860badef7f13a7093d7c8f6df4c5eb7423d37c0b1df9d89c9a49eb9
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eae40f4428968ab5083d1a5e4e97daade1451ea492899254cef072ae8e7b9d7
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b04ef7af3a89dd0eb8778c7ed7d28aeab310d9f53593d47cc2bdc9458a253ac
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4c90e73b569a38f99c2197447433676c2eaa22ce221aeecf0a7d6e7d0501c17
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76df728eeb65e9565f5601b8baa2ec9eb380c004fbfe2e79296e73893ec398b4
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0d3e74929cb15c68f9b787eaa5631a6b89640ebdbca5e2e73c4cb4aa37e0203
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.34438226431338786,
6
  "eval_steps": 300,
7
- "global_step": 1200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -848,6 +848,216 @@
848
  "learning_rate": 0.000150926304647952,
849
  "loss": 0.8811,
850
  "step": 1200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
851
  }
852
  ],
853
  "logging_steps": 10,
@@ -867,7 +1077,7 @@
867
  "attributes": {}
868
  }
869
  },
870
- "total_flos": 4.9102593196032e+19,
871
  "train_batch_size": 6,
872
  "trial_name": null,
873
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4304778303917348,
6
  "eval_steps": 300,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
848
  "learning_rate": 0.000150926304647952,
849
  "loss": 0.8811,
850
  "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.3472521165159994,
854
+ "grad_norm": 6.111181259155273,
855
+ "learning_rate": 0.00015013268414012742,
856
+ "loss": 0.8297,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.350121968718611,
861
+ "grad_norm": 6.417325496673584,
862
+ "learning_rate": 0.00014933482347549303,
863
+ "loss": 0.8296,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.35299182092122255,
868
+ "grad_norm": 48.331573486328125,
869
+ "learning_rate": 0.00014853279013605957,
870
+ "loss": 0.7966,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.3558616731238341,
875
+ "grad_norm": 8.638408660888672,
876
+ "learning_rate": 0.00014772665195675718,
877
+ "loss": 0.8522,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.3587315253264457,
882
+ "grad_norm": 6.308197498321533,
883
+ "learning_rate": 0.00014691647711969803,
884
+ "loss": 0.8228,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.36160137752905724,
889
+ "grad_norm": 6.23061990737915,
890
+ "learning_rate": 0.0001461023341484094,
891
+ "loss": 0.7915,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.36447122973166884,
896
+ "grad_norm": 6.377804756164551,
897
+ "learning_rate": 0.00014528429190203824,
898
+ "loss": 0.8486,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.3673410819342804,
903
+ "grad_norm": 6.146363258361816,
904
+ "learning_rate": 0.00014446241956952714,
905
+ "loss": 0.8927,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.37021093413689193,
910
+ "grad_norm": 3.900587320327759,
911
+ "learning_rate": 0.0001436367866637622,
912
+ "loss": 0.8167,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.37308078633950353,
917
+ "grad_norm": 8.58018684387207,
918
+ "learning_rate": 0.00014280746301569407,
919
+ "loss": 0.8128,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.3759506385421151,
924
+ "grad_norm": 5.754461288452148,
925
+ "learning_rate": 0.00014197451876843138,
926
+ "loss": 0.8441,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.3788204907447266,
931
+ "grad_norm": 7.290277004241943,
932
+ "learning_rate": 0.00014113802437130845,
933
+ "loss": 0.8555,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.3816903429473382,
938
+ "grad_norm": 43.14801788330078,
939
+ "learning_rate": 0.00014029805057392655,
940
+ "loss": 0.8299,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.38456019514994977,
945
+ "grad_norm": 5.909049034118652,
946
+ "learning_rate": 0.0001394546684201701,
947
+ "loss": 0.8448,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.38743004735256137,
952
+ "grad_norm": 4.810829162597656,
953
+ "learning_rate": 0.00013860794924219782,
954
+ "loss": 0.8592,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.3902998995551729,
959
+ "grad_norm": 6.602210998535156,
960
+ "learning_rate": 0.00013775796465440956,
961
+ "loss": 0.8351,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.39316975175778446,
966
+ "grad_norm": 7.952111721038818,
967
+ "learning_rate": 0.0001369047865473893,
968
+ "loss": 0.8243,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.39603960396039606,
973
+ "grad_norm": 8.271283149719238,
974
+ "learning_rate": 0.00013604848708182466,
975
+ "loss": 0.8239,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.3989094561630076,
980
+ "grad_norm": 12.694669723510742,
981
+ "learning_rate": 0.00013518913868240372,
982
+ "loss": 0.8381,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.40177930836561915,
987
+ "grad_norm": 22.169252395629883,
988
+ "learning_rate": 0.00013432681403168932,
989
+ "loss": 0.8227,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.40464916056823075,
994
+ "grad_norm": 127.96073913574219,
995
+ "learning_rate": 0.00013346158606397182,
996
+ "loss": 0.8376,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.4075190127708423,
1001
+ "grad_norm": 12.16250991821289,
1002
+ "learning_rate": 0.0001325935279591003,
1003
+ "loss": 0.8253,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.4103888649734539,
1008
+ "grad_norm": 11.346808433532715,
1009
+ "learning_rate": 0.00013172271313629315,
1010
+ "loss": 0.8554,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.41325871717606544,
1015
+ "grad_norm": 18.371610641479492,
1016
+ "learning_rate": 0.0001308492152479283,
1017
+ "loss": 0.7743,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.416128569378677,
1022
+ "grad_norm": 17.174100875854492,
1023
+ "learning_rate": 0.00012997310817331392,
1024
+ "loss": 0.8342,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.4189984215812886,
1029
+ "grad_norm": 15.853143692016602,
1030
+ "learning_rate": 0.00012909446601243972,
1031
+ "loss": 0.8514,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.4218682737839001,
1036
+ "grad_norm": 6.734909534454346,
1037
+ "learning_rate": 0.00012821336307970965,
1038
+ "loss": 0.7947,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.42473812598651167,
1043
+ "grad_norm": 7.687751770019531,
1044
+ "learning_rate": 0.00012732987389765658,
1045
+ "loss": 0.8249,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.4276079781891233,
1050
+ "grad_norm": 4.791903972625732,
1051
+ "learning_rate": 0.00012644407319063918,
1052
+ "loss": 0.7755,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.4304778303917348,
1057
+ "grad_norm": 3.5958361625671387,
1058
+ "learning_rate": 0.0001255560358785219,
1059
+ "loss": 0.7828,
1060
+ "step": 1500
1061
  }
1062
  ],
1063
  "logging_steps": 10,
 
1077
  "attributes": {}
1078
  }
1079
  },
1080
+ "total_flos": 6.137824149504e+19,
1081
  "train_batch_size": 6,
1082
  "trial_name": null,
1083
  "trial_params": null