Training in progress, step 1500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3237829088
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e35f58cfc186debe53f8ca77f3187fcc171f64260bf63f1275d8d0b0ab69bede
|
| 3 |
size 3237829088
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2062272049
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef2fb6b56d26498118cb8b387fcedcb4debb46e3fe9c3c47660644efe86198ea
|
| 3 |
size 2062272049
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6eae40f4428968ab5083d1a5e4e97daade1451ea492899254cef072ae8e7b9d7
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4c90e73b569a38f99c2197447433676c2eaa22ce221aeecf0a7d6e7d0501c17
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e0d3e74929cb15c68f9b787eaa5631a6b89640ebdbca5e2e73c4cb4aa37e0203
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 300,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -848,6 +848,216 @@
|
|
| 848 |
"learning_rate": 0.000150926304647952,
|
| 849 |
"loss": 0.8811,
|
| 850 |
"step": 1200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 851 |
}
|
| 852 |
],
|
| 853 |
"logging_steps": 10,
|
|
@@ -867,7 +1077,7 @@
|
|
| 867 |
"attributes": {}
|
| 868 |
}
|
| 869 |
},
|
| 870 |
-
"total_flos":
|
| 871 |
"train_batch_size": 6,
|
| 872 |
"trial_name": null,
|
| 873 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4304778303917348,
|
| 6 |
"eval_steps": 300,
|
| 7 |
+
"global_step": 1500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 848 |
"learning_rate": 0.000150926304647952,
|
| 849 |
"loss": 0.8811,
|
| 850 |
"step": 1200
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"epoch": 0.3472521165159994,
|
| 854 |
+
"grad_norm": 6.111181259155273,
|
| 855 |
+
"learning_rate": 0.00015013268414012742,
|
| 856 |
+
"loss": 0.8297,
|
| 857 |
+
"step": 1210
|
| 858 |
+
},
|
| 859 |
+
{
|
| 860 |
+
"epoch": 0.350121968718611,
|
| 861 |
+
"grad_norm": 6.417325496673584,
|
| 862 |
+
"learning_rate": 0.00014933482347549303,
|
| 863 |
+
"loss": 0.8296,
|
| 864 |
+
"step": 1220
|
| 865 |
+
},
|
| 866 |
+
{
|
| 867 |
+
"epoch": 0.35299182092122255,
|
| 868 |
+
"grad_norm": 48.331573486328125,
|
| 869 |
+
"learning_rate": 0.00014853279013605957,
|
| 870 |
+
"loss": 0.7966,
|
| 871 |
+
"step": 1230
|
| 872 |
+
},
|
| 873 |
+
{
|
| 874 |
+
"epoch": 0.3558616731238341,
|
| 875 |
+
"grad_norm": 8.638408660888672,
|
| 876 |
+
"learning_rate": 0.00014772665195675718,
|
| 877 |
+
"loss": 0.8522,
|
| 878 |
+
"step": 1240
|
| 879 |
+
},
|
| 880 |
+
{
|
| 881 |
+
"epoch": 0.3587315253264457,
|
| 882 |
+
"grad_norm": 6.308197498321533,
|
| 883 |
+
"learning_rate": 0.00014691647711969803,
|
| 884 |
+
"loss": 0.8228,
|
| 885 |
+
"step": 1250
|
| 886 |
+
},
|
| 887 |
+
{
|
| 888 |
+
"epoch": 0.36160137752905724,
|
| 889 |
+
"grad_norm": 6.23061990737915,
|
| 890 |
+
"learning_rate": 0.0001461023341484094,
|
| 891 |
+
"loss": 0.7915,
|
| 892 |
+
"step": 1260
|
| 893 |
+
},
|
| 894 |
+
{
|
| 895 |
+
"epoch": 0.36447122973166884,
|
| 896 |
+
"grad_norm": 6.377804756164551,
|
| 897 |
+
"learning_rate": 0.00014528429190203824,
|
| 898 |
+
"loss": 0.8486,
|
| 899 |
+
"step": 1270
|
| 900 |
+
},
|
| 901 |
+
{
|
| 902 |
+
"epoch": 0.3673410819342804,
|
| 903 |
+
"grad_norm": 6.146363258361816,
|
| 904 |
+
"learning_rate": 0.00014446241956952714,
|
| 905 |
+
"loss": 0.8927,
|
| 906 |
+
"step": 1280
|
| 907 |
+
},
|
| 908 |
+
{
|
| 909 |
+
"epoch": 0.37021093413689193,
|
| 910 |
+
"grad_norm": 3.900587320327759,
|
| 911 |
+
"learning_rate": 0.0001436367866637622,
|
| 912 |
+
"loss": 0.8167,
|
| 913 |
+
"step": 1290
|
| 914 |
+
},
|
| 915 |
+
{
|
| 916 |
+
"epoch": 0.37308078633950353,
|
| 917 |
+
"grad_norm": 8.58018684387207,
|
| 918 |
+
"learning_rate": 0.00014280746301569407,
|
| 919 |
+
"loss": 0.8128,
|
| 920 |
+
"step": 1300
|
| 921 |
+
},
|
| 922 |
+
{
|
| 923 |
+
"epoch": 0.3759506385421151,
|
| 924 |
+
"grad_norm": 5.754461288452148,
|
| 925 |
+
"learning_rate": 0.00014197451876843138,
|
| 926 |
+
"loss": 0.8441,
|
| 927 |
+
"step": 1310
|
| 928 |
+
},
|
| 929 |
+
{
|
| 930 |
+
"epoch": 0.3788204907447266,
|
| 931 |
+
"grad_norm": 7.290277004241943,
|
| 932 |
+
"learning_rate": 0.00014113802437130845,
|
| 933 |
+
"loss": 0.8555,
|
| 934 |
+
"step": 1320
|
| 935 |
+
},
|
| 936 |
+
{
|
| 937 |
+
"epoch": 0.3816903429473382,
|
| 938 |
+
"grad_norm": 43.14801788330078,
|
| 939 |
+
"learning_rate": 0.00014029805057392655,
|
| 940 |
+
"loss": 0.8299,
|
| 941 |
+
"step": 1330
|
| 942 |
+
},
|
| 943 |
+
{
|
| 944 |
+
"epoch": 0.38456019514994977,
|
| 945 |
+
"grad_norm": 5.909049034118652,
|
| 946 |
+
"learning_rate": 0.0001394546684201701,
|
| 947 |
+
"loss": 0.8448,
|
| 948 |
+
"step": 1340
|
| 949 |
+
},
|
| 950 |
+
{
|
| 951 |
+
"epoch": 0.38743004735256137,
|
| 952 |
+
"grad_norm": 4.810829162597656,
|
| 953 |
+
"learning_rate": 0.00013860794924219782,
|
| 954 |
+
"loss": 0.8592,
|
| 955 |
+
"step": 1350
|
| 956 |
+
},
|
| 957 |
+
{
|
| 958 |
+
"epoch": 0.3902998995551729,
|
| 959 |
+
"grad_norm": 6.602210998535156,
|
| 960 |
+
"learning_rate": 0.00013775796465440956,
|
| 961 |
+
"loss": 0.8351,
|
| 962 |
+
"step": 1360
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"epoch": 0.39316975175778446,
|
| 966 |
+
"grad_norm": 7.952111721038818,
|
| 967 |
+
"learning_rate": 0.0001369047865473893,
|
| 968 |
+
"loss": 0.8243,
|
| 969 |
+
"step": 1370
|
| 970 |
+
},
|
| 971 |
+
{
|
| 972 |
+
"epoch": 0.39603960396039606,
|
| 973 |
+
"grad_norm": 8.271283149719238,
|
| 974 |
+
"learning_rate": 0.00013604848708182466,
|
| 975 |
+
"loss": 0.8239,
|
| 976 |
+
"step": 1380
|
| 977 |
+
},
|
| 978 |
+
{
|
| 979 |
+
"epoch": 0.3989094561630076,
|
| 980 |
+
"grad_norm": 12.694669723510742,
|
| 981 |
+
"learning_rate": 0.00013518913868240372,
|
| 982 |
+
"loss": 0.8381,
|
| 983 |
+
"step": 1390
|
| 984 |
+
},
|
| 985 |
+
{
|
| 986 |
+
"epoch": 0.40177930836561915,
|
| 987 |
+
"grad_norm": 22.169252395629883,
|
| 988 |
+
"learning_rate": 0.00013432681403168932,
|
| 989 |
+
"loss": 0.8227,
|
| 990 |
+
"step": 1400
|
| 991 |
+
},
|
| 992 |
+
{
|
| 993 |
+
"epoch": 0.40464916056823075,
|
| 994 |
+
"grad_norm": 127.96073913574219,
|
| 995 |
+
"learning_rate": 0.00013346158606397182,
|
| 996 |
+
"loss": 0.8376,
|
| 997 |
+
"step": 1410
|
| 998 |
+
},
|
| 999 |
+
{
|
| 1000 |
+
"epoch": 0.4075190127708423,
|
| 1001 |
+
"grad_norm": 12.16250991821289,
|
| 1002 |
+
"learning_rate": 0.0001325935279591003,
|
| 1003 |
+
"loss": 0.8253,
|
| 1004 |
+
"step": 1420
|
| 1005 |
+
},
|
| 1006 |
+
{
|
| 1007 |
+
"epoch": 0.4103888649734539,
|
| 1008 |
+
"grad_norm": 11.346808433532715,
|
| 1009 |
+
"learning_rate": 0.00013172271313629315,
|
| 1010 |
+
"loss": 0.8554,
|
| 1011 |
+
"step": 1430
|
| 1012 |
+
},
|
| 1013 |
+
{
|
| 1014 |
+
"epoch": 0.41325871717606544,
|
| 1015 |
+
"grad_norm": 18.371610641479492,
|
| 1016 |
+
"learning_rate": 0.0001308492152479283,
|
| 1017 |
+
"loss": 0.7743,
|
| 1018 |
+
"step": 1440
|
| 1019 |
+
},
|
| 1020 |
+
{
|
| 1021 |
+
"epoch": 0.416128569378677,
|
| 1022 |
+
"grad_norm": 17.174100875854492,
|
| 1023 |
+
"learning_rate": 0.00012997310817331392,
|
| 1024 |
+
"loss": 0.8342,
|
| 1025 |
+
"step": 1450
|
| 1026 |
+
},
|
| 1027 |
+
{
|
| 1028 |
+
"epoch": 0.4189984215812886,
|
| 1029 |
+
"grad_norm": 15.853143692016602,
|
| 1030 |
+
"learning_rate": 0.00012909446601243972,
|
| 1031 |
+
"loss": 0.8514,
|
| 1032 |
+
"step": 1460
|
| 1033 |
+
},
|
| 1034 |
+
{
|
| 1035 |
+
"epoch": 0.4218682737839001,
|
| 1036 |
+
"grad_norm": 6.734909534454346,
|
| 1037 |
+
"learning_rate": 0.00012821336307970965,
|
| 1038 |
+
"loss": 0.7947,
|
| 1039 |
+
"step": 1470
|
| 1040 |
+
},
|
| 1041 |
+
{
|
| 1042 |
+
"epoch": 0.42473812598651167,
|
| 1043 |
+
"grad_norm": 7.687751770019531,
|
| 1044 |
+
"learning_rate": 0.00012732987389765658,
|
| 1045 |
+
"loss": 0.8249,
|
| 1046 |
+
"step": 1480
|
| 1047 |
+
},
|
| 1048 |
+
{
|
| 1049 |
+
"epoch": 0.4276079781891233,
|
| 1050 |
+
"grad_norm": 4.791903972625732,
|
| 1051 |
+
"learning_rate": 0.00012644407319063918,
|
| 1052 |
+
"loss": 0.7755,
|
| 1053 |
+
"step": 1490
|
| 1054 |
+
},
|
| 1055 |
+
{
|
| 1056 |
+
"epoch": 0.4304778303917348,
|
| 1057 |
+
"grad_norm": 3.5958361625671387,
|
| 1058 |
+
"learning_rate": 0.0001255560358785219,
|
| 1059 |
+
"loss": 0.7828,
|
| 1060 |
+
"step": 1500
|
| 1061 |
}
|
| 1062 |
],
|
| 1063 |
"logging_steps": 10,
|
|
|
|
| 1077 |
"attributes": {}
|
| 1078 |
}
|
| 1079 |
},
|
| 1080 |
+
"total_flos": 6.137824149504e+19,
|
| 1081 |
"train_batch_size": 6,
|
| 1082 |
"trial_name": null,
|
| 1083 |
"trial_params": null
|