Training in progress, step 135000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:079464073c9724ceb804666b522429a90a4928e290e5da217f3ad8b9d68b8886
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fd285cfac8e5c0f6d1266cf8e23ce20a797130dac2828587dcc5345232fa441
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:874cf93e738f75197422ec1e62b162ef1e398b581422e23932b758446980a6af
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e1e7a01b81e1907abf43be3318a5c567fc57f95dbaef634f44d30b341186326
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -23860,11 +23860,189 @@
|
|
| 23860 |
"eval_steps_per_second": 15.134,
|
| 23861 |
"num_input_tokens_seen": 70243253472,
|
| 23862 |
"step": 134000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23863 |
}
|
| 23864 |
],
|
| 23865 |
"logging_steps": 50,
|
| 23866 |
"max_steps": 140000,
|
| 23867 |
-
"num_input_tokens_seen":
|
| 23868 |
"num_train_epochs": 2,
|
| 23869 |
"save_steps": 1000,
|
| 23870 |
"stateful_callbacks": {
|
|
@@ -23879,7 +24057,7 @@
|
|
| 23879 |
"attributes": {}
|
| 23880 |
}
|
| 23881 |
},
|
| 23882 |
-
"total_flos": 1.
|
| 23883 |
"train_batch_size": 32,
|
| 23884 |
"trial_name": null,
|
| 23885 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.2879091787209178,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 135000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 23860 |
"eval_steps_per_second": 15.134,
|
| 23861 |
"num_input_tokens_seen": 70243253472,
|
| 23862 |
"step": 134000
|
| 23863 |
+
},
|
| 23864 |
+
{
|
| 23865 |
+
"epoch": 1.2788461309133405,
|
| 23866 |
+
"grad_norm": 0.12490282952785492,
|
| 23867 |
+
"learning_rate": 0.00010734153455962764,
|
| 23868 |
+
"loss": 2.0308,
|
| 23869 |
+
"num_input_tokens_seen": 70269466208,
|
| 23870 |
+
"step": 134050
|
| 23871 |
+
},
|
| 23872 |
+
{
|
| 23873 |
+
"epoch": 1.279323133429529,
|
| 23874 |
+
"grad_norm": 0.12396061420440674,
|
| 23875 |
+
"learning_rate": 0.00010561116804955451,
|
| 23876 |
+
"loss": 2.036,
|
| 23877 |
+
"num_input_tokens_seen": 70295676096,
|
| 23878 |
+
"step": 134100
|
| 23879 |
+
},
|
| 23880 |
+
{
|
| 23881 |
+
"epoch": 1.2798001359457172,
|
| 23882 |
+
"grad_norm": 0.12122515588998795,
|
| 23883 |
+
"learning_rate": 0.00010389321369363636,
|
| 23884 |
+
"loss": 2.0424,
|
| 23885 |
+
"num_input_tokens_seen": 70321882272,
|
| 23886 |
+
"step": 134150
|
| 23887 |
+
},
|
| 23888 |
+
{
|
| 23889 |
+
"epoch": 1.2802771384619054,
|
| 23890 |
+
"grad_norm": 0.12559206783771515,
|
| 23891 |
+
"learning_rate": 0.00010218772555910954,
|
| 23892 |
+
"loss": 2.0456,
|
| 23893 |
+
"num_input_tokens_seen": 70348095808,
|
| 23894 |
+
"step": 134200
|
| 23895 |
+
},
|
| 23896 |
+
{
|
| 23897 |
+
"epoch": 1.2807541409780936,
|
| 23898 |
+
"grad_norm": 0.11915505677461624,
|
| 23899 |
+
"learning_rate": 0.0001004947573208756,
|
| 23900 |
+
"loss": 2.0412,
|
| 23901 |
+
"num_input_tokens_seen": 70374304800,
|
| 23902 |
+
"step": 134250
|
| 23903 |
+
},
|
| 23904 |
+
{
|
| 23905 |
+
"epoch": 1.2812311434942818,
|
| 23906 |
+
"grad_norm": 0.12196268141269684,
|
| 23907 |
+
"learning_rate": 9.881436225981105e-05,
|
| 23908 |
+
"loss": 2.0386,
|
| 23909 |
+
"num_input_tokens_seen": 70400510976,
|
| 23910 |
+
"step": 134300
|
| 23911 |
+
},
|
| 23912 |
+
{
|
| 23913 |
+
"epoch": 1.2817081460104702,
|
| 23914 |
+
"grad_norm": 0.12415535002946854,
|
| 23915 |
+
"learning_rate": 9.714659326109137e-05,
|
| 23916 |
+
"loss": 2.0448,
|
| 23917 |
+
"num_input_tokens_seen": 70426725376,
|
| 23918 |
+
"step": 134350
|
| 23919 |
+
},
|
| 23920 |
+
{
|
| 23921 |
+
"epoch": 1.2821851485266584,
|
| 23922 |
+
"grad_norm": 0.12361661344766617,
|
| 23923 |
+
"learning_rate": 9.549150281252633e-05,
|
| 23924 |
+
"loss": 2.0371,
|
| 23925 |
+
"num_input_tokens_seen": 70452929792,
|
| 23926 |
+
"step": 134400
|
| 23927 |
+
},
|
| 23928 |
+
{
|
| 23929 |
+
"epoch": 1.2826621510428469,
|
| 23930 |
+
"grad_norm": 0.12377167493104935,
|
| 23931 |
+
"learning_rate": 9.384914300290748e-05,
|
| 23932 |
+
"loss": 2.0344,
|
| 23933 |
+
"num_input_tokens_seen": 70479144192,
|
| 23934 |
+
"step": 134450
|
| 23935 |
+
},
|
| 23936 |
+
{
|
| 23937 |
+
"epoch": 1.283139153559035,
|
| 23938 |
+
"grad_norm": 0.11863281577825546,
|
| 23939 |
+
"learning_rate": 9.221956552036992e-05,
|
| 23940 |
+
"loss": 2.0393,
|
| 23941 |
+
"num_input_tokens_seen": 70505353504,
|
| 23942 |
+
"step": 134500
|
| 23943 |
+
},
|
| 23944 |
+
{
|
| 23945 |
+
"epoch": 1.283139153559035,
|
| 23946 |
+
"eval_loss": 1.9545812606811523,
|
| 23947 |
+
"eval_runtime": 82.3767,
|
| 23948 |
+
"eval_samples_per_second": 60.697,
|
| 23949 |
+
"eval_steps_per_second": 15.174,
|
| 23950 |
+
"num_input_tokens_seen": 70505353504,
|
| 23951 |
+
"step": 134500
|
| 23952 |
+
},
|
| 23953 |
+
{
|
| 23954 |
+
"epoch": 1.2836161560752233,
|
| 23955 |
+
"grad_norm": 0.12550202012062073,
|
| 23956 |
+
"learning_rate": 9.060282165076461e-05,
|
| 23957 |
+
"loss": 2.0483,
|
| 23958 |
+
"num_input_tokens_seen": 70531564640,
|
| 23959 |
+
"step": 134550
|
| 23960 |
+
},
|
| 23961 |
+
{
|
| 23962 |
+
"epoch": 1.2840931585914115,
|
| 23963 |
+
"grad_norm": 0.12165137380361557,
|
| 23964 |
+
"learning_rate": 8.899896227604509e-05,
|
| 23965 |
+
"loss": 2.034,
|
| 23966 |
+
"num_input_tokens_seen": 70557777824,
|
| 23967 |
+
"step": 134600
|
| 23968 |
+
},
|
| 23969 |
+
{
|
| 23970 |
+
"epoch": 1.2845701611076,
|
| 23971 |
+
"grad_norm": 0.12417840212583542,
|
| 23972 |
+
"learning_rate": 8.740803787266521e-05,
|
| 23973 |
+
"loss": 2.0381,
|
| 23974 |
+
"num_input_tokens_seen": 70583987456,
|
| 23975 |
+
"step": 134650
|
| 23976 |
+
},
|
| 23977 |
+
{
|
| 23978 |
+
"epoch": 1.2850471636237881,
|
| 23979 |
+
"grad_norm": 0.12609820067882538,
|
| 23980 |
+
"learning_rate": 8.58300985099918e-05,
|
| 23981 |
+
"loss": 2.0369,
|
| 23982 |
+
"num_input_tokens_seen": 70610189152,
|
| 23983 |
+
"step": 134700
|
| 23984 |
+
},
|
| 23985 |
+
{
|
| 23986 |
+
"epoch": 1.2855241661399763,
|
| 23987 |
+
"grad_norm": 0.1163376122713089,
|
| 23988 |
+
"learning_rate": 8.426519384872733e-05,
|
| 23989 |
+
"loss": 2.0236,
|
| 23990 |
+
"num_input_tokens_seen": 70636401088,
|
| 23991 |
+
"step": 134750
|
| 23992 |
+
},
|
| 23993 |
+
{
|
| 23994 |
+
"epoch": 1.2860011686561648,
|
| 23995 |
+
"grad_norm": 0.11958843469619751,
|
| 23996 |
+
"learning_rate": 8.271337313934868e-05,
|
| 23997 |
+
"loss": 2.0465,
|
| 23998 |
+
"num_input_tokens_seen": 70662608672,
|
| 23999 |
+
"step": 134800
|
| 24000 |
+
},
|
| 24001 |
+
{
|
| 24002 |
+
"epoch": 1.286478171172353,
|
| 24003 |
+
"grad_norm": 0.12234240025281906,
|
| 24004 |
+
"learning_rate": 8.117468522055577e-05,
|
| 24005 |
+
"loss": 2.0384,
|
| 24006 |
+
"num_input_tokens_seen": 70688820640,
|
| 24007 |
+
"step": 134850
|
| 24008 |
+
},
|
| 24009 |
+
{
|
| 24010 |
+
"epoch": 1.2869551736885412,
|
| 24011 |
+
"grad_norm": 0.11501733213663101,
|
| 24012 |
+
"learning_rate": 7.964917851773496e-05,
|
| 24013 |
+
"loss": 2.0343,
|
| 24014 |
+
"num_input_tokens_seen": 70715035040,
|
| 24015 |
+
"step": 134900
|
| 24016 |
+
},
|
| 24017 |
+
{
|
| 24018 |
+
"epoch": 1.2874321762047294,
|
| 24019 |
+
"grad_norm": 0.12062328308820724,
|
| 24020 |
+
"learning_rate": 7.813690104143555e-05,
|
| 24021 |
+
"loss": 2.0211,
|
| 24022 |
+
"num_input_tokens_seen": 70741249088,
|
| 24023 |
+
"step": 134950
|
| 24024 |
+
},
|
| 24025 |
+
{
|
| 24026 |
+
"epoch": 1.2879091787209178,
|
| 24027 |
+
"grad_norm": 0.11405592411756516,
|
| 24028 |
+
"learning_rate": 7.663790038585794e-05,
|
| 24029 |
+
"loss": 2.0401,
|
| 24030 |
+
"num_input_tokens_seen": 70767457344,
|
| 24031 |
+
"step": 135000
|
| 24032 |
+
},
|
| 24033 |
+
{
|
| 24034 |
+
"epoch": 1.2879091787209178,
|
| 24035 |
+
"eval_loss": 1.9541493654251099,
|
| 24036 |
+
"eval_runtime": 82.5619,
|
| 24037 |
+
"eval_samples_per_second": 60.561,
|
| 24038 |
+
"eval_steps_per_second": 15.14,
|
| 24039 |
+
"num_input_tokens_seen": 70767457344,
|
| 24040 |
+
"step": 135000
|
| 24041 |
}
|
| 24042 |
],
|
| 24043 |
"logging_steps": 50,
|
| 24044 |
"max_steps": 140000,
|
| 24045 |
+
"num_input_tokens_seen": 70767457344,
|
| 24046 |
"num_train_epochs": 2,
|
| 24047 |
"save_steps": 1000,
|
| 24048 |
"stateful_callbacks": {
|
|
|
|
| 24057 |
"attributes": {}
|
| 24058 |
}
|
| 24059 |
},
|
| 24060 |
+
"total_flos": 1.2524549151466045e+20,
|
| 24061 |
"train_batch_size": 32,
|
| 24062 |
"trial_name": null,
|
| 24063 |
"trial_params": null
|