Training in progress, step 57000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c12b0497c316584eab0a6471e97deaea6b6c97411924d2517f029fde79d3b1c2
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3e51e859ffdf4b3059a027d7764e0788d882ec9bf060bed69c183a774f7373cd
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b80a94302b027aba469e721f259f7cea336e0f08145beaf0eef00eec23f3459c
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25aca1947c52853a475b5e869ec5722620ca13248105b9ec208f0e66ff7cf239
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9976,11 +9976,189 @@
|
|
| 9976 |
"eval_steps_per_second": 23.416,
|
| 9977 |
"num_input_tokens_seen": 14680059456,
|
| 9978 |
"step": 56000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9979 |
}
|
| 9980 |
],
|
| 9981 |
"logging_steps": 50,
|
| 9982 |
"max_steps": 70000,
|
| 9983 |
-
"num_input_tokens_seen":
|
| 9984 |
"num_train_epochs": 1,
|
| 9985 |
"save_steps": 1000,
|
| 9986 |
"stateful_callbacks": {
|
|
@@ -9995,7 +10173,7 @@
|
|
| 9995 |
"attributes": {}
|
| 9996 |
}
|
| 9997 |
},
|
| 9998 |
-
"total_flos": 3.
|
| 9999 |
"train_batch_size": 64,
|
| 10000 |
"trial_name": null,
|
| 10001 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.27189143422731554,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 57000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9976 |
"eval_steps_per_second": 23.416,
|
| 9977 |
"num_input_tokens_seen": 14680059456,
|
| 9978 |
"step": 56000
|
| 9979 |
+
},
|
| 9980 |
+
{
|
| 9981 |
+
"epoch": 0.26735991032352696,
|
| 9982 |
+
"grad_norm": 0.22615984082221985,
|
| 9983 |
+
"learning_rate": 0.0009999685283773503,
|
| 9984 |
+
"loss": 2.5961,
|
| 9985 |
+
"num_input_tokens_seen": 14693166656,
|
| 9986 |
+
"step": 56050
|
| 9987 |
+
},
|
| 9988 |
+
{
|
| 9989 |
+
"epoch": 0.2675984115816211,
|
| 9990 |
+
"grad_norm": 0.2738794982433319,
|
| 9991 |
+
"learning_rate": 0.0009998741174712534,
|
| 9992 |
+
"loss": 2.612,
|
| 9993 |
+
"num_input_tokens_seen": 14706273856,
|
| 9994 |
+
"step": 56100
|
| 9995 |
+
},
|
| 9996 |
+
{
|
| 9997 |
+
"epoch": 0.2678369128397152,
|
| 9998 |
+
"grad_norm": 0.23470066487789154,
|
| 9999 |
+
"learning_rate": 0.0009997167791667668,
|
| 10000 |
+
"loss": 2.6071,
|
| 10001 |
+
"num_input_tokens_seen": 14719381056,
|
| 10002 |
+
"step": 56150
|
| 10003 |
+
},
|
| 10004 |
+
{
|
| 10005 |
+
"epoch": 0.2680754140978094,
|
| 10006 |
+
"grad_norm": 0.23558543622493744,
|
| 10007 |
+
"learning_rate": 0.0009994965332706573,
|
| 10008 |
+
"loss": 2.5956,
|
| 10009 |
+
"num_input_tokens_seen": 14732488256,
|
| 10010 |
+
"step": 56200
|
| 10011 |
+
},
|
| 10012 |
+
{
|
| 10013 |
+
"epoch": 0.2683139153559035,
|
| 10014 |
+
"grad_norm": 0.2274416983127594,
|
| 10015 |
+
"learning_rate": 0.0009992134075089082,
|
| 10016 |
+
"loss": 2.5873,
|
| 10017 |
+
"num_input_tokens_seen": 14745595456,
|
| 10018 |
+
"step": 56250
|
| 10019 |
+
},
|
| 10020 |
+
{
|
| 10021 |
+
"epoch": 0.26855241661399765,
|
| 10022 |
+
"grad_norm": 0.21609161794185638,
|
| 10023 |
+
"learning_rate": 0.000998867437523228,
|
| 10024 |
+
"loss": 2.6043,
|
| 10025 |
+
"num_input_tokens_seen": 14758702656,
|
| 10026 |
+
"step": 56300
|
| 10027 |
+
},
|
| 10028 |
+
{
|
| 10029 |
+
"epoch": 0.26879091787209175,
|
| 10030 |
+
"grad_norm": 0.2368565797805786,
|
| 10031 |
+
"learning_rate": 0.000998458666866564,
|
| 10032 |
+
"loss": 2.5952,
|
| 10033 |
+
"num_input_tokens_seen": 14771809856,
|
| 10034 |
+
"step": 56350
|
| 10035 |
+
},
|
| 10036 |
+
{
|
| 10037 |
+
"epoch": 0.2690294191301859,
|
| 10038 |
+
"grad_norm": 0.22180891036987305,
|
| 10039 |
+
"learning_rate": 0.0009979871469976197,
|
| 10040 |
+
"loss": 2.5934,
|
| 10041 |
+
"num_input_tokens_seen": 14784917056,
|
| 10042 |
+
"step": 56400
|
| 10043 |
+
},
|
| 10044 |
+
{
|
| 10045 |
+
"epoch": 0.26926792038828007,
|
| 10046 |
+
"grad_norm": 0.3060019910335541,
|
| 10047 |
+
"learning_rate": 0.0009974529372743762,
|
| 10048 |
+
"loss": 2.6224,
|
| 10049 |
+
"num_input_tokens_seen": 14798024256,
|
| 10050 |
+
"step": 56450
|
| 10051 |
+
},
|
| 10052 |
+
{
|
| 10053 |
+
"epoch": 0.2695064216463742,
|
| 10054 |
+
"grad_norm": 0.2387322634458542,
|
| 10055 |
+
"learning_rate": 0.0009968561049466214,
|
| 10056 |
+
"loss": 2.5905,
|
| 10057 |
+
"num_input_tokens_seen": 14811131456,
|
| 10058 |
+
"step": 56500
|
| 10059 |
+
},
|
| 10060 |
+
{
|
| 10061 |
+
"epoch": 0.2695064216463742,
|
| 10062 |
+
"eval_loss": 2.4835996627807617,
|
| 10063 |
+
"eval_runtime": 53.8478,
|
| 10064 |
+
"eval_samples_per_second": 92.854,
|
| 10065 |
+
"eval_steps_per_second": 23.214,
|
| 10066 |
+
"num_input_tokens_seen": 14811131456,
|
| 10067 |
+
"step": 56500
|
| 10068 |
+
},
|
| 10069 |
+
{
|
| 10070 |
+
"epoch": 0.26974492290446833,
|
| 10071 |
+
"grad_norm": 0.22091372311115265,
|
| 10072 |
+
"learning_rate": 0.0009961967251474822,
|
| 10073 |
+
"loss": 2.6139,
|
| 10074 |
+
"num_input_tokens_seen": 14824238656,
|
| 10075 |
+
"step": 56550
|
| 10076 |
+
},
|
| 10077 |
+
{
|
| 10078 |
+
"epoch": 0.26998342416256244,
|
| 10079 |
+
"grad_norm": 0.2304680198431015,
|
| 10080 |
+
"learning_rate": 0.0009954748808839674,
|
| 10081 |
+
"loss": 2.6167,
|
| 10082 |
+
"num_input_tokens_seen": 14837345856,
|
| 10083 |
+
"step": 56600
|
| 10084 |
+
},
|
| 10085 |
+
{
|
| 10086 |
+
"epoch": 0.2702219254206566,
|
| 10087 |
+
"grad_norm": 0.19777421653270721,
|
| 10088 |
+
"learning_rate": 0.0009946906630265184,
|
| 10089 |
+
"loss": 2.6082,
|
| 10090 |
+
"num_input_tokens_seen": 14850453056,
|
| 10091 |
+
"step": 56650
|
| 10092 |
+
},
|
| 10093 |
+
{
|
| 10094 |
+
"epoch": 0.27046042667875075,
|
| 10095 |
+
"grad_norm": 0.2113979458808899,
|
| 10096 |
+
"learning_rate": 0.0009938441702975688,
|
| 10097 |
+
"loss": 2.5981,
|
| 10098 |
+
"num_input_tokens_seen": 14863560256,
|
| 10099 |
+
"step": 56700
|
| 10100 |
+
},
|
| 10101 |
+
{
|
| 10102 |
+
"epoch": 0.27069892793684486,
|
| 10103 |
+
"grad_norm": 0.19911637902259827,
|
| 10104 |
+
"learning_rate": 0.0009929355092591179,
|
| 10105 |
+
"loss": 2.5904,
|
| 10106 |
+
"num_input_tokens_seen": 14876667456,
|
| 10107 |
+
"step": 56750
|
| 10108 |
+
},
|
| 10109 |
+
{
|
| 10110 |
+
"epoch": 0.270937429194939,
|
| 10111 |
+
"grad_norm": 0.20081694424152374,
|
| 10112 |
+
"learning_rate": 0.0009919647942993148,
|
| 10113 |
+
"loss": 2.6012,
|
| 10114 |
+
"num_input_tokens_seen": 14889774656,
|
| 10115 |
+
"step": 56800
|
| 10116 |
+
},
|
| 10117 |
+
{
|
| 10118 |
+
"epoch": 0.2711759304530331,
|
| 10119 |
+
"grad_norm": 0.22752800583839417,
|
| 10120 |
+
"learning_rate": 0.0009909321476180592,
|
| 10121 |
+
"loss": 2.6017,
|
| 10122 |
+
"num_input_tokens_seen": 14902881856,
|
| 10123 |
+
"step": 56850
|
| 10124 |
+
},
|
| 10125 |
+
{
|
| 10126 |
+
"epoch": 0.2714144317111273,
|
| 10127 |
+
"grad_norm": 0.23174402117729187,
|
| 10128 |
+
"learning_rate": 0.0009898376992116178,
|
| 10129 |
+
"loss": 2.6012,
|
| 10130 |
+
"num_input_tokens_seen": 14915989056,
|
| 10131 |
+
"step": 56900
|
| 10132 |
+
},
|
| 10133 |
+
{
|
| 10134 |
+
"epoch": 0.27165293296922144,
|
| 10135 |
+
"grad_norm": 0.22149533033370972,
|
| 10136 |
+
"learning_rate": 0.0009886815868562597,
|
| 10137 |
+
"loss": 2.5881,
|
| 10138 |
+
"num_input_tokens_seen": 14929096256,
|
| 10139 |
+
"step": 56950
|
| 10140 |
+
},
|
| 10141 |
+
{
|
| 10142 |
+
"epoch": 0.27189143422731554,
|
| 10143 |
+
"grad_norm": 0.22576771676540375,
|
| 10144 |
+
"learning_rate": 0.0009874639560909118,
|
| 10145 |
+
"loss": 2.6021,
|
| 10146 |
+
"num_input_tokens_seen": 14942203456,
|
| 10147 |
+
"step": 57000
|
| 10148 |
+
},
|
| 10149 |
+
{
|
| 10150 |
+
"epoch": 0.27189143422731554,
|
| 10151 |
+
"eval_loss": 2.482896566390991,
|
| 10152 |
+
"eval_runtime": 53.3773,
|
| 10153 |
+
"eval_samples_per_second": 93.673,
|
| 10154 |
+
"eval_steps_per_second": 23.418,
|
| 10155 |
+
"num_input_tokens_seen": 14942203456,
|
| 10156 |
+
"step": 57000
|
| 10157 |
}
|
| 10158 |
],
|
| 10159 |
"logging_steps": 50,
|
| 10160 |
"max_steps": 70000,
|
| 10161 |
+
"num_input_tokens_seen": 14942203456,
|
| 10162 |
"num_train_epochs": 1,
|
| 10163 |
"save_steps": 1000,
|
| 10164 |
"stateful_callbacks": {
|
|
|
|
| 10173 |
"attributes": {}
|
| 10174 |
}
|
| 10175 |
},
|
| 10176 |
+
"total_flos": 3.9971852603857306e+18,
|
| 10177 |
"train_batch_size": 64,
|
| 10178 |
"trial_name": null,
|
| 10179 |
"trial_params": null
|