Training in progress, step 24000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:54c2b1dc0ce252890792fa50a7ced2b1884b184496f8709b1df62b942e4f6173
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6caaef1143ab01dc77c2601e1c5bde16b77c55e497c5f13366c2442c28ab6fac
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4eb9e5f9b752984653e9c2f4587df901a2cc5f64a95a0121fadf8e7c7c268621
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06f0f3181677433703f6860ec173100c1f71e33282413595313e7174a82f6998
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4102,11 +4102,189 @@
|
|
| 4102 |
"eval_steps_per_second": 18.892,
|
| 4103 |
"num_input_tokens_seen": 24117244160,
|
| 4104 |
"step": 23000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4105 |
}
|
| 4106 |
],
|
| 4107 |
"logging_steps": 50,
|
| 4108 |
"max_steps": 200000,
|
| 4109 |
-
"num_input_tokens_seen":
|
| 4110 |
"num_train_epochs": 5,
|
| 4111 |
"save_steps": 1000,
|
| 4112 |
"stateful_callbacks": {
|
|
@@ -4121,7 +4299,7 @@
|
|
| 4121 |
"attributes": {}
|
| 4122 |
}
|
| 4123 |
},
|
| 4124 |
-
"total_flos": 1.
|
| 4125 |
"train_batch_size": 64,
|
| 4126 |
"trial_name": null,
|
| 4127 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.5271850378296451,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 24000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4102 |
"eval_steps_per_second": 18.892,
|
| 4103 |
"num_input_tokens_seen": 24117244160,
|
| 4104 |
"step": 23000
|
| 4105 |
+
},
|
| 4106 |
+
{
|
| 4107 |
+
"epoch": 0.5063172967488883,
|
| 4108 |
+
"grad_norm": 0.15117652714252472,
|
| 4109 |
+
"learning_rate": 0.001,
|
| 4110 |
+
"loss": 2.696,
|
| 4111 |
+
"num_input_tokens_seen": 24169672960,
|
| 4112 |
+
"step": 23050
|
| 4113 |
+
},
|
| 4114 |
+
{
|
| 4115 |
+
"epoch": 0.5074155989110334,
|
| 4116 |
+
"grad_norm": 0.15605470538139343,
|
| 4117 |
+
"learning_rate": 0.001,
|
| 4118 |
+
"loss": 2.6918,
|
| 4119 |
+
"num_input_tokens_seen": 24222101760,
|
| 4120 |
+
"step": 23100
|
| 4121 |
+
},
|
| 4122 |
+
{
|
| 4123 |
+
"epoch": 0.5085139010731785,
|
| 4124 |
+
"grad_norm": 0.17503651976585388,
|
| 4125 |
+
"learning_rate": 0.001,
|
| 4126 |
+
"loss": 2.688,
|
| 4127 |
+
"num_input_tokens_seen": 24274530560,
|
| 4128 |
+
"step": 23150
|
| 4129 |
+
},
|
| 4130 |
+
{
|
| 4131 |
+
"epoch": 0.5096122032353236,
|
| 4132 |
+
"grad_norm": 0.1622135490179062,
|
| 4133 |
+
"learning_rate": 0.001,
|
| 4134 |
+
"loss": 2.6949,
|
| 4135 |
+
"num_input_tokens_seen": 24326959360,
|
| 4136 |
+
"step": 23200
|
| 4137 |
+
},
|
| 4138 |
+
{
|
| 4139 |
+
"epoch": 0.5107105053974687,
|
| 4140 |
+
"grad_norm": 0.1331271231174469,
|
| 4141 |
+
"learning_rate": 0.001,
|
| 4142 |
+
"loss": 2.6876,
|
| 4143 |
+
"num_input_tokens_seen": 24379388160,
|
| 4144 |
+
"step": 23250
|
| 4145 |
+
},
|
| 4146 |
+
{
|
| 4147 |
+
"epoch": 0.5118088075596138,
|
| 4148 |
+
"grad_norm": 0.14365510642528534,
|
| 4149 |
+
"learning_rate": 0.001,
|
| 4150 |
+
"loss": 2.7027,
|
| 4151 |
+
"num_input_tokens_seen": 24431816960,
|
| 4152 |
+
"step": 23300
|
| 4153 |
+
},
|
| 4154 |
+
{
|
| 4155 |
+
"epoch": 0.5129071097217589,
|
| 4156 |
+
"grad_norm": 0.13621902465820312,
|
| 4157 |
+
"learning_rate": 0.001,
|
| 4158 |
+
"loss": 2.6946,
|
| 4159 |
+
"num_input_tokens_seen": 24484245760,
|
| 4160 |
+
"step": 23350
|
| 4161 |
+
},
|
| 4162 |
+
{
|
| 4163 |
+
"epoch": 0.5140054118839039,
|
| 4164 |
+
"grad_norm": 0.12506547570228577,
|
| 4165 |
+
"learning_rate": 0.001,
|
| 4166 |
+
"loss": 2.6864,
|
| 4167 |
+
"num_input_tokens_seen": 24536674560,
|
| 4168 |
+
"step": 23400
|
| 4169 |
+
},
|
| 4170 |
+
{
|
| 4171 |
+
"epoch": 0.515103714046049,
|
| 4172 |
+
"grad_norm": 0.12824128568172455,
|
| 4173 |
+
"learning_rate": 0.001,
|
| 4174 |
+
"loss": 2.6871,
|
| 4175 |
+
"num_input_tokens_seen": 24589103360,
|
| 4176 |
+
"step": 23450
|
| 4177 |
+
},
|
| 4178 |
+
{
|
| 4179 |
+
"epoch": 0.5162020162081942,
|
| 4180 |
+
"grad_norm": 0.14310036599636078,
|
| 4181 |
+
"learning_rate": 0.001,
|
| 4182 |
+
"loss": 2.6936,
|
| 4183 |
+
"num_input_tokens_seen": 24641532160,
|
| 4184 |
+
"step": 23500
|
| 4185 |
+
},
|
| 4186 |
+
{
|
| 4187 |
+
"epoch": 0.5162020162081942,
|
| 4188 |
+
"eval_loss": 2.592362880706787,
|
| 4189 |
+
"eval_runtime": 66.663,
|
| 4190 |
+
"eval_samples_per_second": 75.004,
|
| 4191 |
+
"eval_steps_per_second": 18.751,
|
| 4192 |
+
"num_input_tokens_seen": 24641532160,
|
| 4193 |
+
"step": 23500
|
| 4194 |
+
},
|
| 4195 |
+
{
|
| 4196 |
+
"epoch": 0.5173003183703393,
|
| 4197 |
+
"grad_norm": 0.1362077295780182,
|
| 4198 |
+
"learning_rate": 0.001,
|
| 4199 |
+
"loss": 2.6924,
|
| 4200 |
+
"num_input_tokens_seen": 24693960960,
|
| 4201 |
+
"step": 23550
|
| 4202 |
+
},
|
| 4203 |
+
{
|
| 4204 |
+
"epoch": 0.5183986205324843,
|
| 4205 |
+
"grad_norm": 0.13662473857402802,
|
| 4206 |
+
"learning_rate": 0.001,
|
| 4207 |
+
"loss": 2.6972,
|
| 4208 |
+
"num_input_tokens_seen": 24746389760,
|
| 4209 |
+
"step": 23600
|
| 4210 |
+
},
|
| 4211 |
+
{
|
| 4212 |
+
"epoch": 0.5194969226946294,
|
| 4213 |
+
"grad_norm": 0.12603560090065002,
|
| 4214 |
+
"learning_rate": 0.001,
|
| 4215 |
+
"loss": 2.6908,
|
| 4216 |
+
"num_input_tokens_seen": 24798818560,
|
| 4217 |
+
"step": 23650
|
| 4218 |
+
},
|
| 4219 |
+
{
|
| 4220 |
+
"epoch": 0.5205952248567746,
|
| 4221 |
+
"grad_norm": 0.16597150266170502,
|
| 4222 |
+
"learning_rate": 0.001,
|
| 4223 |
+
"loss": 2.6882,
|
| 4224 |
+
"num_input_tokens_seen": 24851247360,
|
| 4225 |
+
"step": 23700
|
| 4226 |
+
},
|
| 4227 |
+
{
|
| 4228 |
+
"epoch": 0.5216935270189196,
|
| 4229 |
+
"grad_norm": 0.13665246963500977,
|
| 4230 |
+
"learning_rate": 0.001,
|
| 4231 |
+
"loss": 2.6958,
|
| 4232 |
+
"num_input_tokens_seen": 24903676160,
|
| 4233 |
+
"step": 23750
|
| 4234 |
+
},
|
| 4235 |
+
{
|
| 4236 |
+
"epoch": 0.5227918291810647,
|
| 4237 |
+
"grad_norm": 0.14349523186683655,
|
| 4238 |
+
"learning_rate": 0.001,
|
| 4239 |
+
"loss": 2.6874,
|
| 4240 |
+
"num_input_tokens_seen": 24956104960,
|
| 4241 |
+
"step": 23800
|
| 4242 |
+
},
|
| 4243 |
+
{
|
| 4244 |
+
"epoch": 0.5238901313432098,
|
| 4245 |
+
"grad_norm": 0.15857954323291779,
|
| 4246 |
+
"learning_rate": 0.001,
|
| 4247 |
+
"loss": 2.6882,
|
| 4248 |
+
"num_input_tokens_seen": 25008533760,
|
| 4249 |
+
"step": 23850
|
| 4250 |
+
},
|
| 4251 |
+
{
|
| 4252 |
+
"epoch": 0.524988433505355,
|
| 4253 |
+
"grad_norm": 0.15056300163269043,
|
| 4254 |
+
"learning_rate": 0.001,
|
| 4255 |
+
"loss": 2.694,
|
| 4256 |
+
"num_input_tokens_seen": 25060962560,
|
| 4257 |
+
"step": 23900
|
| 4258 |
+
},
|
| 4259 |
+
{
|
| 4260 |
+
"epoch": 0.5260867356675,
|
| 4261 |
+
"grad_norm": 0.12861080467700958,
|
| 4262 |
+
"learning_rate": 0.001,
|
| 4263 |
+
"loss": 2.6899,
|
| 4264 |
+
"num_input_tokens_seen": 25113391360,
|
| 4265 |
+
"step": 23950
|
| 4266 |
+
},
|
| 4267 |
+
{
|
| 4268 |
+
"epoch": 0.5271850378296451,
|
| 4269 |
+
"grad_norm": 0.14443258941173553,
|
| 4270 |
+
"learning_rate": 0.001,
|
| 4271 |
+
"loss": 2.6929,
|
| 4272 |
+
"num_input_tokens_seen": 25165820160,
|
| 4273 |
+
"step": 24000
|
| 4274 |
+
},
|
| 4275 |
+
{
|
| 4276 |
+
"epoch": 0.5271850378296451,
|
| 4277 |
+
"eval_loss": 2.5910630226135254,
|
| 4278 |
+
"eval_runtime": 66.9014,
|
| 4279 |
+
"eval_samples_per_second": 74.737,
|
| 4280 |
+
"eval_steps_per_second": 18.684,
|
| 4281 |
+
"num_input_tokens_seen": 25165820160,
|
| 4282 |
+
"step": 24000
|
| 4283 |
}
|
| 4284 |
],
|
| 4285 |
"logging_steps": 50,
|
| 4286 |
"max_steps": 200000,
|
| 4287 |
+
"num_input_tokens_seen": 25165820160,
|
| 4288 |
"num_train_epochs": 5,
|
| 4289 |
"save_steps": 1000,
|
| 4290 |
"stateful_callbacks": {
|
|
|
|
| 4299 |
"attributes": {}
|
| 4300 |
}
|
| 4301 |
},
|
| 4302 |
+
"total_flos": 1.4332118996250132e+19,
|
| 4303 |
"train_batch_size": 64,
|
| 4304 |
"trial_name": null,
|
| 4305 |
"trial_params": null
|