Training in progress, step 25000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca1305d807a0d62209066bee9cbe48b75438f197b4d11307eb4ba5e592a11386
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed4d9687ffe945b21f6759ab92e79d3a46252bbf5731184d996dc881364e21e9
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a59157d1ca64ffae44fbe8134d666bfe8e12822f27ca50fb6e1f0b29f58d3b64
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8c565830d05eccabcd7df396792d29e3638ccbd6988e240ff15902ef690b7e6
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4280,11 +4280,189 @@
|
|
| 4280 |
"eval_steps_per_second": 18.684,
|
| 4281 |
"num_input_tokens_seen": 25165820160,
|
| 4282 |
"step": 24000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4283 |
}
|
| 4284 |
],
|
| 4285 |
"logging_steps": 50,
|
| 4286 |
"max_steps": 200000,
|
| 4287 |
-
"num_input_tokens_seen":
|
| 4288 |
"num_train_epochs": 5,
|
| 4289 |
"save_steps": 1000,
|
| 4290 |
"stateful_callbacks": {
|
|
@@ -4299,7 +4477,7 @@
|
|
| 4299 |
"attributes": {}
|
| 4300 |
}
|
| 4301 |
},
|
| 4302 |
-
"total_flos": 1.
|
| 4303 |
"train_batch_size": 64,
|
| 4304 |
"trial_name": null,
|
| 4305 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.549151081072547,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 25000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4280 |
"eval_steps_per_second": 18.684,
|
| 4281 |
"num_input_tokens_seen": 25165820160,
|
| 4282 |
"step": 24000
|
| 4283 |
+
},
|
| 4284 |
+
{
|
| 4285 |
+
"epoch": 0.5282833399917902,
|
| 4286 |
+
"grad_norm": 0.14083649218082428,
|
| 4287 |
+
"learning_rate": 0.001,
|
| 4288 |
+
"loss": 2.6851,
|
| 4289 |
+
"num_input_tokens_seen": 25218248960,
|
| 4290 |
+
"step": 24050
|
| 4291 |
+
},
|
| 4292 |
+
{
|
| 4293 |
+
"epoch": 0.5293816421539352,
|
| 4294 |
+
"grad_norm": 0.13934968411922455,
|
| 4295 |
+
"learning_rate": 0.001,
|
| 4296 |
+
"loss": 2.6863,
|
| 4297 |
+
"num_input_tokens_seen": 25270677760,
|
| 4298 |
+
"step": 24100
|
| 4299 |
+
},
|
| 4300 |
+
{
|
| 4301 |
+
"epoch": 0.5304799443160804,
|
| 4302 |
+
"grad_norm": 0.15416787564754486,
|
| 4303 |
+
"learning_rate": 0.001,
|
| 4304 |
+
"loss": 2.6894,
|
| 4305 |
+
"num_input_tokens_seen": 25323106560,
|
| 4306 |
+
"step": 24150
|
| 4307 |
+
},
|
| 4308 |
+
{
|
| 4309 |
+
"epoch": 0.5315782464782255,
|
| 4310 |
+
"grad_norm": 0.17290246486663818,
|
| 4311 |
+
"learning_rate": 0.001,
|
| 4312 |
+
"loss": 2.6907,
|
| 4313 |
+
"num_input_tokens_seen": 25375535360,
|
| 4314 |
+
"step": 24200
|
| 4315 |
+
},
|
| 4316 |
+
{
|
| 4317 |
+
"epoch": 0.5326765486403706,
|
| 4318 |
+
"grad_norm": 0.14260552823543549,
|
| 4319 |
+
"learning_rate": 0.001,
|
| 4320 |
+
"loss": 2.6832,
|
| 4321 |
+
"num_input_tokens_seen": 25427964160,
|
| 4322 |
+
"step": 24250
|
| 4323 |
+
},
|
| 4324 |
+
{
|
| 4325 |
+
"epoch": 0.5337748508025156,
|
| 4326 |
+
"grad_norm": 0.14795690774917603,
|
| 4327 |
+
"learning_rate": 0.001,
|
| 4328 |
+
"loss": 2.6895,
|
| 4329 |
+
"num_input_tokens_seen": 25480392960,
|
| 4330 |
+
"step": 24300
|
| 4331 |
+
},
|
| 4332 |
+
{
|
| 4333 |
+
"epoch": 0.5348731529646608,
|
| 4334 |
+
"grad_norm": 0.15009699761867523,
|
| 4335 |
+
"learning_rate": 0.001,
|
| 4336 |
+
"loss": 2.6819,
|
| 4337 |
+
"num_input_tokens_seen": 25532821760,
|
| 4338 |
+
"step": 24350
|
| 4339 |
+
},
|
| 4340 |
+
{
|
| 4341 |
+
"epoch": 0.5359714551268059,
|
| 4342 |
+
"grad_norm": 0.15425953269004822,
|
| 4343 |
+
"learning_rate": 0.001,
|
| 4344 |
+
"loss": 2.6874,
|
| 4345 |
+
"num_input_tokens_seen": 25585250560,
|
| 4346 |
+
"step": 24400
|
| 4347 |
+
},
|
| 4348 |
+
{
|
| 4349 |
+
"epoch": 0.5370697572889509,
|
| 4350 |
+
"grad_norm": 0.14639410376548767,
|
| 4351 |
+
"learning_rate": 0.001,
|
| 4352 |
+
"loss": 2.6878,
|
| 4353 |
+
"num_input_tokens_seen": 25637679360,
|
| 4354 |
+
"step": 24450
|
| 4355 |
+
},
|
| 4356 |
+
{
|
| 4357 |
+
"epoch": 0.538168059451096,
|
| 4358 |
+
"grad_norm": 0.14785613119602203,
|
| 4359 |
+
"learning_rate": 0.001,
|
| 4360 |
+
"loss": 2.6841,
|
| 4361 |
+
"num_input_tokens_seen": 25690108160,
|
| 4362 |
+
"step": 24500
|
| 4363 |
+
},
|
| 4364 |
+
{
|
| 4365 |
+
"epoch": 0.538168059451096,
|
| 4366 |
+
"eval_loss": 2.5875706672668457,
|
| 4367 |
+
"eval_runtime": 66.9296,
|
| 4368 |
+
"eval_samples_per_second": 74.705,
|
| 4369 |
+
"eval_steps_per_second": 18.676,
|
| 4370 |
+
"num_input_tokens_seen": 25690108160,
|
| 4371 |
+
"step": 24500
|
| 4372 |
+
},
|
| 4373 |
+
{
|
| 4374 |
+
"epoch": 0.5392663616132412,
|
| 4375 |
+
"grad_norm": 0.14224180579185486,
|
| 4376 |
+
"learning_rate": 0.001,
|
| 4377 |
+
"loss": 2.6876,
|
| 4378 |
+
"num_input_tokens_seen": 25742536960,
|
| 4379 |
+
"step": 24550
|
| 4380 |
+
},
|
| 4381 |
+
{
|
| 4382 |
+
"epoch": 0.5403646637753863,
|
| 4383 |
+
"grad_norm": 0.14881493151187897,
|
| 4384 |
+
"learning_rate": 0.001,
|
| 4385 |
+
"loss": 2.6827,
|
| 4386 |
+
"num_input_tokens_seen": 25794965760,
|
| 4387 |
+
"step": 24600
|
| 4388 |
+
},
|
| 4389 |
+
{
|
| 4390 |
+
"epoch": 0.5414629659375313,
|
| 4391 |
+
"grad_norm": 0.17951786518096924,
|
| 4392 |
+
"learning_rate": 0.001,
|
| 4393 |
+
"loss": 2.688,
|
| 4394 |
+
"num_input_tokens_seen": 25847394560,
|
| 4395 |
+
"step": 24650
|
| 4396 |
+
},
|
| 4397 |
+
{
|
| 4398 |
+
"epoch": 0.5425612680996764,
|
| 4399 |
+
"grad_norm": 0.1400926560163498,
|
| 4400 |
+
"learning_rate": 0.001,
|
| 4401 |
+
"loss": 2.6945,
|
| 4402 |
+
"num_input_tokens_seen": 25899823360,
|
| 4403 |
+
"step": 24700
|
| 4404 |
+
},
|
| 4405 |
+
{
|
| 4406 |
+
"epoch": 0.5436595702618215,
|
| 4407 |
+
"grad_norm": 0.1421627402305603,
|
| 4408 |
+
"learning_rate": 0.001,
|
| 4409 |
+
"loss": 2.6852,
|
| 4410 |
+
"num_input_tokens_seen": 25952252160,
|
| 4411 |
+
"step": 24750
|
| 4412 |
+
},
|
| 4413 |
+
{
|
| 4414 |
+
"epoch": 0.5447578724239666,
|
| 4415 |
+
"grad_norm": 0.1617737114429474,
|
| 4416 |
+
"learning_rate": 0.001,
|
| 4417 |
+
"loss": 2.686,
|
| 4418 |
+
"num_input_tokens_seen": 26004680960,
|
| 4419 |
+
"step": 24800
|
| 4420 |
+
},
|
| 4421 |
+
{
|
| 4422 |
+
"epoch": 0.5458561745861117,
|
| 4423 |
+
"grad_norm": 0.1523471176624298,
|
| 4424 |
+
"learning_rate": 0.001,
|
| 4425 |
+
"loss": 2.6945,
|
| 4426 |
+
"num_input_tokens_seen": 26057109760,
|
| 4427 |
+
"step": 24850
|
| 4428 |
+
},
|
| 4429 |
+
{
|
| 4430 |
+
"epoch": 0.5469544767482568,
|
| 4431 |
+
"grad_norm": 0.13078247010707855,
|
| 4432 |
+
"learning_rate": 0.001,
|
| 4433 |
+
"loss": 2.6829,
|
| 4434 |
+
"num_input_tokens_seen": 26109538560,
|
| 4435 |
+
"step": 24900
|
| 4436 |
+
},
|
| 4437 |
+
{
|
| 4438 |
+
"epoch": 0.5480527789104018,
|
| 4439 |
+
"grad_norm": 0.14831651747226715,
|
| 4440 |
+
"learning_rate": 0.001,
|
| 4441 |
+
"loss": 2.6898,
|
| 4442 |
+
"num_input_tokens_seen": 26161967360,
|
| 4443 |
+
"step": 24950
|
| 4444 |
+
},
|
| 4445 |
+
{
|
| 4446 |
+
"epoch": 0.549151081072547,
|
| 4447 |
+
"grad_norm": 0.1782410740852356,
|
| 4448 |
+
"learning_rate": 0.001,
|
| 4449 |
+
"loss": 2.6871,
|
| 4450 |
+
"num_input_tokens_seen": 26214396160,
|
| 4451 |
+
"step": 25000
|
| 4452 |
+
},
|
| 4453 |
+
{
|
| 4454 |
+
"epoch": 0.549151081072547,
|
| 4455 |
+
"eval_loss": 2.5877788066864014,
|
| 4456 |
+
"eval_runtime": 67.2223,
|
| 4457 |
+
"eval_samples_per_second": 74.38,
|
| 4458 |
+
"eval_steps_per_second": 18.595,
|
| 4459 |
+
"num_input_tokens_seen": 26214396160,
|
| 4460 |
+
"step": 25000
|
| 4461 |
}
|
| 4462 |
],
|
| 4463 |
"logging_steps": 50,
|
| 4464 |
"max_steps": 200000,
|
| 4465 |
+
"num_input_tokens_seen": 26214396160,
|
| 4466 |
"num_train_epochs": 5,
|
| 4467 |
"save_steps": 1000,
|
| 4468 |
"stateful_callbacks": {
|
|
|
|
| 4477 |
"attributes": {}
|
| 4478 |
}
|
| 4479 |
},
|
| 4480 |
+
"total_flos": 1.492929071221506e+19,
|
| 4481 |
"train_batch_size": 64,
|
| 4482 |
"trial_name": null,
|
| 4483 |
"trial_params": null
|