Training in progress, step 26000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28394340845b35c88e8a63417e18c503dadf4a251790835d2715e5a4962f656e
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4565341b2daf769a1d6b98280e7a99c73d3df5a11f570b225860490fa5b0252c
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b4ee497eed0fe85641f8ca254d6d7e11e60873712ef2108d29f717bef60c5dd
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ced8856b9ff194699de7fca54070bd17a17efd31d5f5d4d7e4c8ff1ec712ca9
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4458,11 +4458,189 @@
|
|
| 4458 |
"eval_steps_per_second": 18.595,
|
| 4459 |
"num_input_tokens_seen": 26214396160,
|
| 4460 |
"step": 25000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4461 |
}
|
| 4462 |
],
|
| 4463 |
"logging_steps": 50,
|
| 4464 |
"max_steps": 200000,
|
| 4465 |
-
"num_input_tokens_seen":
|
| 4466 |
"num_train_epochs": 5,
|
| 4467 |
"save_steps": 1000,
|
| 4468 |
"stateful_callbacks": {
|
|
@@ -4477,7 +4655,7 @@
|
|
| 4477 |
"attributes": {}
|
| 4478 |
}
|
| 4479 |
},
|
| 4480 |
-
"total_flos": 1.
|
| 4481 |
"train_batch_size": 64,
|
| 4482 |
"trial_name": null,
|
| 4483 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.5711171243154488,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 26000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4458 |
"eval_steps_per_second": 18.595,
|
| 4459 |
"num_input_tokens_seen": 26214396160,
|
| 4460 |
"step": 25000
|
| 4461 |
+
},
|
| 4462 |
+
{
|
| 4463 |
+
"epoch": 0.5502493832346921,
|
| 4464 |
+
"grad_norm": 0.16484692692756653,
|
| 4465 |
+
"learning_rate": 0.001,
|
| 4466 |
+
"loss": 2.6843,
|
| 4467 |
+
"num_input_tokens_seen": 26266824960,
|
| 4468 |
+
"step": 25050
|
| 4469 |
+
},
|
| 4470 |
+
{
|
| 4471 |
+
"epoch": 0.5513476853968372,
|
| 4472 |
+
"grad_norm": 0.1583317369222641,
|
| 4473 |
+
"learning_rate": 0.001,
|
| 4474 |
+
"loss": 2.6825,
|
| 4475 |
+
"num_input_tokens_seen": 26319253760,
|
| 4476 |
+
"step": 25100
|
| 4477 |
+
},
|
| 4478 |
+
{
|
| 4479 |
+
"epoch": 0.5524459875589822,
|
| 4480 |
+
"grad_norm": 0.1569424867630005,
|
| 4481 |
+
"learning_rate": 0.001,
|
| 4482 |
+
"loss": 2.6787,
|
| 4483 |
+
"num_input_tokens_seen": 26371682560,
|
| 4484 |
+
"step": 25150
|
| 4485 |
+
},
|
| 4486 |
+
{
|
| 4487 |
+
"epoch": 0.5535442897211273,
|
| 4488 |
+
"grad_norm": 0.13633306324481964,
|
| 4489 |
+
"learning_rate": 0.001,
|
| 4490 |
+
"loss": 2.6872,
|
| 4491 |
+
"num_input_tokens_seen": 26424111360,
|
| 4492 |
+
"step": 25200
|
| 4493 |
+
},
|
| 4494 |
+
{
|
| 4495 |
+
"epoch": 0.5546425918832725,
|
| 4496 |
+
"grad_norm": 0.1480533927679062,
|
| 4497 |
+
"learning_rate": 0.001,
|
| 4498 |
+
"loss": 2.6842,
|
| 4499 |
+
"num_input_tokens_seen": 26476540160,
|
| 4500 |
+
"step": 25250
|
| 4501 |
+
},
|
| 4502 |
+
{
|
| 4503 |
+
"epoch": 0.5557408940454175,
|
| 4504 |
+
"grad_norm": 0.1267666518688202,
|
| 4505 |
+
"learning_rate": 0.001,
|
| 4506 |
+
"loss": 2.6839,
|
| 4507 |
+
"num_input_tokens_seen": 26528968960,
|
| 4508 |
+
"step": 25300
|
| 4509 |
+
},
|
| 4510 |
+
{
|
| 4511 |
+
"epoch": 0.5568391962075626,
|
| 4512 |
+
"grad_norm": 0.13951599597930908,
|
| 4513 |
+
"learning_rate": 0.001,
|
| 4514 |
+
"loss": 2.6799,
|
| 4515 |
+
"num_input_tokens_seen": 26581397760,
|
| 4516 |
+
"step": 25350
|
| 4517 |
+
},
|
| 4518 |
+
{
|
| 4519 |
+
"epoch": 0.5579374983697077,
|
| 4520 |
+
"grad_norm": 0.15044580399990082,
|
| 4521 |
+
"learning_rate": 0.001,
|
| 4522 |
+
"loss": 2.6846,
|
| 4523 |
+
"num_input_tokens_seen": 26633826560,
|
| 4524 |
+
"step": 25400
|
| 4525 |
+
},
|
| 4526 |
+
{
|
| 4527 |
+
"epoch": 0.5590358005318529,
|
| 4528 |
+
"grad_norm": 0.12891829013824463,
|
| 4529 |
+
"learning_rate": 0.001,
|
| 4530 |
+
"loss": 2.682,
|
| 4531 |
+
"num_input_tokens_seen": 26686255360,
|
| 4532 |
+
"step": 25450
|
| 4533 |
+
},
|
| 4534 |
+
{
|
| 4535 |
+
"epoch": 0.5601341026939979,
|
| 4536 |
+
"grad_norm": 0.12812241911888123,
|
| 4537 |
+
"learning_rate": 0.001,
|
| 4538 |
+
"loss": 2.684,
|
| 4539 |
+
"num_input_tokens_seen": 26738684160,
|
| 4540 |
+
"step": 25500
|
| 4541 |
+
},
|
| 4542 |
+
{
|
| 4543 |
+
"epoch": 0.5601341026939979,
|
| 4544 |
+
"eval_loss": 2.5832085609436035,
|
| 4545 |
+
"eval_runtime": 66.9038,
|
| 4546 |
+
"eval_samples_per_second": 74.734,
|
| 4547 |
+
"eval_steps_per_second": 18.684,
|
| 4548 |
+
"num_input_tokens_seen": 26738684160,
|
| 4549 |
+
"step": 25500
|
| 4550 |
+
},
|
| 4551 |
+
{
|
| 4552 |
+
"epoch": 0.561232404856143,
|
| 4553 |
+
"grad_norm": 0.14243654906749725,
|
| 4554 |
+
"learning_rate": 0.001,
|
| 4555 |
+
"loss": 2.6883,
|
| 4556 |
+
"num_input_tokens_seen": 26791112960,
|
| 4557 |
+
"step": 25550
|
| 4558 |
+
},
|
| 4559 |
+
{
|
| 4560 |
+
"epoch": 0.5623307070182881,
|
| 4561 |
+
"grad_norm": 0.14436320960521698,
|
| 4562 |
+
"learning_rate": 0.001,
|
| 4563 |
+
"loss": 2.6835,
|
| 4564 |
+
"num_input_tokens_seen": 26843541760,
|
| 4565 |
+
"step": 25600
|
| 4566 |
+
},
|
| 4567 |
+
{
|
| 4568 |
+
"epoch": 0.5634290091804331,
|
| 4569 |
+
"grad_norm": 0.1516960710287094,
|
| 4570 |
+
"learning_rate": 0.001,
|
| 4571 |
+
"loss": 2.6752,
|
| 4572 |
+
"num_input_tokens_seen": 26895970560,
|
| 4573 |
+
"step": 25650
|
| 4574 |
+
},
|
| 4575 |
+
{
|
| 4576 |
+
"epoch": 0.5645273113425783,
|
| 4577 |
+
"grad_norm": 0.14002515375614166,
|
| 4578 |
+
"learning_rate": 0.001,
|
| 4579 |
+
"loss": 2.6817,
|
| 4580 |
+
"num_input_tokens_seen": 26948399360,
|
| 4581 |
+
"step": 25700
|
| 4582 |
+
},
|
| 4583 |
+
{
|
| 4584 |
+
"epoch": 0.5656256135047234,
|
| 4585 |
+
"grad_norm": 0.1379036009311676,
|
| 4586 |
+
"learning_rate": 0.001,
|
| 4587 |
+
"loss": 2.6904,
|
| 4588 |
+
"num_input_tokens_seen": 27000828160,
|
| 4589 |
+
"step": 25750
|
| 4590 |
+
},
|
| 4591 |
+
{
|
| 4592 |
+
"epoch": 0.5667239156668685,
|
| 4593 |
+
"grad_norm": 0.16127964854240417,
|
| 4594 |
+
"learning_rate": 0.001,
|
| 4595 |
+
"loss": 2.6813,
|
| 4596 |
+
"num_input_tokens_seen": 27053256960,
|
| 4597 |
+
"step": 25800
|
| 4598 |
+
},
|
| 4599 |
+
{
|
| 4600 |
+
"epoch": 0.5678222178290135,
|
| 4601 |
+
"grad_norm": 0.15714125335216522,
|
| 4602 |
+
"learning_rate": 0.001,
|
| 4603 |
+
"loss": 2.6851,
|
| 4604 |
+
"num_input_tokens_seen": 27105685760,
|
| 4605 |
+
"step": 25850
|
| 4606 |
+
},
|
| 4607 |
+
{
|
| 4608 |
+
"epoch": 0.5689205199911587,
|
| 4609 |
+
"grad_norm": 0.15288160741329193,
|
| 4610 |
+
"learning_rate": 0.001,
|
| 4611 |
+
"loss": 2.6832,
|
| 4612 |
+
"num_input_tokens_seen": 27158114560,
|
| 4613 |
+
"step": 25900
|
| 4614 |
+
},
|
| 4615 |
+
{
|
| 4616 |
+
"epoch": 0.5700188221533038,
|
| 4617 |
+
"grad_norm": 0.1398363709449768,
|
| 4618 |
+
"learning_rate": 0.001,
|
| 4619 |
+
"loss": 2.6814,
|
| 4620 |
+
"num_input_tokens_seen": 27210543360,
|
| 4621 |
+
"step": 25950
|
| 4622 |
+
},
|
| 4623 |
+
{
|
| 4624 |
+
"epoch": 0.5711171243154488,
|
| 4625 |
+
"grad_norm": 0.15253235399723053,
|
| 4626 |
+
"learning_rate": 0.001,
|
| 4627 |
+
"loss": 2.6755,
|
| 4628 |
+
"num_input_tokens_seen": 27262972160,
|
| 4629 |
+
"step": 26000
|
| 4630 |
+
},
|
| 4631 |
+
{
|
| 4632 |
+
"epoch": 0.5711171243154488,
|
| 4633 |
+
"eval_loss": 2.5809168815612793,
|
| 4634 |
+
"eval_runtime": 66.151,
|
| 4635 |
+
"eval_samples_per_second": 75.585,
|
| 4636 |
+
"eval_steps_per_second": 18.896,
|
| 4637 |
+
"num_input_tokens_seen": 27262972160,
|
| 4638 |
+
"step": 26000
|
| 4639 |
}
|
| 4640 |
],
|
| 4641 |
"logging_steps": 50,
|
| 4642 |
"max_steps": 200000,
|
| 4643 |
+
"num_input_tokens_seen": 27262972160,
|
| 4644 |
"num_train_epochs": 5,
|
| 4645 |
"save_steps": 1000,
|
| 4646 |
"stateful_callbacks": {
|
|
|
|
| 4655 |
"attributes": {}
|
| 4656 |
}
|
| 4657 |
},
|
| 4658 |
+
"total_flos": 1.5526462428179988e+19,
|
| 4659 |
"train_batch_size": 64,
|
| 4660 |
"trial_name": null,
|
| 4661 |
"trial_params": null
|