Training checkpoint at step 13000
Browse files- trainer_state.json +186 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4508,6 +4508,186 @@
|
|
| 4508 |
"eval_samples_per_second": 2.473,
|
| 4509 |
"eval_steps_per_second": 1.237,
|
| 4510 |
"step": 12500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4511 |
}
|
| 4512 |
],
|
| 4513 |
"logging_steps": 25,
|
|
@@ -4527,7 +4707,7 @@
|
|
| 4527 |
"attributes": {}
|
| 4528 |
}
|
| 4529 |
},
|
| 4530 |
-
"total_flos": 2.
|
| 4531 |
"train_batch_size": 1,
|
| 4532 |
"trial_name": null,
|
| 4533 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 13000,
|
| 3 |
+
"best_metric": 2.532376766204834,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-13000",
|
| 5 |
+
"epoch": 0.26,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 13000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4508 |
"eval_samples_per_second": 2.473,
|
| 4509 |
"eval_steps_per_second": 1.237,
|
| 4510 |
"step": 12500
|
| 4511 |
+
},
|
| 4512 |
+
{
|
| 4513 |
+
"epoch": 0.2505,
|
| 4514 |
+
"grad_norm": 2.019140906115923,
|
| 4515 |
+
"learning_rate": 8.328e-06,
|
| 4516 |
+
"loss": 2.5241,
|
| 4517 |
+
"step": 12525
|
| 4518 |
+
},
|
| 4519 |
+
{
|
| 4520 |
+
"epoch": 0.251,
|
| 4521 |
+
"grad_norm": 1.9012303831260067,
|
| 4522 |
+
"learning_rate": 8.322444444444446e-06,
|
| 4523 |
+
"loss": 2.5354,
|
| 4524 |
+
"step": 12550
|
| 4525 |
+
},
|
| 4526 |
+
{
|
| 4527 |
+
"epoch": 0.2515,
|
| 4528 |
+
"grad_norm": 1.7607101331370496,
|
| 4529 |
+
"learning_rate": 8.31688888888889e-06,
|
| 4530 |
+
"loss": 2.5254,
|
| 4531 |
+
"step": 12575
|
| 4532 |
+
},
|
| 4533 |
+
{
|
| 4534 |
+
"epoch": 0.252,
|
| 4535 |
+
"grad_norm": 2.5505055208286933,
|
| 4536 |
+
"learning_rate": 8.311333333333333e-06,
|
| 4537 |
+
"loss": 2.5294,
|
| 4538 |
+
"step": 12600
|
| 4539 |
+
},
|
| 4540 |
+
{
|
| 4541 |
+
"epoch": 0.252,
|
| 4542 |
+
"eval_loss": 2.535231351852417,
|
| 4543 |
+
"eval_runtime": 41.9731,
|
| 4544 |
+
"eval_samples_per_second": 2.478,
|
| 4545 |
+
"eval_steps_per_second": 1.239,
|
| 4546 |
+
"step": 12600
|
| 4547 |
+
},
|
| 4548 |
+
{
|
| 4549 |
+
"epoch": 0.2525,
|
| 4550 |
+
"grad_norm": 1.6218420390627293,
|
| 4551 |
+
"learning_rate": 8.305777777777778e-06,
|
| 4552 |
+
"loss": 2.5262,
|
| 4553 |
+
"step": 12625
|
| 4554 |
+
},
|
| 4555 |
+
{
|
| 4556 |
+
"epoch": 0.253,
|
| 4557 |
+
"grad_norm": 2.0991897222525115,
|
| 4558 |
+
"learning_rate": 8.300222222222223e-06,
|
| 4559 |
+
"loss": 2.5206,
|
| 4560 |
+
"step": 12650
|
| 4561 |
+
},
|
| 4562 |
+
{
|
| 4563 |
+
"epoch": 0.2535,
|
| 4564 |
+
"grad_norm": 2.478785246720621,
|
| 4565 |
+
"learning_rate": 8.294666666666667e-06,
|
| 4566 |
+
"loss": 2.5275,
|
| 4567 |
+
"step": 12675
|
| 4568 |
+
},
|
| 4569 |
+
{
|
| 4570 |
+
"epoch": 0.254,
|
| 4571 |
+
"grad_norm": 2.141371973093057,
|
| 4572 |
+
"learning_rate": 8.289111111111112e-06,
|
| 4573 |
+
"loss": 2.5323,
|
| 4574 |
+
"step": 12700
|
| 4575 |
+
},
|
| 4576 |
+
{
|
| 4577 |
+
"epoch": 0.254,
|
| 4578 |
+
"eval_loss": 2.5341796875,
|
| 4579 |
+
"eval_runtime": 42.2622,
|
| 4580 |
+
"eval_samples_per_second": 2.461,
|
| 4581 |
+
"eval_steps_per_second": 1.23,
|
| 4582 |
+
"step": 12700
|
| 4583 |
+
},
|
| 4584 |
+
{
|
| 4585 |
+
"epoch": 0.2545,
|
| 4586 |
+
"grad_norm": 2.269733740633448,
|
| 4587 |
+
"learning_rate": 8.283555555555556e-06,
|
| 4588 |
+
"loss": 2.5367,
|
| 4589 |
+
"step": 12725
|
| 4590 |
+
},
|
| 4591 |
+
{
|
| 4592 |
+
"epoch": 0.255,
|
| 4593 |
+
"grad_norm": 1.893617133257015,
|
| 4594 |
+
"learning_rate": 8.278000000000001e-06,
|
| 4595 |
+
"loss": 2.5257,
|
| 4596 |
+
"step": 12750
|
| 4597 |
+
},
|
| 4598 |
+
{
|
| 4599 |
+
"epoch": 0.2555,
|
| 4600 |
+
"grad_norm": 1.751381032940087,
|
| 4601 |
+
"learning_rate": 8.272444444444445e-06,
|
| 4602 |
+
"loss": 2.5276,
|
| 4603 |
+
"step": 12775
|
| 4604 |
+
},
|
| 4605 |
+
{
|
| 4606 |
+
"epoch": 0.256,
|
| 4607 |
+
"grad_norm": 2.6264391487699545,
|
| 4608 |
+
"learning_rate": 8.26688888888889e-06,
|
| 4609 |
+
"loss": 2.5281,
|
| 4610 |
+
"step": 12800
|
| 4611 |
+
},
|
| 4612 |
+
{
|
| 4613 |
+
"epoch": 0.256,
|
| 4614 |
+
"eval_loss": 2.534780740737915,
|
| 4615 |
+
"eval_runtime": 42.0037,
|
| 4616 |
+
"eval_samples_per_second": 2.476,
|
| 4617 |
+
"eval_steps_per_second": 1.238,
|
| 4618 |
+
"step": 12800
|
| 4619 |
+
},
|
| 4620 |
+
{
|
| 4621 |
+
"epoch": 0.2565,
|
| 4622 |
+
"grad_norm": 2.9544216590918766,
|
| 4623 |
+
"learning_rate": 8.261333333333335e-06,
|
| 4624 |
+
"loss": 2.5159,
|
| 4625 |
+
"step": 12825
|
| 4626 |
+
},
|
| 4627 |
+
{
|
| 4628 |
+
"epoch": 0.257,
|
| 4629 |
+
"grad_norm": 1.703574826031134,
|
| 4630 |
+
"learning_rate": 8.255777777777779e-06,
|
| 4631 |
+
"loss": 2.5314,
|
| 4632 |
+
"step": 12850
|
| 4633 |
+
},
|
| 4634 |
+
{
|
| 4635 |
+
"epoch": 0.2575,
|
| 4636 |
+
"grad_norm": 2.23456733038464,
|
| 4637 |
+
"learning_rate": 8.250222222222222e-06,
|
| 4638 |
+
"loss": 2.5301,
|
| 4639 |
+
"step": 12875
|
| 4640 |
+
},
|
| 4641 |
+
{
|
| 4642 |
+
"epoch": 0.258,
|
| 4643 |
+
"grad_norm": 2.0236952351089132,
|
| 4644 |
+
"learning_rate": 8.244666666666667e-06,
|
| 4645 |
+
"loss": 2.5274,
|
| 4646 |
+
"step": 12900
|
| 4647 |
+
},
|
| 4648 |
+
{
|
| 4649 |
+
"epoch": 0.258,
|
| 4650 |
+
"eval_loss": 2.532827615737915,
|
| 4651 |
+
"eval_runtime": 42.2742,
|
| 4652 |
+
"eval_samples_per_second": 2.46,
|
| 4653 |
+
"eval_steps_per_second": 1.23,
|
| 4654 |
+
"step": 12900
|
| 4655 |
+
},
|
| 4656 |
+
{
|
| 4657 |
+
"epoch": 0.2585,
|
| 4658 |
+
"grad_norm": 1.9175658573019432,
|
| 4659 |
+
"learning_rate": 8.239111111111113e-06,
|
| 4660 |
+
"loss": 2.5293,
|
| 4661 |
+
"step": 12925
|
| 4662 |
+
},
|
| 4663 |
+
{
|
| 4664 |
+
"epoch": 0.259,
|
| 4665 |
+
"grad_norm": 2.227745372848629,
|
| 4666 |
+
"learning_rate": 8.233555555555556e-06,
|
| 4667 |
+
"loss": 2.5346,
|
| 4668 |
+
"step": 12950
|
| 4669 |
+
},
|
| 4670 |
+
{
|
| 4671 |
+
"epoch": 0.2595,
|
| 4672 |
+
"grad_norm": 2.0320264112024375,
|
| 4673 |
+
"learning_rate": 8.228e-06,
|
| 4674 |
+
"loss": 2.5133,
|
| 4675 |
+
"step": 12975
|
| 4676 |
+
},
|
| 4677 |
+
{
|
| 4678 |
+
"epoch": 0.26,
|
| 4679 |
+
"grad_norm": 2.3254627331546636,
|
| 4680 |
+
"learning_rate": 8.222444444444445e-06,
|
| 4681 |
+
"loss": 2.5257,
|
| 4682 |
+
"step": 13000
|
| 4683 |
+
},
|
| 4684 |
+
{
|
| 4685 |
+
"epoch": 0.26,
|
| 4686 |
+
"eval_loss": 2.532376766204834,
|
| 4687 |
+
"eval_runtime": 42.0555,
|
| 4688 |
+
"eval_samples_per_second": 2.473,
|
| 4689 |
+
"eval_steps_per_second": 1.236,
|
| 4690 |
+
"step": 13000
|
| 4691 |
}
|
| 4692 |
],
|
| 4693 |
"logging_steps": 25,
|
|
|
|
| 4707 |
"attributes": {}
|
| 4708 |
}
|
| 4709 |
},
|
| 4710 |
+
"total_flos": 2.9174851597705937e+19,
|
| 4711 |
"train_batch_size": 1,
|
| 4712 |
"trial_name": null,
|
| 4713 |
"trial_params": null
|