Training checkpoint at step 22000
Browse files- trainer_state.json +366 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7568,6 +7568,366 @@
|
|
| 7568 |
"eval_samples_per_second": 3.209,
|
| 7569 |
"eval_steps_per_second": 1.604,
|
| 7570 |
"step": 21000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7571 |
}
|
| 7572 |
],
|
| 7573 |
"logging_steps": 25,
|
|
@@ -7587,7 +7947,7 @@
|
|
| 7587 |
"attributes": {}
|
| 7588 |
}
|
| 7589 |
},
|
| 7590 |
-
"total_flos":
|
| 7591 |
"train_batch_size": 1,
|
| 7592 |
"trial_name": null,
|
| 7593 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 22000,
|
| 3 |
+
"best_metric": 2.3865110874176025,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-22000",
|
| 5 |
+
"epoch": 0.44,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 22000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7568 |
"eval_samples_per_second": 3.209,
|
| 7569 |
"eval_steps_per_second": 1.604,
|
| 7570 |
"step": 21000
|
| 7571 |
+
},
|
| 7572 |
+
{
|
| 7573 |
+
"epoch": 0.4205,
|
| 7574 |
+
"grad_norm": 0.5447308357523696,
|
| 7575 |
+
"learning_rate": 6.439111111111111e-06,
|
| 7576 |
+
"loss": 2.3803,
|
| 7577 |
+
"step": 21025
|
| 7578 |
+
},
|
| 7579 |
+
{
|
| 7580 |
+
"epoch": 0.421,
|
| 7581 |
+
"grad_norm": 0.5426314550064573,
|
| 7582 |
+
"learning_rate": 6.4335555555555566e-06,
|
| 7583 |
+
"loss": 2.3798,
|
| 7584 |
+
"step": 21050
|
| 7585 |
+
},
|
| 7586 |
+
{
|
| 7587 |
+
"epoch": 0.4215,
|
| 7588 |
+
"grad_norm": 0.5623213994558643,
|
| 7589 |
+
"learning_rate": 6.428000000000001e-06,
|
| 7590 |
+
"loss": 2.3855,
|
| 7591 |
+
"step": 21075
|
| 7592 |
+
},
|
| 7593 |
+
{
|
| 7594 |
+
"epoch": 0.422,
|
| 7595 |
+
"grad_norm": 0.551782200199429,
|
| 7596 |
+
"learning_rate": 6.4224444444444445e-06,
|
| 7597 |
+
"loss": 2.3744,
|
| 7598 |
+
"step": 21100
|
| 7599 |
+
},
|
| 7600 |
+
{
|
| 7601 |
+
"epoch": 0.422,
|
| 7602 |
+
"eval_loss": 2.3879234790802,
|
| 7603 |
+
"eval_runtime": 31.7247,
|
| 7604 |
+
"eval_samples_per_second": 3.215,
|
| 7605 |
+
"eval_steps_per_second": 1.608,
|
| 7606 |
+
"step": 21100
|
| 7607 |
+
},
|
| 7608 |
+
{
|
| 7609 |
+
"epoch": 0.4225,
|
| 7610 |
+
"grad_norm": 0.527718965025146,
|
| 7611 |
+
"learning_rate": 6.416888888888889e-06,
|
| 7612 |
+
"loss": 2.3629,
|
| 7613 |
+
"step": 21125
|
| 7614 |
+
},
|
| 7615 |
+
{
|
| 7616 |
+
"epoch": 0.423,
|
| 7617 |
+
"grad_norm": 0.5608708238117702,
|
| 7618 |
+
"learning_rate": 6.411333333333334e-06,
|
| 7619 |
+
"loss": 2.3775,
|
| 7620 |
+
"step": 21150
|
| 7621 |
+
},
|
| 7622 |
+
{
|
| 7623 |
+
"epoch": 0.4235,
|
| 7624 |
+
"grad_norm": 0.5448339479028284,
|
| 7625 |
+
"learning_rate": 6.405777777777779e-06,
|
| 7626 |
+
"loss": 2.379,
|
| 7627 |
+
"step": 21175
|
| 7628 |
+
},
|
| 7629 |
+
{
|
| 7630 |
+
"epoch": 0.424,
|
| 7631 |
+
"grad_norm": 0.5418336159854089,
|
| 7632 |
+
"learning_rate": 6.400222222222223e-06,
|
| 7633 |
+
"loss": 2.3771,
|
| 7634 |
+
"step": 21200
|
| 7635 |
+
},
|
| 7636 |
+
{
|
| 7637 |
+
"epoch": 0.424,
|
| 7638 |
+
"eval_loss": 2.3878672122955322,
|
| 7639 |
+
"eval_runtime": 31.8891,
|
| 7640 |
+
"eval_samples_per_second": 3.199,
|
| 7641 |
+
"eval_steps_per_second": 1.599,
|
| 7642 |
+
"step": 21200
|
| 7643 |
+
},
|
| 7644 |
+
{
|
| 7645 |
+
"epoch": 0.4245,
|
| 7646 |
+
"grad_norm": 0.5765916975285049,
|
| 7647 |
+
"learning_rate": 6.3946666666666665e-06,
|
| 7648 |
+
"loss": 2.3838,
|
| 7649 |
+
"step": 21225
|
| 7650 |
+
},
|
| 7651 |
+
{
|
| 7652 |
+
"epoch": 0.425,
|
| 7653 |
+
"grad_norm": 0.5482787584221817,
|
| 7654 |
+
"learning_rate": 6.389111111111112e-06,
|
| 7655 |
+
"loss": 2.3751,
|
| 7656 |
+
"step": 21250
|
| 7657 |
+
},
|
| 7658 |
+
{
|
| 7659 |
+
"epoch": 0.4255,
|
| 7660 |
+
"grad_norm": 0.5592623692636863,
|
| 7661 |
+
"learning_rate": 6.383555555555556e-06,
|
| 7662 |
+
"loss": 2.3714,
|
| 7663 |
+
"step": 21275
|
| 7664 |
+
},
|
| 7665 |
+
{
|
| 7666 |
+
"epoch": 0.426,
|
| 7667 |
+
"grad_norm": 0.5502456266750644,
|
| 7668 |
+
"learning_rate": 6.378000000000001e-06,
|
| 7669 |
+
"loss": 2.3687,
|
| 7670 |
+
"step": 21300
|
| 7671 |
+
},
|
| 7672 |
+
{
|
| 7673 |
+
"epoch": 0.426,
|
| 7674 |
+
"eval_loss": 2.387702226638794,
|
| 7675 |
+
"eval_runtime": 31.8474,
|
| 7676 |
+
"eval_samples_per_second": 3.203,
|
| 7677 |
+
"eval_steps_per_second": 1.601,
|
| 7678 |
+
"step": 21300
|
| 7679 |
+
},
|
| 7680 |
+
{
|
| 7681 |
+
"epoch": 0.4265,
|
| 7682 |
+
"grad_norm": 0.5508844144432443,
|
| 7683 |
+
"learning_rate": 6.372444444444444e-06,
|
| 7684 |
+
"loss": 2.3705,
|
| 7685 |
+
"step": 21325
|
| 7686 |
+
},
|
| 7687 |
+
{
|
| 7688 |
+
"epoch": 0.427,
|
| 7689 |
+
"grad_norm": 0.5551955771008479,
|
| 7690 |
+
"learning_rate": 6.366888888888889e-06,
|
| 7691 |
+
"loss": 2.3616,
|
| 7692 |
+
"step": 21350
|
| 7693 |
+
},
|
| 7694 |
+
{
|
| 7695 |
+
"epoch": 0.4275,
|
| 7696 |
+
"grad_norm": 0.5482174863813819,
|
| 7697 |
+
"learning_rate": 6.361333333333334e-06,
|
| 7698 |
+
"loss": 2.3679,
|
| 7699 |
+
"step": 21375
|
| 7700 |
+
},
|
| 7701 |
+
{
|
| 7702 |
+
"epoch": 0.428,
|
| 7703 |
+
"grad_norm": 0.540793837360148,
|
| 7704 |
+
"learning_rate": 6.355777777777778e-06,
|
| 7705 |
+
"loss": 2.3724,
|
| 7706 |
+
"step": 21400
|
| 7707 |
+
},
|
| 7708 |
+
{
|
| 7709 |
+
"epoch": 0.428,
|
| 7710 |
+
"eval_loss": 2.3876450061798096,
|
| 7711 |
+
"eval_runtime": 32.2051,
|
| 7712 |
+
"eval_samples_per_second": 3.167,
|
| 7713 |
+
"eval_steps_per_second": 1.584,
|
| 7714 |
+
"step": 21400
|
| 7715 |
+
},
|
| 7716 |
+
{
|
| 7717 |
+
"epoch": 0.4285,
|
| 7718 |
+
"grad_norm": 0.5478812262209652,
|
| 7719 |
+
"learning_rate": 6.3502222222222235e-06,
|
| 7720 |
+
"loss": 2.3639,
|
| 7721 |
+
"step": 21425
|
| 7722 |
+
},
|
| 7723 |
+
{
|
| 7724 |
+
"epoch": 0.429,
|
| 7725 |
+
"grad_norm": 0.5598419449976438,
|
| 7726 |
+
"learning_rate": 6.344666666666667e-06,
|
| 7727 |
+
"loss": 2.3686,
|
| 7728 |
+
"step": 21450
|
| 7729 |
+
},
|
| 7730 |
+
{
|
| 7731 |
+
"epoch": 0.4295,
|
| 7732 |
+
"grad_norm": 0.5650989625187698,
|
| 7733 |
+
"learning_rate": 6.339111111111111e-06,
|
| 7734 |
+
"loss": 2.3755,
|
| 7735 |
+
"step": 21475
|
| 7736 |
+
},
|
| 7737 |
+
{
|
| 7738 |
+
"epoch": 0.43,
|
| 7739 |
+
"grad_norm": 0.5521104434834965,
|
| 7740 |
+
"learning_rate": 6.333555555555556e-06,
|
| 7741 |
+
"loss": 2.3819,
|
| 7742 |
+
"step": 21500
|
| 7743 |
+
},
|
| 7744 |
+
{
|
| 7745 |
+
"epoch": 0.43,
|
| 7746 |
+
"eval_loss": 2.386732578277588,
|
| 7747 |
+
"eval_runtime": 32.423,
|
| 7748 |
+
"eval_samples_per_second": 3.146,
|
| 7749 |
+
"eval_steps_per_second": 1.573,
|
| 7750 |
+
"step": 21500
|
| 7751 |
+
},
|
| 7752 |
+
{
|
| 7753 |
+
"epoch": 0.4305,
|
| 7754 |
+
"grad_norm": 0.5718504697288973,
|
| 7755 |
+
"learning_rate": 6.328000000000001e-06,
|
| 7756 |
+
"loss": 2.3768,
|
| 7757 |
+
"step": 21525
|
| 7758 |
+
},
|
| 7759 |
+
{
|
| 7760 |
+
"epoch": 0.431,
|
| 7761 |
+
"grad_norm": 0.5647383482527034,
|
| 7762 |
+
"learning_rate": 6.3224444444444455e-06,
|
| 7763 |
+
"loss": 2.3634,
|
| 7764 |
+
"step": 21550
|
| 7765 |
+
},
|
| 7766 |
+
{
|
| 7767 |
+
"epoch": 0.4315,
|
| 7768 |
+
"grad_norm": 0.5740444089490578,
|
| 7769 |
+
"learning_rate": 6.316888888888889e-06,
|
| 7770 |
+
"loss": 2.3683,
|
| 7771 |
+
"step": 21575
|
| 7772 |
+
},
|
| 7773 |
+
{
|
| 7774 |
+
"epoch": 0.432,
|
| 7775 |
+
"grad_norm": 0.5468815860778439,
|
| 7776 |
+
"learning_rate": 6.3113333333333334e-06,
|
| 7777 |
+
"loss": 2.3775,
|
| 7778 |
+
"step": 21600
|
| 7779 |
+
},
|
| 7780 |
+
{
|
| 7781 |
+
"epoch": 0.432,
|
| 7782 |
+
"eval_loss": 2.386624813079834,
|
| 7783 |
+
"eval_runtime": 32.2361,
|
| 7784 |
+
"eval_samples_per_second": 3.164,
|
| 7785 |
+
"eval_steps_per_second": 1.582,
|
| 7786 |
+
"step": 21600
|
| 7787 |
+
},
|
| 7788 |
+
{
|
| 7789 |
+
"epoch": 0.4325,
|
| 7790 |
+
"grad_norm": 0.5491782166979611,
|
| 7791 |
+
"learning_rate": 6.305777777777779e-06,
|
| 7792 |
+
"loss": 2.3678,
|
| 7793 |
+
"step": 21625
|
| 7794 |
+
},
|
| 7795 |
+
{
|
| 7796 |
+
"epoch": 0.433,
|
| 7797 |
+
"grad_norm": 0.5493956319744467,
|
| 7798 |
+
"learning_rate": 6.300222222222223e-06,
|
| 7799 |
+
"loss": 2.3632,
|
| 7800 |
+
"step": 21650
|
| 7801 |
+
},
|
| 7802 |
+
{
|
| 7803 |
+
"epoch": 0.4335,
|
| 7804 |
+
"grad_norm": 0.5517199994093782,
|
| 7805 |
+
"learning_rate": 6.294666666666667e-06,
|
| 7806 |
+
"loss": 2.3719,
|
| 7807 |
+
"step": 21675
|
| 7808 |
+
},
|
| 7809 |
+
{
|
| 7810 |
+
"epoch": 0.434,
|
| 7811 |
+
"grad_norm": 0.5480082798934808,
|
| 7812 |
+
"learning_rate": 6.289111111111111e-06,
|
| 7813 |
+
"loss": 2.3705,
|
| 7814 |
+
"step": 21700
|
| 7815 |
+
},
|
| 7816 |
+
{
|
| 7817 |
+
"epoch": 0.434,
|
| 7818 |
+
"eval_loss": 2.386605978012085,
|
| 7819 |
+
"eval_runtime": 31.811,
|
| 7820 |
+
"eval_samples_per_second": 3.206,
|
| 7821 |
+
"eval_steps_per_second": 1.603,
|
| 7822 |
+
"step": 21700
|
| 7823 |
+
},
|
| 7824 |
+
{
|
| 7825 |
+
"epoch": 0.4345,
|
| 7826 |
+
"grad_norm": 0.5988374708555845,
|
| 7827 |
+
"learning_rate": 6.283555555555556e-06,
|
| 7828 |
+
"loss": 2.3736,
|
| 7829 |
+
"step": 21725
|
| 7830 |
+
},
|
| 7831 |
+
{
|
| 7832 |
+
"epoch": 0.435,
|
| 7833 |
+
"grad_norm": 0.5394989364015422,
|
| 7834 |
+
"learning_rate": 6.278000000000001e-06,
|
| 7835 |
+
"loss": 2.38,
|
| 7836 |
+
"step": 21750
|
| 7837 |
+
},
|
| 7838 |
+
{
|
| 7839 |
+
"epoch": 0.4355,
|
| 7840 |
+
"grad_norm": 0.5660475248416822,
|
| 7841 |
+
"learning_rate": 6.272444444444445e-06,
|
| 7842 |
+
"loss": 2.3712,
|
| 7843 |
+
"step": 21775
|
| 7844 |
+
},
|
| 7845 |
+
{
|
| 7846 |
+
"epoch": 0.436,
|
| 7847 |
+
"grad_norm": 0.5824076374736812,
|
| 7848 |
+
"learning_rate": 6.266888888888889e-06,
|
| 7849 |
+
"loss": 2.3781,
|
| 7850 |
+
"step": 21800
|
| 7851 |
+
},
|
| 7852 |
+
{
|
| 7853 |
+
"epoch": 0.436,
|
| 7854 |
+
"eval_loss": 2.3868014812469482,
|
| 7855 |
+
"eval_runtime": 32.0011,
|
| 7856 |
+
"eval_samples_per_second": 3.187,
|
| 7857 |
+
"eval_steps_per_second": 1.594,
|
| 7858 |
+
"step": 21800
|
| 7859 |
+
},
|
| 7860 |
+
{
|
| 7861 |
+
"epoch": 0.4365,
|
| 7862 |
+
"grad_norm": 0.5604649354431509,
|
| 7863 |
+
"learning_rate": 6.261333333333334e-06,
|
| 7864 |
+
"loss": 2.3673,
|
| 7865 |
+
"step": 21825
|
| 7866 |
+
},
|
| 7867 |
+
{
|
| 7868 |
+
"epoch": 0.437,
|
| 7869 |
+
"grad_norm": 0.5581917280058185,
|
| 7870 |
+
"learning_rate": 6.255777777777778e-06,
|
| 7871 |
+
"loss": 2.3575,
|
| 7872 |
+
"step": 21850
|
| 7873 |
+
},
|
| 7874 |
+
{
|
| 7875 |
+
"epoch": 0.4375,
|
| 7876 |
+
"grad_norm": 0.5682187519985219,
|
| 7877 |
+
"learning_rate": 6.250222222222223e-06,
|
| 7878 |
+
"loss": 2.3752,
|
| 7879 |
+
"step": 21875
|
| 7880 |
+
},
|
| 7881 |
+
{
|
| 7882 |
+
"epoch": 0.438,
|
| 7883 |
+
"grad_norm": 0.5343819916754123,
|
| 7884 |
+
"learning_rate": 6.244666666666666e-06,
|
| 7885 |
+
"loss": 2.3688,
|
| 7886 |
+
"step": 21900
|
| 7887 |
+
},
|
| 7888 |
+
{
|
| 7889 |
+
"epoch": 0.438,
|
| 7890 |
+
"eval_loss": 2.3865694999694824,
|
| 7891 |
+
"eval_runtime": 31.8681,
|
| 7892 |
+
"eval_samples_per_second": 3.201,
|
| 7893 |
+
"eval_steps_per_second": 1.6,
|
| 7894 |
+
"step": 21900
|
| 7895 |
+
},
|
| 7896 |
+
{
|
| 7897 |
+
"epoch": 0.4385,
|
| 7898 |
+
"grad_norm": 0.6084740129821103,
|
| 7899 |
+
"learning_rate": 6.2391111111111115e-06,
|
| 7900 |
+
"loss": 2.3611,
|
| 7901 |
+
"step": 21925
|
| 7902 |
+
},
|
| 7903 |
+
{
|
| 7904 |
+
"epoch": 0.439,
|
| 7905 |
+
"grad_norm": 0.5550908983577711,
|
| 7906 |
+
"learning_rate": 6.233555555555556e-06,
|
| 7907 |
+
"loss": 2.364,
|
| 7908 |
+
"step": 21950
|
| 7909 |
+
},
|
| 7910 |
+
{
|
| 7911 |
+
"epoch": 0.4395,
|
| 7912 |
+
"grad_norm": 0.5605896822575689,
|
| 7913 |
+
"learning_rate": 6.228e-06,
|
| 7914 |
+
"loss": 2.3875,
|
| 7915 |
+
"step": 21975
|
| 7916 |
+
},
|
| 7917 |
+
{
|
| 7918 |
+
"epoch": 0.44,
|
| 7919 |
+
"grad_norm": 0.5679795530728957,
|
| 7920 |
+
"learning_rate": 6.222444444444446e-06,
|
| 7921 |
+
"loss": 2.3637,
|
| 7922 |
+
"step": 22000
|
| 7923 |
+
},
|
| 7924 |
+
{
|
| 7925 |
+
"epoch": 0.44,
|
| 7926 |
+
"eval_loss": 2.3865110874176025,
|
| 7927 |
+
"eval_runtime": 31.8116,
|
| 7928 |
+
"eval_samples_per_second": 3.206,
|
| 7929 |
+
"eval_steps_per_second": 1.603,
|
| 7930 |
+
"step": 22000
|
| 7931 |
}
|
| 7932 |
],
|
| 7933 |
"logging_steps": 25,
|
|
|
|
| 7947 |
"attributes": {}
|
| 7948 |
}
|
| 7949 |
},
|
| 7950 |
+
"total_flos": 7.0030450563198484e+19,
|
| 7951 |
"train_batch_size": 1,
|
| 7952 |
"trial_name": null,
|
| 7953 |
"trial_params": null
|