Training checkpoint at step 14000
Browse files- trainer_state.json +365 -5
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-13000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4688,6 +4688,366 @@
|
|
| 4688 |
"eval_samples_per_second": 3.207,
|
| 4689 |
"eval_steps_per_second": 1.603,
|
| 4690 |
"step": 13000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4691 |
}
|
| 4692 |
],
|
| 4693 |
"logging_steps": 25,
|
|
@@ -4707,7 +5067,7 @@
|
|
| 4707 |
"attributes": {}
|
| 4708 |
}
|
| 4709 |
},
|
| 4710 |
-
"total_flos": 4.
|
| 4711 |
"train_batch_size": 1,
|
| 4712 |
"trial_name": null,
|
| 4713 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 13900,
|
| 3 |
+
"best_metric": 2.3990118503570557,
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-13000",
|
| 5 |
+
"epoch": 0.28,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 14000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4688 |
"eval_samples_per_second": 3.207,
|
| 4689 |
"eval_steps_per_second": 1.603,
|
| 4690 |
"step": 13000
|
| 4691 |
+
},
|
| 4692 |
+
{
|
| 4693 |
+
"epoch": 0.2605,
|
| 4694 |
+
"grad_norm": 0.5758550911461594,
|
| 4695 |
+
"learning_rate": 8.21688888888889e-06,
|
| 4696 |
+
"loss": 2.39,
|
| 4697 |
+
"step": 13025
|
| 4698 |
+
},
|
| 4699 |
+
{
|
| 4700 |
+
"epoch": 0.261,
|
| 4701 |
+
"grad_norm": 0.5506335078390368,
|
| 4702 |
+
"learning_rate": 8.211333333333334e-06,
|
| 4703 |
+
"loss": 2.3879,
|
| 4704 |
+
"step": 13050
|
| 4705 |
+
},
|
| 4706 |
+
{
|
| 4707 |
+
"epoch": 0.2615,
|
| 4708 |
+
"grad_norm": 0.578047700560021,
|
| 4709 |
+
"learning_rate": 8.205777777777777e-06,
|
| 4710 |
+
"loss": 2.3772,
|
| 4711 |
+
"step": 13075
|
| 4712 |
+
},
|
| 4713 |
+
{
|
| 4714 |
+
"epoch": 0.262,
|
| 4715 |
+
"grad_norm": 0.5517825098879646,
|
| 4716 |
+
"learning_rate": 8.200222222222223e-06,
|
| 4717 |
+
"loss": 2.3751,
|
| 4718 |
+
"step": 13100
|
| 4719 |
+
},
|
| 4720 |
+
{
|
| 4721 |
+
"epoch": 0.262,
|
| 4722 |
+
"eval_loss": 2.4008378982543945,
|
| 4723 |
+
"eval_runtime": 31.8219,
|
| 4724 |
+
"eval_samples_per_second": 3.205,
|
| 4725 |
+
"eval_steps_per_second": 1.603,
|
| 4726 |
+
"step": 13100
|
| 4727 |
+
},
|
| 4728 |
+
{
|
| 4729 |
+
"epoch": 0.2625,
|
| 4730 |
+
"grad_norm": 0.6060142395322289,
|
| 4731 |
+
"learning_rate": 8.194666666666668e-06,
|
| 4732 |
+
"loss": 2.3859,
|
| 4733 |
+
"step": 13125
|
| 4734 |
+
},
|
| 4735 |
+
{
|
| 4736 |
+
"epoch": 0.263,
|
| 4737 |
+
"grad_norm": 0.6151379264003006,
|
| 4738 |
+
"learning_rate": 8.189111111111111e-06,
|
| 4739 |
+
"loss": 2.3906,
|
| 4740 |
+
"step": 13150
|
| 4741 |
+
},
|
| 4742 |
+
{
|
| 4743 |
+
"epoch": 0.2635,
|
| 4744 |
+
"grad_norm": 0.5889091981712471,
|
| 4745 |
+
"learning_rate": 8.183555555555555e-06,
|
| 4746 |
+
"loss": 2.3813,
|
| 4747 |
+
"step": 13175
|
| 4748 |
+
},
|
| 4749 |
+
{
|
| 4750 |
+
"epoch": 0.264,
|
| 4751 |
+
"grad_norm": 0.7021686085407579,
|
| 4752 |
+
"learning_rate": 8.178e-06,
|
| 4753 |
+
"loss": 2.3844,
|
| 4754 |
+
"step": 13200
|
| 4755 |
+
},
|
| 4756 |
+
{
|
| 4757 |
+
"epoch": 0.264,
|
| 4758 |
+
"eval_loss": 2.400826930999756,
|
| 4759 |
+
"eval_runtime": 31.7255,
|
| 4760 |
+
"eval_samples_per_second": 3.215,
|
| 4761 |
+
"eval_steps_per_second": 1.608,
|
| 4762 |
+
"step": 13200
|
| 4763 |
+
},
|
| 4764 |
+
{
|
| 4765 |
+
"epoch": 0.2645,
|
| 4766 |
+
"grad_norm": 0.5738899506070113,
|
| 4767 |
+
"learning_rate": 8.172444444444446e-06,
|
| 4768 |
+
"loss": 2.3974,
|
| 4769 |
+
"step": 13225
|
| 4770 |
+
},
|
| 4771 |
+
{
|
| 4772 |
+
"epoch": 0.265,
|
| 4773 |
+
"grad_norm": 0.618543215020873,
|
| 4774 |
+
"learning_rate": 8.166888888888889e-06,
|
| 4775 |
+
"loss": 2.3846,
|
| 4776 |
+
"step": 13250
|
| 4777 |
+
},
|
| 4778 |
+
{
|
| 4779 |
+
"epoch": 0.2655,
|
| 4780 |
+
"grad_norm": 0.5529480549821216,
|
| 4781 |
+
"learning_rate": 8.161333333333334e-06,
|
| 4782 |
+
"loss": 2.3816,
|
| 4783 |
+
"step": 13275
|
| 4784 |
+
},
|
| 4785 |
+
{
|
| 4786 |
+
"epoch": 0.266,
|
| 4787 |
+
"grad_norm": 0.569904631452621,
|
| 4788 |
+
"learning_rate": 8.155777777777778e-06,
|
| 4789 |
+
"loss": 2.3809,
|
| 4790 |
+
"step": 13300
|
| 4791 |
+
},
|
| 4792 |
+
{
|
| 4793 |
+
"epoch": 0.266,
|
| 4794 |
+
"eval_loss": 2.4002933502197266,
|
| 4795 |
+
"eval_runtime": 31.6983,
|
| 4796 |
+
"eval_samples_per_second": 3.218,
|
| 4797 |
+
"eval_steps_per_second": 1.609,
|
| 4798 |
+
"step": 13300
|
| 4799 |
+
},
|
| 4800 |
+
{
|
| 4801 |
+
"epoch": 0.2665,
|
| 4802 |
+
"grad_norm": 0.5743878084278218,
|
| 4803 |
+
"learning_rate": 8.150222222222223e-06,
|
| 4804 |
+
"loss": 2.3941,
|
| 4805 |
+
"step": 13325
|
| 4806 |
+
},
|
| 4807 |
+
{
|
| 4808 |
+
"epoch": 0.267,
|
| 4809 |
+
"grad_norm": 0.5594243149898632,
|
| 4810 |
+
"learning_rate": 8.144666666666667e-06,
|
| 4811 |
+
"loss": 2.3878,
|
| 4812 |
+
"step": 13350
|
| 4813 |
+
},
|
| 4814 |
+
{
|
| 4815 |
+
"epoch": 0.2675,
|
| 4816 |
+
"grad_norm": 0.5810666087448406,
|
| 4817 |
+
"learning_rate": 8.139111111111112e-06,
|
| 4818 |
+
"loss": 2.381,
|
| 4819 |
+
"step": 13375
|
| 4820 |
+
},
|
| 4821 |
+
{
|
| 4822 |
+
"epoch": 0.268,
|
| 4823 |
+
"grad_norm": 0.5595852108101106,
|
| 4824 |
+
"learning_rate": 8.133555555555557e-06,
|
| 4825 |
+
"loss": 2.3792,
|
| 4826 |
+
"step": 13400
|
| 4827 |
+
},
|
| 4828 |
+
{
|
| 4829 |
+
"epoch": 0.268,
|
| 4830 |
+
"eval_loss": 2.400261878967285,
|
| 4831 |
+
"eval_runtime": 31.6975,
|
| 4832 |
+
"eval_samples_per_second": 3.218,
|
| 4833 |
+
"eval_steps_per_second": 1.609,
|
| 4834 |
+
"step": 13400
|
| 4835 |
+
},
|
| 4836 |
+
{
|
| 4837 |
+
"epoch": 0.2685,
|
| 4838 |
+
"grad_norm": 0.5789530002361615,
|
| 4839 |
+
"learning_rate": 8.128e-06,
|
| 4840 |
+
"loss": 2.3759,
|
| 4841 |
+
"step": 13425
|
| 4842 |
+
},
|
| 4843 |
+
{
|
| 4844 |
+
"epoch": 0.269,
|
| 4845 |
+
"grad_norm": 0.5662301407639397,
|
| 4846 |
+
"learning_rate": 8.122444444444444e-06,
|
| 4847 |
+
"loss": 2.3791,
|
| 4848 |
+
"step": 13450
|
| 4849 |
+
},
|
| 4850 |
+
{
|
| 4851 |
+
"epoch": 0.2695,
|
| 4852 |
+
"grad_norm": 0.6131145841315326,
|
| 4853 |
+
"learning_rate": 8.11688888888889e-06,
|
| 4854 |
+
"loss": 2.3833,
|
| 4855 |
+
"step": 13475
|
| 4856 |
+
},
|
| 4857 |
+
{
|
| 4858 |
+
"epoch": 0.27,
|
| 4859 |
+
"grad_norm": 0.5607318024001929,
|
| 4860 |
+
"learning_rate": 8.111333333333335e-06,
|
| 4861 |
+
"loss": 2.3724,
|
| 4862 |
+
"step": 13500
|
| 4863 |
+
},
|
| 4864 |
+
{
|
| 4865 |
+
"epoch": 0.27,
|
| 4866 |
+
"eval_loss": 2.4000020027160645,
|
| 4867 |
+
"eval_runtime": 31.71,
|
| 4868 |
+
"eval_samples_per_second": 3.217,
|
| 4869 |
+
"eval_steps_per_second": 1.608,
|
| 4870 |
+
"step": 13500
|
| 4871 |
+
},
|
| 4872 |
+
{
|
| 4873 |
+
"epoch": 0.2705,
|
| 4874 |
+
"grad_norm": 0.5692755244185855,
|
| 4875 |
+
"learning_rate": 8.105777777777778e-06,
|
| 4876 |
+
"loss": 2.3788,
|
| 4877 |
+
"step": 13525
|
| 4878 |
+
},
|
| 4879 |
+
{
|
| 4880 |
+
"epoch": 0.271,
|
| 4881 |
+
"grad_norm": 0.5647342769538716,
|
| 4882 |
+
"learning_rate": 8.100222222222222e-06,
|
| 4883 |
+
"loss": 2.3799,
|
| 4884 |
+
"step": 13550
|
| 4885 |
+
},
|
| 4886 |
+
{
|
| 4887 |
+
"epoch": 0.2715,
|
| 4888 |
+
"grad_norm": 0.5976773519089553,
|
| 4889 |
+
"learning_rate": 8.094666666666667e-06,
|
| 4890 |
+
"loss": 2.3828,
|
| 4891 |
+
"step": 13575
|
| 4892 |
+
},
|
| 4893 |
+
{
|
| 4894 |
+
"epoch": 0.272,
|
| 4895 |
+
"grad_norm": 0.5642506953063758,
|
| 4896 |
+
"learning_rate": 8.089111111111112e-06,
|
| 4897 |
+
"loss": 2.3835,
|
| 4898 |
+
"step": 13600
|
| 4899 |
+
},
|
| 4900 |
+
{
|
| 4901 |
+
"epoch": 0.272,
|
| 4902 |
+
"eval_loss": 2.400066614151001,
|
| 4903 |
+
"eval_runtime": 31.8128,
|
| 4904 |
+
"eval_samples_per_second": 3.206,
|
| 4905 |
+
"eval_steps_per_second": 1.603,
|
| 4906 |
+
"step": 13600
|
| 4907 |
+
},
|
| 4908 |
+
{
|
| 4909 |
+
"epoch": 0.2725,
|
| 4910 |
+
"grad_norm": 0.5616659241704035,
|
| 4911 |
+
"learning_rate": 8.083555555555556e-06,
|
| 4912 |
+
"loss": 2.3801,
|
| 4913 |
+
"step": 13625
|
| 4914 |
+
},
|
| 4915 |
+
{
|
| 4916 |
+
"epoch": 0.273,
|
| 4917 |
+
"grad_norm": 0.5878315825498157,
|
| 4918 |
+
"learning_rate": 8.078e-06,
|
| 4919 |
+
"loss": 2.3781,
|
| 4920 |
+
"step": 13650
|
| 4921 |
+
},
|
| 4922 |
+
{
|
| 4923 |
+
"epoch": 0.2735,
|
| 4924 |
+
"grad_norm": 0.5716337786191225,
|
| 4925 |
+
"learning_rate": 8.072444444444445e-06,
|
| 4926 |
+
"loss": 2.3932,
|
| 4927 |
+
"step": 13675
|
| 4928 |
+
},
|
| 4929 |
+
{
|
| 4930 |
+
"epoch": 0.274,
|
| 4931 |
+
"grad_norm": 0.5636757577555458,
|
| 4932 |
+
"learning_rate": 8.06688888888889e-06,
|
| 4933 |
+
"loss": 2.4041,
|
| 4934 |
+
"step": 13700
|
| 4935 |
+
},
|
| 4936 |
+
{
|
| 4937 |
+
"epoch": 0.274,
|
| 4938 |
+
"eval_loss": 2.3997650146484375,
|
| 4939 |
+
"eval_runtime": 31.4871,
|
| 4940 |
+
"eval_samples_per_second": 3.239,
|
| 4941 |
+
"eval_steps_per_second": 1.62,
|
| 4942 |
+
"step": 13700
|
| 4943 |
+
},
|
| 4944 |
+
{
|
| 4945 |
+
"epoch": 0.2745,
|
| 4946 |
+
"grad_norm": 0.5564992808480433,
|
| 4947 |
+
"learning_rate": 8.061333333333334e-06,
|
| 4948 |
+
"loss": 2.3971,
|
| 4949 |
+
"step": 13725
|
| 4950 |
+
},
|
| 4951 |
+
{
|
| 4952 |
+
"epoch": 0.275,
|
| 4953 |
+
"grad_norm": 0.5736246457745038,
|
| 4954 |
+
"learning_rate": 8.055777777777777e-06,
|
| 4955 |
+
"loss": 2.3847,
|
| 4956 |
+
"step": 13750
|
| 4957 |
+
},
|
| 4958 |
+
{
|
| 4959 |
+
"epoch": 0.2755,
|
| 4960 |
+
"grad_norm": 0.5423430973262378,
|
| 4961 |
+
"learning_rate": 8.050222222222222e-06,
|
| 4962 |
+
"loss": 2.3786,
|
| 4963 |
+
"step": 13775
|
| 4964 |
+
},
|
| 4965 |
+
{
|
| 4966 |
+
"epoch": 0.276,
|
| 4967 |
+
"grad_norm": 0.5672815850751382,
|
| 4968 |
+
"learning_rate": 8.044666666666668e-06,
|
| 4969 |
+
"loss": 2.3945,
|
| 4970 |
+
"step": 13800
|
| 4971 |
+
},
|
| 4972 |
+
{
|
| 4973 |
+
"epoch": 0.276,
|
| 4974 |
+
"eval_loss": 2.399338483810425,
|
| 4975 |
+
"eval_runtime": 31.3741,
|
| 4976 |
+
"eval_samples_per_second": 3.251,
|
| 4977 |
+
"eval_steps_per_second": 1.626,
|
| 4978 |
+
"step": 13800
|
| 4979 |
+
},
|
| 4980 |
+
{
|
| 4981 |
+
"epoch": 0.2765,
|
| 4982 |
+
"grad_norm": 0.5919813611615313,
|
| 4983 |
+
"learning_rate": 8.039111111111111e-06,
|
| 4984 |
+
"loss": 2.3738,
|
| 4985 |
+
"step": 13825
|
| 4986 |
+
},
|
| 4987 |
+
{
|
| 4988 |
+
"epoch": 0.277,
|
| 4989 |
+
"grad_norm": 0.5679311638374708,
|
| 4990 |
+
"learning_rate": 8.033555555555556e-06,
|
| 4991 |
+
"loss": 2.3771,
|
| 4992 |
+
"step": 13850
|
| 4993 |
+
},
|
| 4994 |
+
{
|
| 4995 |
+
"epoch": 0.2775,
|
| 4996 |
+
"grad_norm": 0.5533203763453908,
|
| 4997 |
+
"learning_rate": 8.028e-06,
|
| 4998 |
+
"loss": 2.3831,
|
| 4999 |
+
"step": 13875
|
| 5000 |
+
},
|
| 5001 |
+
{
|
| 5002 |
+
"epoch": 0.278,
|
| 5003 |
+
"grad_norm": 0.5674818164725537,
|
| 5004 |
+
"learning_rate": 8.022444444444445e-06,
|
| 5005 |
+
"loss": 2.3811,
|
| 5006 |
+
"step": 13900
|
| 5007 |
+
},
|
| 5008 |
+
{
|
| 5009 |
+
"epoch": 0.278,
|
| 5010 |
+
"eval_loss": 2.3990118503570557,
|
| 5011 |
+
"eval_runtime": 31.47,
|
| 5012 |
+
"eval_samples_per_second": 3.241,
|
| 5013 |
+
"eval_steps_per_second": 1.621,
|
| 5014 |
+
"step": 13900
|
| 5015 |
+
},
|
| 5016 |
+
{
|
| 5017 |
+
"epoch": 0.2785,
|
| 5018 |
+
"grad_norm": 0.5664699981127816,
|
| 5019 |
+
"learning_rate": 8.016888888888889e-06,
|
| 5020 |
+
"loss": 2.3848,
|
| 5021 |
+
"step": 13925
|
| 5022 |
+
},
|
| 5023 |
+
{
|
| 5024 |
+
"epoch": 0.279,
|
| 5025 |
+
"grad_norm": 0.6085875103795902,
|
| 5026 |
+
"learning_rate": 8.011333333333334e-06,
|
| 5027 |
+
"loss": 2.3822,
|
| 5028 |
+
"step": 13950
|
| 5029 |
+
},
|
| 5030 |
+
{
|
| 5031 |
+
"epoch": 0.2795,
|
| 5032 |
+
"grad_norm": 0.561160479481643,
|
| 5033 |
+
"learning_rate": 8.00577777777778e-06,
|
| 5034 |
+
"loss": 2.3722,
|
| 5035 |
+
"step": 13975
|
| 5036 |
+
},
|
| 5037 |
+
{
|
| 5038 |
+
"epoch": 0.28,
|
| 5039 |
+
"grad_norm": 0.566395855978902,
|
| 5040 |
+
"learning_rate": 8.000222222222223e-06,
|
| 5041 |
+
"loss": 2.3922,
|
| 5042 |
+
"step": 14000
|
| 5043 |
+
},
|
| 5044 |
+
{
|
| 5045 |
+
"epoch": 0.28,
|
| 5046 |
+
"eval_loss": 2.3991119861602783,
|
| 5047 |
+
"eval_runtime": 31.6591,
|
| 5048 |
+
"eval_samples_per_second": 3.222,
|
| 5049 |
+
"eval_steps_per_second": 1.611,
|
| 5050 |
+
"step": 14000
|
| 5051 |
}
|
| 5052 |
],
|
| 5053 |
"logging_steps": 25,
|
|
|
|
| 5067 |
"attributes": {}
|
| 5068 |
}
|
| 5069 |
},
|
| 5070 |
+
"total_flos": 4.456483217658085e+19,
|
| 5071 |
"train_batch_size": 1,
|
| 5072 |
"trial_name": null,
|
| 5073 |
"trial_params": null
|