Training checkpoint at step 28000
Browse files- trainer_state.json +366 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9728,6 +9728,366 @@
|
|
| 9728 |
"eval_samples_per_second": 3.212,
|
| 9729 |
"eval_steps_per_second": 1.606,
|
| 9730 |
"step": 27000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9731 |
}
|
| 9732 |
],
|
| 9733 |
"logging_steps": 25,
|
|
@@ -9747,7 +10107,7 @@
|
|
| 9747 |
"attributes": {}
|
| 9748 |
}
|
| 9749 |
},
|
| 9750 |
-
"total_flos": 8.
|
| 9751 |
"train_batch_size": 1,
|
| 9752 |
"trial_name": null,
|
| 9753 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 28000,
|
| 3 |
+
"best_metric": 2.380680799484253,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-28000",
|
| 5 |
+
"epoch": 0.56,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 28000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9728 |
"eval_samples_per_second": 3.212,
|
| 9729 |
"eval_steps_per_second": 1.606,
|
| 9730 |
"step": 27000
|
| 9731 |
+
},
|
| 9732 |
+
{
|
| 9733 |
+
"epoch": 0.5405,
|
| 9734 |
+
"grad_norm": 0.5639894142193489,
|
| 9735 |
+
"learning_rate": 5.105777777777778e-06,
|
| 9736 |
+
"loss": 2.3604,
|
| 9737 |
+
"step": 27025
|
| 9738 |
+
},
|
| 9739 |
+
{
|
| 9740 |
+
"epoch": 0.541,
|
| 9741 |
+
"grad_norm": 0.5650474829629732,
|
| 9742 |
+
"learning_rate": 5.100222222222223e-06,
|
| 9743 |
+
"loss": 2.3615,
|
| 9744 |
+
"step": 27050
|
| 9745 |
+
},
|
| 9746 |
+
{
|
| 9747 |
+
"epoch": 0.5415,
|
| 9748 |
+
"grad_norm": 0.5549449402784257,
|
| 9749 |
+
"learning_rate": 5.094666666666666e-06,
|
| 9750 |
+
"loss": 2.3679,
|
| 9751 |
+
"step": 27075
|
| 9752 |
+
},
|
| 9753 |
+
{
|
| 9754 |
+
"epoch": 0.542,
|
| 9755 |
+
"grad_norm": 0.5615002192664388,
|
| 9756 |
+
"learning_rate": 5.0891111111111115e-06,
|
| 9757 |
+
"loss": 2.3634,
|
| 9758 |
+
"step": 27100
|
| 9759 |
+
},
|
| 9760 |
+
{
|
| 9761 |
+
"epoch": 0.542,
|
| 9762 |
+
"eval_loss": 2.381121873855591,
|
| 9763 |
+
"eval_runtime": 31.7586,
|
| 9764 |
+
"eval_samples_per_second": 3.212,
|
| 9765 |
+
"eval_steps_per_second": 1.606,
|
| 9766 |
+
"step": 27100
|
| 9767 |
+
},
|
| 9768 |
+
{
|
| 9769 |
+
"epoch": 0.5425,
|
| 9770 |
+
"grad_norm": 0.5403095468370492,
|
| 9771 |
+
"learning_rate": 5.083555555555556e-06,
|
| 9772 |
+
"loss": 2.3665,
|
| 9773 |
+
"step": 27125
|
| 9774 |
+
},
|
| 9775 |
+
{
|
| 9776 |
+
"epoch": 0.543,
|
| 9777 |
+
"grad_norm": 0.5421716749680758,
|
| 9778 |
+
"learning_rate": 5.078e-06,
|
| 9779 |
+
"loss": 2.369,
|
| 9780 |
+
"step": 27150
|
| 9781 |
+
},
|
| 9782 |
+
{
|
| 9783 |
+
"epoch": 0.5435,
|
| 9784 |
+
"grad_norm": 0.5590064616229682,
|
| 9785 |
+
"learning_rate": 5.072444444444446e-06,
|
| 9786 |
+
"loss": 2.3594,
|
| 9787 |
+
"step": 27175
|
| 9788 |
+
},
|
| 9789 |
+
{
|
| 9790 |
+
"epoch": 0.544,
|
| 9791 |
+
"grad_norm": 0.5444799207706167,
|
| 9792 |
+
"learning_rate": 5.066888888888889e-06,
|
| 9793 |
+
"loss": 2.3582,
|
| 9794 |
+
"step": 27200
|
| 9795 |
+
},
|
| 9796 |
+
{
|
| 9797 |
+
"epoch": 0.544,
|
| 9798 |
+
"eval_loss": 2.3811404705047607,
|
| 9799 |
+
"eval_runtime": 31.8368,
|
| 9800 |
+
"eval_samples_per_second": 3.204,
|
| 9801 |
+
"eval_steps_per_second": 1.602,
|
| 9802 |
+
"step": 27200
|
| 9803 |
+
},
|
| 9804 |
+
{
|
| 9805 |
+
"epoch": 0.5445,
|
| 9806 |
+
"grad_norm": 0.5694522608963828,
|
| 9807 |
+
"learning_rate": 5.0613333333333336e-06,
|
| 9808 |
+
"loss": 2.3651,
|
| 9809 |
+
"step": 27225
|
| 9810 |
+
},
|
| 9811 |
+
{
|
| 9812 |
+
"epoch": 0.545,
|
| 9813 |
+
"grad_norm": 0.5357232316900923,
|
| 9814 |
+
"learning_rate": 5.055777777777778e-06,
|
| 9815 |
+
"loss": 2.3595,
|
| 9816 |
+
"step": 27250
|
| 9817 |
+
},
|
| 9818 |
+
{
|
| 9819 |
+
"epoch": 0.5455,
|
| 9820 |
+
"grad_norm": 0.5449200504756736,
|
| 9821 |
+
"learning_rate": 5.050222222222223e-06,
|
| 9822 |
+
"loss": 2.3563,
|
| 9823 |
+
"step": 27275
|
| 9824 |
+
},
|
| 9825 |
+
{
|
| 9826 |
+
"epoch": 0.546,
|
| 9827 |
+
"grad_norm": 0.5669179572699722,
|
| 9828 |
+
"learning_rate": 5.044666666666667e-06,
|
| 9829 |
+
"loss": 2.3705,
|
| 9830 |
+
"step": 27300
|
| 9831 |
+
},
|
| 9832 |
+
{
|
| 9833 |
+
"epoch": 0.546,
|
| 9834 |
+
"eval_loss": 2.3810057640075684,
|
| 9835 |
+
"eval_runtime": 31.7869,
|
| 9836 |
+
"eval_samples_per_second": 3.209,
|
| 9837 |
+
"eval_steps_per_second": 1.604,
|
| 9838 |
+
"step": 27300
|
| 9839 |
+
},
|
| 9840 |
+
{
|
| 9841 |
+
"epoch": 0.5465,
|
| 9842 |
+
"grad_norm": 0.5536644347581473,
|
| 9843 |
+
"learning_rate": 5.039111111111111e-06,
|
| 9844 |
+
"loss": 2.3658,
|
| 9845 |
+
"step": 27325
|
| 9846 |
+
},
|
| 9847 |
+
{
|
| 9848 |
+
"epoch": 0.547,
|
| 9849 |
+
"grad_norm": 0.5774297317851765,
|
| 9850 |
+
"learning_rate": 5.0335555555555556e-06,
|
| 9851 |
+
"loss": 2.3553,
|
| 9852 |
+
"step": 27350
|
| 9853 |
+
},
|
| 9854 |
+
{
|
| 9855 |
+
"epoch": 0.5475,
|
| 9856 |
+
"grad_norm": 0.567395549600367,
|
| 9857 |
+
"learning_rate": 5.028000000000001e-06,
|
| 9858 |
+
"loss": 2.3694,
|
| 9859 |
+
"step": 27375
|
| 9860 |
+
},
|
| 9861 |
+
{
|
| 9862 |
+
"epoch": 0.548,
|
| 9863 |
+
"grad_norm": 0.5501789999743681,
|
| 9864 |
+
"learning_rate": 5.022444444444445e-06,
|
| 9865 |
+
"loss": 2.3643,
|
| 9866 |
+
"step": 27400
|
| 9867 |
+
},
|
| 9868 |
+
{
|
| 9869 |
+
"epoch": 0.548,
|
| 9870 |
+
"eval_loss": 2.3811025619506836,
|
| 9871 |
+
"eval_runtime": 31.9197,
|
| 9872 |
+
"eval_samples_per_second": 3.196,
|
| 9873 |
+
"eval_steps_per_second": 1.598,
|
| 9874 |
+
"step": 27400
|
| 9875 |
+
},
|
| 9876 |
+
{
|
| 9877 |
+
"epoch": 0.5485,
|
| 9878 |
+
"grad_norm": 0.5719215133111718,
|
| 9879 |
+
"learning_rate": 5.016888888888889e-06,
|
| 9880 |
+
"loss": 2.365,
|
| 9881 |
+
"step": 27425
|
| 9882 |
+
},
|
| 9883 |
+
{
|
| 9884 |
+
"epoch": 0.549,
|
| 9885 |
+
"grad_norm": 0.5899241097551456,
|
| 9886 |
+
"learning_rate": 5.011333333333333e-06,
|
| 9887 |
+
"loss": 2.3774,
|
| 9888 |
+
"step": 27450
|
| 9889 |
+
},
|
| 9890 |
+
{
|
| 9891 |
+
"epoch": 0.5495,
|
| 9892 |
+
"grad_norm": 0.5731413292155066,
|
| 9893 |
+
"learning_rate": 5.0057777777777784e-06,
|
| 9894 |
+
"loss": 2.3706,
|
| 9895 |
+
"step": 27475
|
| 9896 |
+
},
|
| 9897 |
+
{
|
| 9898 |
+
"epoch": 0.55,
|
| 9899 |
+
"grad_norm": 0.5425656065958468,
|
| 9900 |
+
"learning_rate": 5.000222222222223e-06,
|
| 9901 |
+
"loss": 2.3566,
|
| 9902 |
+
"step": 27500
|
| 9903 |
+
},
|
| 9904 |
+
{
|
| 9905 |
+
"epoch": 0.55,
|
| 9906 |
+
"eval_loss": 2.380763292312622,
|
| 9907 |
+
"eval_runtime": 31.8162,
|
| 9908 |
+
"eval_samples_per_second": 3.206,
|
| 9909 |
+
"eval_steps_per_second": 1.603,
|
| 9910 |
+
"step": 27500
|
| 9911 |
+
},
|
| 9912 |
+
{
|
| 9913 |
+
"epoch": 0.5505,
|
| 9914 |
+
"grad_norm": 0.5601626399029922,
|
| 9915 |
+
"learning_rate": 4.994666666666667e-06,
|
| 9916 |
+
"loss": 2.3762,
|
| 9917 |
+
"step": 27525
|
| 9918 |
+
},
|
| 9919 |
+
{
|
| 9920 |
+
"epoch": 0.551,
|
| 9921 |
+
"grad_norm": 0.5715204135637444,
|
| 9922 |
+
"learning_rate": 4.989111111111112e-06,
|
| 9923 |
+
"loss": 2.363,
|
| 9924 |
+
"step": 27550
|
| 9925 |
+
},
|
| 9926 |
+
{
|
| 9927 |
+
"epoch": 0.5515,
|
| 9928 |
+
"grad_norm": 0.547533853702179,
|
| 9929 |
+
"learning_rate": 4.983555555555556e-06,
|
| 9930 |
+
"loss": 2.3659,
|
| 9931 |
+
"step": 27575
|
| 9932 |
+
},
|
| 9933 |
+
{
|
| 9934 |
+
"epoch": 0.552,
|
| 9935 |
+
"grad_norm": 0.5817399132816639,
|
| 9936 |
+
"learning_rate": 4.9780000000000005e-06,
|
| 9937 |
+
"loss": 2.3693,
|
| 9938 |
+
"step": 27600
|
| 9939 |
+
},
|
| 9940 |
+
{
|
| 9941 |
+
"epoch": 0.552,
|
| 9942 |
+
"eval_loss": 2.3807787895202637,
|
| 9943 |
+
"eval_runtime": 31.8396,
|
| 9944 |
+
"eval_samples_per_second": 3.204,
|
| 9945 |
+
"eval_steps_per_second": 1.602,
|
| 9946 |
+
"step": 27600
|
| 9947 |
+
},
|
| 9948 |
+
{
|
| 9949 |
+
"epoch": 0.5525,
|
| 9950 |
+
"grad_norm": 0.544660595894246,
|
| 9951 |
+
"learning_rate": 4.972444444444445e-06,
|
| 9952 |
+
"loss": 2.3661,
|
| 9953 |
+
"step": 27625
|
| 9954 |
+
},
|
| 9955 |
+
{
|
| 9956 |
+
"epoch": 0.553,
|
| 9957 |
+
"grad_norm": 0.5813863819688693,
|
| 9958 |
+
"learning_rate": 4.966888888888889e-06,
|
| 9959 |
+
"loss": 2.365,
|
| 9960 |
+
"step": 27650
|
| 9961 |
+
},
|
| 9962 |
+
{
|
| 9963 |
+
"epoch": 0.5535,
|
| 9964 |
+
"grad_norm": 0.555794514365692,
|
| 9965 |
+
"learning_rate": 4.961333333333334e-06,
|
| 9966 |
+
"loss": 2.3724,
|
| 9967 |
+
"step": 27675
|
| 9968 |
+
},
|
| 9969 |
+
{
|
| 9970 |
+
"epoch": 0.554,
|
| 9971 |
+
"grad_norm": 0.5549771654031,
|
| 9972 |
+
"learning_rate": 4.955777777777778e-06,
|
| 9973 |
+
"loss": 2.3712,
|
| 9974 |
+
"step": 27700
|
| 9975 |
+
},
|
| 9976 |
+
{
|
| 9977 |
+
"epoch": 0.554,
|
| 9978 |
+
"eval_loss": 2.380859613418579,
|
| 9979 |
+
"eval_runtime": 32.035,
|
| 9980 |
+
"eval_samples_per_second": 3.184,
|
| 9981 |
+
"eval_steps_per_second": 1.592,
|
| 9982 |
+
"step": 27700
|
| 9983 |
+
},
|
| 9984 |
+
{
|
| 9985 |
+
"epoch": 0.5545,
|
| 9986 |
+
"grad_norm": 0.5660580874490311,
|
| 9987 |
+
"learning_rate": 4.9502222222222225e-06,
|
| 9988 |
+
"loss": 2.3626,
|
| 9989 |
+
"step": 27725
|
| 9990 |
+
},
|
| 9991 |
+
{
|
| 9992 |
+
"epoch": 0.555,
|
| 9993 |
+
"grad_norm": 0.5408935222204184,
|
| 9994 |
+
"learning_rate": 4.944666666666667e-06,
|
| 9995 |
+
"loss": 2.3546,
|
| 9996 |
+
"step": 27750
|
| 9997 |
+
},
|
| 9998 |
+
{
|
| 9999 |
+
"epoch": 0.5555,
|
| 10000 |
+
"grad_norm": 0.5574539497290301,
|
| 10001 |
+
"learning_rate": 4.939111111111112e-06,
|
| 10002 |
+
"loss": 2.3503,
|
| 10003 |
+
"step": 27775
|
| 10004 |
+
},
|
| 10005 |
+
{
|
| 10006 |
+
"epoch": 0.556,
|
| 10007 |
+
"grad_norm": 0.5733587459238179,
|
| 10008 |
+
"learning_rate": 4.933555555555556e-06,
|
| 10009 |
+
"loss": 2.3787,
|
| 10010 |
+
"step": 27800
|
| 10011 |
+
},
|
| 10012 |
+
{
|
| 10013 |
+
"epoch": 0.556,
|
| 10014 |
+
"eval_loss": 2.380819082260132,
|
| 10015 |
+
"eval_runtime": 31.8731,
|
| 10016 |
+
"eval_samples_per_second": 3.2,
|
| 10017 |
+
"eval_steps_per_second": 1.6,
|
| 10018 |
+
"step": 27800
|
| 10019 |
+
},
|
| 10020 |
+
{
|
| 10021 |
+
"epoch": 0.5565,
|
| 10022 |
+
"grad_norm": 0.5469010479471977,
|
| 10023 |
+
"learning_rate": 4.928000000000001e-06,
|
| 10024 |
+
"loss": 2.3728,
|
| 10025 |
+
"step": 27825
|
| 10026 |
+
},
|
| 10027 |
+
{
|
| 10028 |
+
"epoch": 0.557,
|
| 10029 |
+
"grad_norm": 0.5575923461377743,
|
| 10030 |
+
"learning_rate": 4.9224444444444445e-06,
|
| 10031 |
+
"loss": 2.3587,
|
| 10032 |
+
"step": 27850
|
| 10033 |
+
},
|
| 10034 |
+
{
|
| 10035 |
+
"epoch": 0.5575,
|
| 10036 |
+
"grad_norm": 0.5484615569385746,
|
| 10037 |
+
"learning_rate": 4.91688888888889e-06,
|
| 10038 |
+
"loss": 2.3554,
|
| 10039 |
+
"step": 27875
|
| 10040 |
+
},
|
| 10041 |
+
{
|
| 10042 |
+
"epoch": 0.558,
|
| 10043 |
+
"grad_norm": 0.5700580906470195,
|
| 10044 |
+
"learning_rate": 4.911333333333333e-06,
|
| 10045 |
+
"loss": 2.3591,
|
| 10046 |
+
"step": 27900
|
| 10047 |
+
},
|
| 10048 |
+
{
|
| 10049 |
+
"epoch": 0.558,
|
| 10050 |
+
"eval_loss": 2.380748748779297,
|
| 10051 |
+
"eval_runtime": 31.8799,
|
| 10052 |
+
"eval_samples_per_second": 3.2,
|
| 10053 |
+
"eval_steps_per_second": 1.6,
|
| 10054 |
+
"step": 27900
|
| 10055 |
+
},
|
| 10056 |
+
{
|
| 10057 |
+
"epoch": 0.5585,
|
| 10058 |
+
"grad_norm": 0.5644741625244013,
|
| 10059 |
+
"learning_rate": 4.9057777777777785e-06,
|
| 10060 |
+
"loss": 2.3573,
|
| 10061 |
+
"step": 27925
|
| 10062 |
+
},
|
| 10063 |
+
{
|
| 10064 |
+
"epoch": 0.559,
|
| 10065 |
+
"grad_norm": 0.5518750142742082,
|
| 10066 |
+
"learning_rate": 4.900222222222223e-06,
|
| 10067 |
+
"loss": 2.3722,
|
| 10068 |
+
"step": 27950
|
| 10069 |
+
},
|
| 10070 |
+
{
|
| 10071 |
+
"epoch": 0.5595,
|
| 10072 |
+
"grad_norm": 0.5570570164343176,
|
| 10073 |
+
"learning_rate": 4.894666666666667e-06,
|
| 10074 |
+
"loss": 2.3644,
|
| 10075 |
+
"step": 27975
|
| 10076 |
+
},
|
| 10077 |
+
{
|
| 10078 |
+
"epoch": 0.56,
|
| 10079 |
+
"grad_norm": 0.5454507656456767,
|
| 10080 |
+
"learning_rate": 4.889111111111112e-06,
|
| 10081 |
+
"loss": 2.3545,
|
| 10082 |
+
"step": 28000
|
| 10083 |
+
},
|
| 10084 |
+
{
|
| 10085 |
+
"epoch": 0.56,
|
| 10086 |
+
"eval_loss": 2.380680799484253,
|
| 10087 |
+
"eval_runtime": 31.8506,
|
| 10088 |
+
"eval_samples_per_second": 3.202,
|
| 10089 |
+
"eval_steps_per_second": 1.601,
|
| 10090 |
+
"step": 28000
|
| 10091 |
}
|
| 10092 |
],
|
| 10093 |
"logging_steps": 25,
|
|
|
|
| 10107 |
"attributes": {}
|
| 10108 |
}
|
| 10109 |
},
|
| 10110 |
+
"total_flos": 8.91296643531617e+19,
|
| 10111 |
"train_batch_size": 1,
|
| 10112 |
"trial_name": null,
|
| 10113 |
"trial_params": null
|