Training checkpoint at step 25000
Browse files- trainer_state.json +366 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8648,6 +8648,366 @@
|
|
| 8648 |
"eval_samples_per_second": 3.216,
|
| 8649 |
"eval_steps_per_second": 1.608,
|
| 8650 |
"step": 24000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8651 |
}
|
| 8652 |
],
|
| 8653 |
"logging_steps": 25,
|
|
@@ -8667,7 +9027,7 @@
|
|
| 8667 |
"attributes": {}
|
| 8668 |
}
|
| 8669 |
},
|
| 8670 |
-
"total_flos": 7.
|
| 8671 |
"train_batch_size": 1,
|
| 8672 |
"trial_name": null,
|
| 8673 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 25000,
|
| 3 |
+
"best_metric": 2.3832170963287354,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
|
| 5 |
+
"epoch": 0.5,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 25000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8648 |
"eval_samples_per_second": 3.216,
|
| 8649 |
"eval_steps_per_second": 1.608,
|
| 8650 |
"step": 24000
|
| 8651 |
+
},
|
| 8652 |
+
{
|
| 8653 |
+
"epoch": 0.4805,
|
| 8654 |
+
"grad_norm": 0.5640470742370431,
|
| 8655 |
+
"learning_rate": 5.772444444444445e-06,
|
| 8656 |
+
"loss": 2.3622,
|
| 8657 |
+
"step": 24025
|
| 8658 |
+
},
|
| 8659 |
+
{
|
| 8660 |
+
"epoch": 0.481,
|
| 8661 |
+
"grad_norm": 0.5463055265939479,
|
| 8662 |
+
"learning_rate": 5.76688888888889e-06,
|
| 8663 |
+
"loss": 2.3609,
|
| 8664 |
+
"step": 24050
|
| 8665 |
+
},
|
| 8666 |
+
{
|
| 8667 |
+
"epoch": 0.4815,
|
| 8668 |
+
"grad_norm": 0.566766243472923,
|
| 8669 |
+
"learning_rate": 5.7613333333333345e-06,
|
| 8670 |
+
"loss": 2.3824,
|
| 8671 |
+
"step": 24075
|
| 8672 |
+
},
|
| 8673 |
+
{
|
| 8674 |
+
"epoch": 0.482,
|
| 8675 |
+
"grad_norm": 0.5584478304684121,
|
| 8676 |
+
"learning_rate": 5.755777777777778e-06,
|
| 8677 |
+
"loss": 2.3744,
|
| 8678 |
+
"step": 24100
|
| 8679 |
+
},
|
| 8680 |
+
{
|
| 8681 |
+
"epoch": 0.482,
|
| 8682 |
+
"eval_loss": 2.384092330932617,
|
| 8683 |
+
"eval_runtime": 31.7835,
|
| 8684 |
+
"eval_samples_per_second": 3.209,
|
| 8685 |
+
"eval_steps_per_second": 1.605,
|
| 8686 |
+
"step": 24100
|
| 8687 |
+
},
|
| 8688 |
+
{
|
| 8689 |
+
"epoch": 0.4825,
|
| 8690 |
+
"grad_norm": 0.5731740442874064,
|
| 8691 |
+
"learning_rate": 5.7502222222222224e-06,
|
| 8692 |
+
"loss": 2.3733,
|
| 8693 |
+
"step": 24125
|
| 8694 |
+
},
|
| 8695 |
+
{
|
| 8696 |
+
"epoch": 0.483,
|
| 8697 |
+
"grad_norm": 0.5552901331066319,
|
| 8698 |
+
"learning_rate": 5.744666666666668e-06,
|
| 8699 |
+
"loss": 2.3755,
|
| 8700 |
+
"step": 24150
|
| 8701 |
+
},
|
| 8702 |
+
{
|
| 8703 |
+
"epoch": 0.4835,
|
| 8704 |
+
"grad_norm": 0.5535450397337369,
|
| 8705 |
+
"learning_rate": 5.739111111111112e-06,
|
| 8706 |
+
"loss": 2.3777,
|
| 8707 |
+
"step": 24175
|
| 8708 |
+
},
|
| 8709 |
+
{
|
| 8710 |
+
"epoch": 0.484,
|
| 8711 |
+
"grad_norm": 0.5622658531288893,
|
| 8712 |
+
"learning_rate": 5.733555555555556e-06,
|
| 8713 |
+
"loss": 2.3671,
|
| 8714 |
+
"step": 24200
|
| 8715 |
+
},
|
| 8716 |
+
{
|
| 8717 |
+
"epoch": 0.484,
|
| 8718 |
+
"eval_loss": 2.3840036392211914,
|
| 8719 |
+
"eval_runtime": 31.7615,
|
| 8720 |
+
"eval_samples_per_second": 3.211,
|
| 8721 |
+
"eval_steps_per_second": 1.606,
|
| 8722 |
+
"step": 24200
|
| 8723 |
+
},
|
| 8724 |
+
{
|
| 8725 |
+
"epoch": 0.4845,
|
| 8726 |
+
"grad_norm": 0.5526779804173192,
|
| 8727 |
+
"learning_rate": 5.728e-06,
|
| 8728 |
+
"loss": 2.374,
|
| 8729 |
+
"step": 24225
|
| 8730 |
+
},
|
| 8731 |
+
{
|
| 8732 |
+
"epoch": 0.485,
|
| 8733 |
+
"grad_norm": 0.5383978006357063,
|
| 8734 |
+
"learning_rate": 5.722444444444445e-06,
|
| 8735 |
+
"loss": 2.3664,
|
| 8736 |
+
"step": 24250
|
| 8737 |
+
},
|
| 8738 |
+
{
|
| 8739 |
+
"epoch": 0.4855,
|
| 8740 |
+
"grad_norm": 0.5542389650019858,
|
| 8741 |
+
"learning_rate": 5.71688888888889e-06,
|
| 8742 |
+
"loss": 2.3692,
|
| 8743 |
+
"step": 24275
|
| 8744 |
+
},
|
| 8745 |
+
{
|
| 8746 |
+
"epoch": 0.486,
|
| 8747 |
+
"grad_norm": 0.5542459781042757,
|
| 8748 |
+
"learning_rate": 5.711333333333334e-06,
|
| 8749 |
+
"loss": 2.379,
|
| 8750 |
+
"step": 24300
|
| 8751 |
+
},
|
| 8752 |
+
{
|
| 8753 |
+
"epoch": 0.486,
|
| 8754 |
+
"eval_loss": 2.3838605880737305,
|
| 8755 |
+
"eval_runtime": 31.8313,
|
| 8756 |
+
"eval_samples_per_second": 3.204,
|
| 8757 |
+
"eval_steps_per_second": 1.602,
|
| 8758 |
+
"step": 24300
|
| 8759 |
+
},
|
| 8760 |
+
{
|
| 8761 |
+
"epoch": 0.4865,
|
| 8762 |
+
"grad_norm": 0.5371257785961498,
|
| 8763 |
+
"learning_rate": 5.705777777777778e-06,
|
| 8764 |
+
"loss": 2.3759,
|
| 8765 |
+
"step": 24325
|
| 8766 |
+
},
|
| 8767 |
+
{
|
| 8768 |
+
"epoch": 0.487,
|
| 8769 |
+
"grad_norm": 0.5334074315105899,
|
| 8770 |
+
"learning_rate": 5.700222222222223e-06,
|
| 8771 |
+
"loss": 2.3842,
|
| 8772 |
+
"step": 24350
|
| 8773 |
+
},
|
| 8774 |
+
{
|
| 8775 |
+
"epoch": 0.4875,
|
| 8776 |
+
"grad_norm": 0.5712028005119992,
|
| 8777 |
+
"learning_rate": 5.694666666666667e-06,
|
| 8778 |
+
"loss": 2.373,
|
| 8779 |
+
"step": 24375
|
| 8780 |
+
},
|
| 8781 |
+
{
|
| 8782 |
+
"epoch": 0.488,
|
| 8783 |
+
"grad_norm": 0.5527635817323101,
|
| 8784 |
+
"learning_rate": 5.689111111111112e-06,
|
| 8785 |
+
"loss": 2.3632,
|
| 8786 |
+
"step": 24400
|
| 8787 |
+
},
|
| 8788 |
+
{
|
| 8789 |
+
"epoch": 0.488,
|
| 8790 |
+
"eval_loss": 2.383908987045288,
|
| 8791 |
+
"eval_runtime": 31.8006,
|
| 8792 |
+
"eval_samples_per_second": 3.207,
|
| 8793 |
+
"eval_steps_per_second": 1.604,
|
| 8794 |
+
"step": 24400
|
| 8795 |
+
},
|
| 8796 |
+
{
|
| 8797 |
+
"epoch": 0.4885,
|
| 8798 |
+
"grad_norm": 0.5497988709199122,
|
| 8799 |
+
"learning_rate": 5.683555555555555e-06,
|
| 8800 |
+
"loss": 2.3674,
|
| 8801 |
+
"step": 24425
|
| 8802 |
+
},
|
| 8803 |
+
{
|
| 8804 |
+
"epoch": 0.489,
|
| 8805 |
+
"grad_norm": 0.5478963614360626,
|
| 8806 |
+
"learning_rate": 5.6780000000000005e-06,
|
| 8807 |
+
"loss": 2.3795,
|
| 8808 |
+
"step": 24450
|
| 8809 |
+
},
|
| 8810 |
+
{
|
| 8811 |
+
"epoch": 0.4895,
|
| 8812 |
+
"grad_norm": 0.5418443665589167,
|
| 8813 |
+
"learning_rate": 5.672444444444445e-06,
|
| 8814 |
+
"loss": 2.3769,
|
| 8815 |
+
"step": 24475
|
| 8816 |
+
},
|
| 8817 |
+
{
|
| 8818 |
+
"epoch": 0.49,
|
| 8819 |
+
"grad_norm": 0.5637739038034214,
|
| 8820 |
+
"learning_rate": 5.666888888888889e-06,
|
| 8821 |
+
"loss": 2.3754,
|
| 8822 |
+
"step": 24500
|
| 8823 |
+
},
|
| 8824 |
+
{
|
| 8825 |
+
"epoch": 0.49,
|
| 8826 |
+
"eval_loss": 2.3835647106170654,
|
| 8827 |
+
"eval_runtime": 31.695,
|
| 8828 |
+
"eval_samples_per_second": 3.218,
|
| 8829 |
+
"eval_steps_per_second": 1.609,
|
| 8830 |
+
"step": 24500
|
| 8831 |
+
},
|
| 8832 |
+
{
|
| 8833 |
+
"epoch": 0.4905,
|
| 8834 |
+
"grad_norm": 0.5352738455560374,
|
| 8835 |
+
"learning_rate": 5.661333333333335e-06,
|
| 8836 |
+
"loss": 2.3665,
|
| 8837 |
+
"step": 24525
|
| 8838 |
+
},
|
| 8839 |
+
{
|
| 8840 |
+
"epoch": 0.491,
|
| 8841 |
+
"grad_norm": 0.5593898219847685,
|
| 8842 |
+
"learning_rate": 5.655777777777778e-06,
|
| 8843 |
+
"loss": 2.3621,
|
| 8844 |
+
"step": 24550
|
| 8845 |
+
},
|
| 8846 |
+
{
|
| 8847 |
+
"epoch": 0.4915,
|
| 8848 |
+
"grad_norm": 0.5340153226573613,
|
| 8849 |
+
"learning_rate": 5.6502222222222225e-06,
|
| 8850 |
+
"loss": 2.3704,
|
| 8851 |
+
"step": 24575
|
| 8852 |
+
},
|
| 8853 |
+
{
|
| 8854 |
+
"epoch": 0.492,
|
| 8855 |
+
"grad_norm": 0.5434269177198789,
|
| 8856 |
+
"learning_rate": 5.644666666666667e-06,
|
| 8857 |
+
"loss": 2.3707,
|
| 8858 |
+
"step": 24600
|
| 8859 |
+
},
|
| 8860 |
+
{
|
| 8861 |
+
"epoch": 0.492,
|
| 8862 |
+
"eval_loss": 2.38376522064209,
|
| 8863 |
+
"eval_runtime": 31.8117,
|
| 8864 |
+
"eval_samples_per_second": 3.206,
|
| 8865 |
+
"eval_steps_per_second": 1.603,
|
| 8866 |
+
"step": 24600
|
| 8867 |
+
},
|
| 8868 |
+
{
|
| 8869 |
+
"epoch": 0.4925,
|
| 8870 |
+
"grad_norm": 0.5555073289213541,
|
| 8871 |
+
"learning_rate": 5.639111111111112e-06,
|
| 8872 |
+
"loss": 2.3702,
|
| 8873 |
+
"step": 24625
|
| 8874 |
+
},
|
| 8875 |
+
{
|
| 8876 |
+
"epoch": 0.493,
|
| 8877 |
+
"grad_norm": 0.5608796205061338,
|
| 8878 |
+
"learning_rate": 5.633555555555557e-06,
|
| 8879 |
+
"loss": 2.373,
|
| 8880 |
+
"step": 24650
|
| 8881 |
+
},
|
| 8882 |
+
{
|
| 8883 |
+
"epoch": 0.4935,
|
| 8884 |
+
"grad_norm": 0.5639681025688454,
|
| 8885 |
+
"learning_rate": 5.628e-06,
|
| 8886 |
+
"loss": 2.3641,
|
| 8887 |
+
"step": 24675
|
| 8888 |
+
},
|
| 8889 |
+
{
|
| 8890 |
+
"epoch": 0.494,
|
| 8891 |
+
"grad_norm": 0.5610119210421548,
|
| 8892 |
+
"learning_rate": 5.6224444444444446e-06,
|
| 8893 |
+
"loss": 2.372,
|
| 8894 |
+
"step": 24700
|
| 8895 |
+
},
|
| 8896 |
+
{
|
| 8897 |
+
"epoch": 0.494,
|
| 8898 |
+
"eval_loss": 2.383573293685913,
|
| 8899 |
+
"eval_runtime": 31.6948,
|
| 8900 |
+
"eval_samples_per_second": 3.218,
|
| 8901 |
+
"eval_steps_per_second": 1.609,
|
| 8902 |
+
"step": 24700
|
| 8903 |
+
},
|
| 8904 |
+
{
|
| 8905 |
+
"epoch": 0.4945,
|
| 8906 |
+
"grad_norm": 0.5442392815853518,
|
| 8907 |
+
"learning_rate": 5.61688888888889e-06,
|
| 8908 |
+
"loss": 2.3651,
|
| 8909 |
+
"step": 24725
|
| 8910 |
+
},
|
| 8911 |
+
{
|
| 8912 |
+
"epoch": 0.495,
|
| 8913 |
+
"grad_norm": 0.5562532962787945,
|
| 8914 |
+
"learning_rate": 5.611333333333334e-06,
|
| 8915 |
+
"loss": 2.3705,
|
| 8916 |
+
"step": 24750
|
| 8917 |
+
},
|
| 8918 |
+
{
|
| 8919 |
+
"epoch": 0.4955,
|
| 8920 |
+
"grad_norm": 0.5488206873990799,
|
| 8921 |
+
"learning_rate": 5.605777777777778e-06,
|
| 8922 |
+
"loss": 2.3623,
|
| 8923 |
+
"step": 24775
|
| 8924 |
+
},
|
| 8925 |
+
{
|
| 8926 |
+
"epoch": 0.496,
|
| 8927 |
+
"grad_norm": 0.5653453728755813,
|
| 8928 |
+
"learning_rate": 5.600222222222222e-06,
|
| 8929 |
+
"loss": 2.3746,
|
| 8930 |
+
"step": 24800
|
| 8931 |
+
},
|
| 8932 |
+
{
|
| 8933 |
+
"epoch": 0.496,
|
| 8934 |
+
"eval_loss": 2.383600950241089,
|
| 8935 |
+
"eval_runtime": 31.8215,
|
| 8936 |
+
"eval_samples_per_second": 3.205,
|
| 8937 |
+
"eval_steps_per_second": 1.603,
|
| 8938 |
+
"step": 24800
|
| 8939 |
+
},
|
| 8940 |
+
{
|
| 8941 |
+
"epoch": 0.4965,
|
| 8942 |
+
"grad_norm": 0.5714575887868236,
|
| 8943 |
+
"learning_rate": 5.5946666666666674e-06,
|
| 8944 |
+
"loss": 2.3698,
|
| 8945 |
+
"step": 24825
|
| 8946 |
+
},
|
| 8947 |
+
{
|
| 8948 |
+
"epoch": 0.497,
|
| 8949 |
+
"grad_norm": 0.5479503311373944,
|
| 8950 |
+
"learning_rate": 5.589111111111112e-06,
|
| 8951 |
+
"loss": 2.3753,
|
| 8952 |
+
"step": 24850
|
| 8953 |
+
},
|
| 8954 |
+
{
|
| 8955 |
+
"epoch": 0.4975,
|
| 8956 |
+
"grad_norm": 0.5465196721627547,
|
| 8957 |
+
"learning_rate": 5.583555555555556e-06,
|
| 8958 |
+
"loss": 2.3627,
|
| 8959 |
+
"step": 24875
|
| 8960 |
+
},
|
| 8961 |
+
{
|
| 8962 |
+
"epoch": 0.498,
|
| 8963 |
+
"grad_norm": 0.5545182382115218,
|
| 8964 |
+
"learning_rate": 5.578e-06,
|
| 8965 |
+
"loss": 2.3623,
|
| 8966 |
+
"step": 24900
|
| 8967 |
+
},
|
| 8968 |
+
{
|
| 8969 |
+
"epoch": 0.498,
|
| 8970 |
+
"eval_loss": 2.383317470550537,
|
| 8971 |
+
"eval_runtime": 31.8409,
|
| 8972 |
+
"eval_samples_per_second": 3.203,
|
| 8973 |
+
"eval_steps_per_second": 1.602,
|
| 8974 |
+
"step": 24900
|
| 8975 |
+
},
|
| 8976 |
+
{
|
| 8977 |
+
"epoch": 0.4985,
|
| 8978 |
+
"grad_norm": 0.5624766646317664,
|
| 8979 |
+
"learning_rate": 5.572444444444445e-06,
|
| 8980 |
+
"loss": 2.3659,
|
| 8981 |
+
"step": 24925
|
| 8982 |
+
},
|
| 8983 |
+
{
|
| 8984 |
+
"epoch": 0.499,
|
| 8985 |
+
"grad_norm": 0.5642199082921324,
|
| 8986 |
+
"learning_rate": 5.5668888888888894e-06,
|
| 8987 |
+
"loss": 2.3684,
|
| 8988 |
+
"step": 24950
|
| 8989 |
+
},
|
| 8990 |
+
{
|
| 8991 |
+
"epoch": 0.4995,
|
| 8992 |
+
"grad_norm": 0.5917431910025611,
|
| 8993 |
+
"learning_rate": 5.561333333333334e-06,
|
| 8994 |
+
"loss": 2.3723,
|
| 8995 |
+
"step": 24975
|
| 8996 |
+
},
|
| 8997 |
+
{
|
| 8998 |
+
"epoch": 0.5,
|
| 8999 |
+
"grad_norm": 0.5530201275821488,
|
| 9000 |
+
"learning_rate": 5.555777777777777e-06,
|
| 9001 |
+
"loss": 2.3685,
|
| 9002 |
+
"step": 25000
|
| 9003 |
+
},
|
| 9004 |
+
{
|
| 9005 |
+
"epoch": 0.5,
|
| 9006 |
+
"eval_loss": 2.3832170963287354,
|
| 9007 |
+
"eval_runtime": 31.7959,
|
| 9008 |
+
"eval_samples_per_second": 3.208,
|
| 9009 |
+
"eval_steps_per_second": 1.604,
|
| 9010 |
+
"step": 25000
|
| 9011 |
}
|
| 9012 |
],
|
| 9013 |
"logging_steps": 25,
|
|
|
|
| 9027 |
"attributes": {}
|
| 9028 |
}
|
| 9029 |
},
|
| 9030 |
+
"total_flos": 7.95800574581801e+19,
|
| 9031 |
"train_batch_size": 1,
|
| 9032 |
"trial_name": null,
|
| 9033 |
"trial_params": null
|