Training checkpoint at step 20000
Browse files- trainer_state.json +365 -5
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6848,6 +6848,366 @@
|
|
| 6848 |
"eval_samples_per_second": 3.207,
|
| 6849 |
"eval_steps_per_second": 1.603,
|
| 6850 |
"step": 19000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6851 |
}
|
| 6852 |
],
|
| 6853 |
"logging_steps": 25,
|
|
@@ -6867,7 +7227,7 @@
|
|
| 6867 |
"attributes": {}
|
| 6868 |
}
|
| 6869 |
},
|
| 6870 |
-
"total_flos": 6.
|
| 6871 |
"train_batch_size": 1,
|
| 6872 |
"trial_name": null,
|
| 6873 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 19900,
|
| 3 |
+
"best_metric": 2.388927698135376,
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
|
| 5 |
+
"epoch": 0.4,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 20000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6848 |
"eval_samples_per_second": 3.207,
|
| 6849 |
"eval_steps_per_second": 1.603,
|
| 6850 |
"step": 19000
|
| 6851 |
+
},
|
| 6852 |
+
{
|
| 6853 |
+
"epoch": 0.3805,
|
| 6854 |
+
"grad_norm": 0.5526396554433541,
|
| 6855 |
+
"learning_rate": 6.8835555555555565e-06,
|
| 6856 |
+
"loss": 2.3779,
|
| 6857 |
+
"step": 19025
|
| 6858 |
+
},
|
| 6859 |
+
{
|
| 6860 |
+
"epoch": 0.381,
|
| 6861 |
+
"grad_norm": 0.574490460414078,
|
| 6862 |
+
"learning_rate": 6.878e-06,
|
| 6863 |
+
"loss": 2.3727,
|
| 6864 |
+
"step": 19050
|
| 6865 |
+
},
|
| 6866 |
+
{
|
| 6867 |
+
"epoch": 0.3815,
|
| 6868 |
+
"grad_norm": 0.5611671894801677,
|
| 6869 |
+
"learning_rate": 6.872444444444445e-06,
|
| 6870 |
+
"loss": 2.379,
|
| 6871 |
+
"step": 19075
|
| 6872 |
+
},
|
| 6873 |
+
{
|
| 6874 |
+
"epoch": 0.382,
|
| 6875 |
+
"grad_norm": 0.5434475778092571,
|
| 6876 |
+
"learning_rate": 6.86688888888889e-06,
|
| 6877 |
+
"loss": 2.3788,
|
| 6878 |
+
"step": 19100
|
| 6879 |
+
},
|
| 6880 |
+
{
|
| 6881 |
+
"epoch": 0.382,
|
| 6882 |
+
"eval_loss": 2.390854597091675,
|
| 6883 |
+
"eval_runtime": 31.4727,
|
| 6884 |
+
"eval_samples_per_second": 3.241,
|
| 6885 |
+
"eval_steps_per_second": 1.62,
|
| 6886 |
+
"step": 19100
|
| 6887 |
+
},
|
| 6888 |
+
{
|
| 6889 |
+
"epoch": 0.3825,
|
| 6890 |
+
"grad_norm": 0.5438441040943751,
|
| 6891 |
+
"learning_rate": 6.861333333333334e-06,
|
| 6892 |
+
"loss": 2.3849,
|
| 6893 |
+
"step": 19125
|
| 6894 |
+
},
|
| 6895 |
+
{
|
| 6896 |
+
"epoch": 0.383,
|
| 6897 |
+
"grad_norm": 0.5617582167520553,
|
| 6898 |
+
"learning_rate": 6.855777777777778e-06,
|
| 6899 |
+
"loss": 2.3778,
|
| 6900 |
+
"step": 19150
|
| 6901 |
+
},
|
| 6902 |
+
{
|
| 6903 |
+
"epoch": 0.3835,
|
| 6904 |
+
"grad_norm": 0.5734148354957039,
|
| 6905 |
+
"learning_rate": 6.850222222222223e-06,
|
| 6906 |
+
"loss": 2.3749,
|
| 6907 |
+
"step": 19175
|
| 6908 |
+
},
|
| 6909 |
+
{
|
| 6910 |
+
"epoch": 0.384,
|
| 6911 |
+
"grad_norm": 0.5567016447555824,
|
| 6912 |
+
"learning_rate": 6.844666666666667e-06,
|
| 6913 |
+
"loss": 2.3786,
|
| 6914 |
+
"step": 19200
|
| 6915 |
+
},
|
| 6916 |
+
{
|
| 6917 |
+
"epoch": 0.384,
|
| 6918 |
+
"eval_loss": 2.390947103500366,
|
| 6919 |
+
"eval_runtime": 31.472,
|
| 6920 |
+
"eval_samples_per_second": 3.241,
|
| 6921 |
+
"eval_steps_per_second": 1.62,
|
| 6922 |
+
"step": 19200
|
| 6923 |
+
},
|
| 6924 |
+
{
|
| 6925 |
+
"epoch": 0.3845,
|
| 6926 |
+
"grad_norm": 0.5630941651558155,
|
| 6927 |
+
"learning_rate": 6.839111111111112e-06,
|
| 6928 |
+
"loss": 2.371,
|
| 6929 |
+
"step": 19225
|
| 6930 |
+
},
|
| 6931 |
+
{
|
| 6932 |
+
"epoch": 0.385,
|
| 6933 |
+
"grad_norm": 0.5472891744821744,
|
| 6934 |
+
"learning_rate": 6.833555555555557e-06,
|
| 6935 |
+
"loss": 2.371,
|
| 6936 |
+
"step": 19250
|
| 6937 |
+
},
|
| 6938 |
+
{
|
| 6939 |
+
"epoch": 0.3855,
|
| 6940 |
+
"grad_norm": 0.563854124925733,
|
| 6941 |
+
"learning_rate": 6.8280000000000005e-06,
|
| 6942 |
+
"loss": 2.3802,
|
| 6943 |
+
"step": 19275
|
| 6944 |
+
},
|
| 6945 |
+
{
|
| 6946 |
+
"epoch": 0.386,
|
| 6947 |
+
"grad_norm": 0.5535188682099162,
|
| 6948 |
+
"learning_rate": 6.822444444444445e-06,
|
| 6949 |
+
"loss": 2.3668,
|
| 6950 |
+
"step": 19300
|
| 6951 |
+
},
|
| 6952 |
+
{
|
| 6953 |
+
"epoch": 0.386,
|
| 6954 |
+
"eval_loss": 2.3904383182525635,
|
| 6955 |
+
"eval_runtime": 31.5109,
|
| 6956 |
+
"eval_samples_per_second": 3.237,
|
| 6957 |
+
"eval_steps_per_second": 1.618,
|
| 6958 |
+
"step": 19300
|
| 6959 |
+
},
|
| 6960 |
+
{
|
| 6961 |
+
"epoch": 0.3865,
|
| 6962 |
+
"grad_norm": 0.5847689751509554,
|
| 6963 |
+
"learning_rate": 6.816888888888889e-06,
|
| 6964 |
+
"loss": 2.3723,
|
| 6965 |
+
"step": 19325
|
| 6966 |
+
},
|
| 6967 |
+
{
|
| 6968 |
+
"epoch": 0.387,
|
| 6969 |
+
"grad_norm": 0.5477508463021717,
|
| 6970 |
+
"learning_rate": 6.811333333333335e-06,
|
| 6971 |
+
"loss": 2.3748,
|
| 6972 |
+
"step": 19350
|
| 6973 |
+
},
|
| 6974 |
+
{
|
| 6975 |
+
"epoch": 0.3875,
|
| 6976 |
+
"grad_norm": 0.5530662776524751,
|
| 6977 |
+
"learning_rate": 6.805777777777778e-06,
|
| 6978 |
+
"loss": 2.372,
|
| 6979 |
+
"step": 19375
|
| 6980 |
+
},
|
| 6981 |
+
{
|
| 6982 |
+
"epoch": 0.388,
|
| 6983 |
+
"grad_norm": 0.5627088332087185,
|
| 6984 |
+
"learning_rate": 6.8002222222222225e-06,
|
| 6985 |
+
"loss": 2.3649,
|
| 6986 |
+
"step": 19400
|
| 6987 |
+
},
|
| 6988 |
+
{
|
| 6989 |
+
"epoch": 0.388,
|
| 6990 |
+
"eval_loss": 2.3902432918548584,
|
| 6991 |
+
"eval_runtime": 31.5016,
|
| 6992 |
+
"eval_samples_per_second": 3.238,
|
| 6993 |
+
"eval_steps_per_second": 1.619,
|
| 6994 |
+
"step": 19400
|
| 6995 |
+
},
|
| 6996 |
+
{
|
| 6997 |
+
"epoch": 0.3885,
|
| 6998 |
+
"grad_norm": 0.5917805991329846,
|
| 6999 |
+
"learning_rate": 6.794666666666667e-06,
|
| 7000 |
+
"loss": 2.389,
|
| 7001 |
+
"step": 19425
|
| 7002 |
+
},
|
| 7003 |
+
{
|
| 7004 |
+
"epoch": 0.389,
|
| 7005 |
+
"grad_norm": 0.5637153841856668,
|
| 7006 |
+
"learning_rate": 6.789111111111112e-06,
|
| 7007 |
+
"loss": 2.381,
|
| 7008 |
+
"step": 19450
|
| 7009 |
+
},
|
| 7010 |
+
{
|
| 7011 |
+
"epoch": 0.3895,
|
| 7012 |
+
"grad_norm": 0.5638546592221216,
|
| 7013 |
+
"learning_rate": 6.783555555555557e-06,
|
| 7014 |
+
"loss": 2.3674,
|
| 7015 |
+
"step": 19475
|
| 7016 |
+
},
|
| 7017 |
+
{
|
| 7018 |
+
"epoch": 0.39,
|
| 7019 |
+
"grad_norm": 0.5442599823902955,
|
| 7020 |
+
"learning_rate": 6.778e-06,
|
| 7021 |
+
"loss": 2.3684,
|
| 7022 |
+
"step": 19500
|
| 7023 |
+
},
|
| 7024 |
+
{
|
| 7025 |
+
"epoch": 0.39,
|
| 7026 |
+
"eval_loss": 2.3898606300354004,
|
| 7027 |
+
"eval_runtime": 31.4637,
|
| 7028 |
+
"eval_samples_per_second": 3.242,
|
| 7029 |
+
"eval_steps_per_second": 1.621,
|
| 7030 |
+
"step": 19500
|
| 7031 |
+
},
|
| 7032 |
+
{
|
| 7033 |
+
"epoch": 0.3905,
|
| 7034 |
+
"grad_norm": 0.582280869057288,
|
| 7035 |
+
"learning_rate": 6.7724444444444446e-06,
|
| 7036 |
+
"loss": 2.3691,
|
| 7037 |
+
"step": 19525
|
| 7038 |
+
},
|
| 7039 |
+
{
|
| 7040 |
+
"epoch": 0.391,
|
| 7041 |
+
"grad_norm": 0.5427829071455205,
|
| 7042 |
+
"learning_rate": 6.76688888888889e-06,
|
| 7043 |
+
"loss": 2.372,
|
| 7044 |
+
"step": 19550
|
| 7045 |
+
},
|
| 7046 |
+
{
|
| 7047 |
+
"epoch": 0.3915,
|
| 7048 |
+
"grad_norm": 0.5690660297920415,
|
| 7049 |
+
"learning_rate": 6.761333333333334e-06,
|
| 7050 |
+
"loss": 2.3696,
|
| 7051 |
+
"step": 19575
|
| 7052 |
+
},
|
| 7053 |
+
{
|
| 7054 |
+
"epoch": 0.392,
|
| 7055 |
+
"grad_norm": 0.5887280660795969,
|
| 7056 |
+
"learning_rate": 6.755777777777779e-06,
|
| 7057 |
+
"loss": 2.3647,
|
| 7058 |
+
"step": 19600
|
| 7059 |
+
},
|
| 7060 |
+
{
|
| 7061 |
+
"epoch": 0.392,
|
| 7062 |
+
"eval_loss": 2.389928102493286,
|
| 7063 |
+
"eval_runtime": 31.425,
|
| 7064 |
+
"eval_samples_per_second": 3.246,
|
| 7065 |
+
"eval_steps_per_second": 1.623,
|
| 7066 |
+
"step": 19600
|
| 7067 |
+
},
|
| 7068 |
+
{
|
| 7069 |
+
"epoch": 0.3925,
|
| 7070 |
+
"grad_norm": 0.5706193677763675,
|
| 7071 |
+
"learning_rate": 6.750222222222222e-06,
|
| 7072 |
+
"loss": 2.3693,
|
| 7073 |
+
"step": 19625
|
| 7074 |
+
},
|
| 7075 |
+
{
|
| 7076 |
+
"epoch": 0.393,
|
| 7077 |
+
"grad_norm": 0.5446782496969111,
|
| 7078 |
+
"learning_rate": 6.7446666666666674e-06,
|
| 7079 |
+
"loss": 2.3808,
|
| 7080 |
+
"step": 19650
|
| 7081 |
+
},
|
| 7082 |
+
{
|
| 7083 |
+
"epoch": 0.3935,
|
| 7084 |
+
"grad_norm": 0.5571942248079983,
|
| 7085 |
+
"learning_rate": 6.739111111111112e-06,
|
| 7086 |
+
"loss": 2.3825,
|
| 7087 |
+
"step": 19675
|
| 7088 |
+
},
|
| 7089 |
+
{
|
| 7090 |
+
"epoch": 0.394,
|
| 7091 |
+
"grad_norm": 0.5452923856402259,
|
| 7092 |
+
"learning_rate": 6.733555555555556e-06,
|
| 7093 |
+
"loss": 2.3689,
|
| 7094 |
+
"step": 19700
|
| 7095 |
+
},
|
| 7096 |
+
{
|
| 7097 |
+
"epoch": 0.394,
|
| 7098 |
+
"eval_loss": 2.3896048069000244,
|
| 7099 |
+
"eval_runtime": 31.5836,
|
| 7100 |
+
"eval_samples_per_second": 3.23,
|
| 7101 |
+
"eval_steps_per_second": 1.615,
|
| 7102 |
+
"step": 19700
|
| 7103 |
+
},
|
| 7104 |
+
{
|
| 7105 |
+
"epoch": 0.3945,
|
| 7106 |
+
"grad_norm": 0.5828792681612529,
|
| 7107 |
+
"learning_rate": 6.728e-06,
|
| 7108 |
+
"loss": 2.3733,
|
| 7109 |
+
"step": 19725
|
| 7110 |
+
},
|
| 7111 |
+
{
|
| 7112 |
+
"epoch": 0.395,
|
| 7113 |
+
"grad_norm": 0.5615201455315739,
|
| 7114 |
+
"learning_rate": 6.722444444444445e-06,
|
| 7115 |
+
"loss": 2.3689,
|
| 7116 |
+
"step": 19750
|
| 7117 |
+
},
|
| 7118 |
+
{
|
| 7119 |
+
"epoch": 0.3955,
|
| 7120 |
+
"grad_norm": 0.5585669738111114,
|
| 7121 |
+
"learning_rate": 6.7168888888888894e-06,
|
| 7122 |
+
"loss": 2.3873,
|
| 7123 |
+
"step": 19775
|
| 7124 |
+
},
|
| 7125 |
+
{
|
| 7126 |
+
"epoch": 0.396,
|
| 7127 |
+
"grad_norm": 0.5412795214285975,
|
| 7128 |
+
"learning_rate": 6.711333333333334e-06,
|
| 7129 |
+
"loss": 2.3786,
|
| 7130 |
+
"step": 19800
|
| 7131 |
+
},
|
| 7132 |
+
{
|
| 7133 |
+
"epoch": 0.396,
|
| 7134 |
+
"eval_loss": 2.3894851207733154,
|
| 7135 |
+
"eval_runtime": 31.4877,
|
| 7136 |
+
"eval_samples_per_second": 3.239,
|
| 7137 |
+
"eval_steps_per_second": 1.62,
|
| 7138 |
+
"step": 19800
|
| 7139 |
+
},
|
| 7140 |
+
{
|
| 7141 |
+
"epoch": 0.3965,
|
| 7142 |
+
"grad_norm": 0.5778930227780084,
|
| 7143 |
+
"learning_rate": 6.705777777777779e-06,
|
| 7144 |
+
"loss": 2.3766,
|
| 7145 |
+
"step": 19825
|
| 7146 |
+
},
|
| 7147 |
+
{
|
| 7148 |
+
"epoch": 0.397,
|
| 7149 |
+
"grad_norm": 0.5682987690385847,
|
| 7150 |
+
"learning_rate": 6.700222222222223e-06,
|
| 7151 |
+
"loss": 2.3783,
|
| 7152 |
+
"step": 19850
|
| 7153 |
+
},
|
| 7154 |
+
{
|
| 7155 |
+
"epoch": 0.3975,
|
| 7156 |
+
"grad_norm": 0.5763865594632764,
|
| 7157 |
+
"learning_rate": 6.694666666666667e-06,
|
| 7158 |
+
"loss": 2.3738,
|
| 7159 |
+
"step": 19875
|
| 7160 |
+
},
|
| 7161 |
+
{
|
| 7162 |
+
"epoch": 0.398,
|
| 7163 |
+
"grad_norm": 0.5514756259491804,
|
| 7164 |
+
"learning_rate": 6.6891111111111115e-06,
|
| 7165 |
+
"loss": 2.3764,
|
| 7166 |
+
"step": 19900
|
| 7167 |
+
},
|
| 7168 |
+
{
|
| 7169 |
+
"epoch": 0.398,
|
| 7170 |
+
"eval_loss": 2.388927698135376,
|
| 7171 |
+
"eval_runtime": 31.7775,
|
| 7172 |
+
"eval_samples_per_second": 3.21,
|
| 7173 |
+
"eval_steps_per_second": 1.605,
|
| 7174 |
+
"step": 19900
|
| 7175 |
+
},
|
| 7176 |
+
{
|
| 7177 |
+
"epoch": 0.3985,
|
| 7178 |
+
"grad_norm": 0.5577240438533453,
|
| 7179 |
+
"learning_rate": 6.683555555555557e-06,
|
| 7180 |
+
"loss": 2.374,
|
| 7181 |
+
"step": 19925
|
| 7182 |
+
},
|
| 7183 |
+
{
|
| 7184 |
+
"epoch": 0.399,
|
| 7185 |
+
"grad_norm": 0.553314104963858,
|
| 7186 |
+
"learning_rate": 6.678e-06,
|
| 7187 |
+
"loss": 2.3726,
|
| 7188 |
+
"step": 19950
|
| 7189 |
+
},
|
| 7190 |
+
{
|
| 7191 |
+
"epoch": 0.3995,
|
| 7192 |
+
"grad_norm": 0.5615070159418603,
|
| 7193 |
+
"learning_rate": 6.672444444444445e-06,
|
| 7194 |
+
"loss": 2.3683,
|
| 7195 |
+
"step": 19975
|
| 7196 |
+
},
|
| 7197 |
+
{
|
| 7198 |
+
"epoch": 0.4,
|
| 7199 |
+
"grad_norm": 0.5595654854755111,
|
| 7200 |
+
"learning_rate": 6.666888888888889e-06,
|
| 7201 |
+
"loss": 2.3632,
|
| 7202 |
+
"step": 20000
|
| 7203 |
+
},
|
| 7204 |
+
{
|
| 7205 |
+
"epoch": 0.4,
|
| 7206 |
+
"eval_loss": 2.389249801635742,
|
| 7207 |
+
"eval_runtime": 31.7934,
|
| 7208 |
+
"eval_samples_per_second": 3.208,
|
| 7209 |
+
"eval_steps_per_second": 1.604,
|
| 7210 |
+
"step": 20000
|
| 7211 |
}
|
| 7212 |
],
|
| 7213 |
"logging_steps": 25,
|
|
|
|
| 7227 |
"attributes": {}
|
| 7228 |
}
|
| 7229 |
},
|
| 7230 |
+
"total_flos": 6.366404596654408e+19,
|
| 7231 |
"train_batch_size": 1,
|
| 7232 |
"trial_name": null,
|
| 7233 |
"trial_params": null
|