Training checkpoint at step 23000
Browse files- trainer_state.json +365 -5
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-22000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7928,6 +7928,366 @@
|
|
| 7928 |
"eval_samples_per_second": 3.206,
|
| 7929 |
"eval_steps_per_second": 1.603,
|
| 7930 |
"step": 22000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7931 |
}
|
| 7932 |
],
|
| 7933 |
"logging_steps": 25,
|
|
@@ -7947,7 +8307,7 @@
|
|
| 7947 |
"attributes": {}
|
| 7948 |
}
|
| 7949 |
},
|
| 7950 |
-
"total_flos": 7.
|
| 7951 |
"train_batch_size": 1,
|
| 7952 |
"trial_name": null,
|
| 7953 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 22700,
|
| 3 |
+
"best_metric": 2.3853445053100586,
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-22000",
|
| 5 |
+
"epoch": 0.46,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 23000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7928 |
"eval_samples_per_second": 3.206,
|
| 7929 |
"eval_steps_per_second": 1.603,
|
| 7930 |
"step": 22000
|
| 7931 |
+
},
|
| 7932 |
+
{
|
| 7933 |
+
"epoch": 0.4405,
|
| 7934 |
+
"grad_norm": 0.5533397760322247,
|
| 7935 |
+
"learning_rate": 6.216888888888889e-06,
|
| 7936 |
+
"loss": 2.371,
|
| 7937 |
+
"step": 22025
|
| 7938 |
+
},
|
| 7939 |
+
{
|
| 7940 |
+
"epoch": 0.441,
|
| 7941 |
+
"grad_norm": 0.5551275205002794,
|
| 7942 |
+
"learning_rate": 6.2113333333333336e-06,
|
| 7943 |
+
"loss": 2.3684,
|
| 7944 |
+
"step": 22050
|
| 7945 |
+
},
|
| 7946 |
+
{
|
| 7947 |
+
"epoch": 0.4415,
|
| 7948 |
+
"grad_norm": 0.5520948023453888,
|
| 7949 |
+
"learning_rate": 6.205777777777778e-06,
|
| 7950 |
+
"loss": 2.3602,
|
| 7951 |
+
"step": 22075
|
| 7952 |
+
},
|
| 7953 |
+
{
|
| 7954 |
+
"epoch": 0.442,
|
| 7955 |
+
"grad_norm": 0.5679529169964138,
|
| 7956 |
+
"learning_rate": 6.200222222222223e-06,
|
| 7957 |
+
"loss": 2.3867,
|
| 7958 |
+
"step": 22100
|
| 7959 |
+
},
|
| 7960 |
+
{
|
| 7961 |
+
"epoch": 0.442,
|
| 7962 |
+
"eval_loss": 2.3863022327423096,
|
| 7963 |
+
"eval_runtime": 32.0036,
|
| 7964 |
+
"eval_samples_per_second": 3.187,
|
| 7965 |
+
"eval_steps_per_second": 1.594,
|
| 7966 |
+
"step": 22100
|
| 7967 |
+
},
|
| 7968 |
+
{
|
| 7969 |
+
"epoch": 0.4425,
|
| 7970 |
+
"grad_norm": 0.5619895216629556,
|
| 7971 |
+
"learning_rate": 6.194666666666668e-06,
|
| 7972 |
+
"loss": 2.3701,
|
| 7973 |
+
"step": 22125
|
| 7974 |
+
},
|
| 7975 |
+
{
|
| 7976 |
+
"epoch": 0.443,
|
| 7977 |
+
"grad_norm": 0.5515875809771505,
|
| 7978 |
+
"learning_rate": 6.189111111111111e-06,
|
| 7979 |
+
"loss": 2.3734,
|
| 7980 |
+
"step": 22150
|
| 7981 |
+
},
|
| 7982 |
+
{
|
| 7983 |
+
"epoch": 0.4435,
|
| 7984 |
+
"grad_norm": 0.5686425996531567,
|
| 7985 |
+
"learning_rate": 6.1835555555555556e-06,
|
| 7986 |
+
"loss": 2.3698,
|
| 7987 |
+
"step": 22175
|
| 7988 |
+
},
|
| 7989 |
+
{
|
| 7990 |
+
"epoch": 0.444,
|
| 7991 |
+
"grad_norm": 0.5580871882801617,
|
| 7992 |
+
"learning_rate": 6.178000000000001e-06,
|
| 7993 |
+
"loss": 2.3676,
|
| 7994 |
+
"step": 22200
|
| 7995 |
+
},
|
| 7996 |
+
{
|
| 7997 |
+
"epoch": 0.444,
|
| 7998 |
+
"eval_loss": 2.3865246772766113,
|
| 7999 |
+
"eval_runtime": 31.7174,
|
| 8000 |
+
"eval_samples_per_second": 3.216,
|
| 8001 |
+
"eval_steps_per_second": 1.608,
|
| 8002 |
+
"step": 22200
|
| 8003 |
+
},
|
| 8004 |
+
{
|
| 8005 |
+
"epoch": 0.4445,
|
| 8006 |
+
"grad_norm": 0.5784261034385078,
|
| 8007 |
+
"learning_rate": 6.172444444444445e-06,
|
| 8008 |
+
"loss": 2.3723,
|
| 8009 |
+
"step": 22225
|
| 8010 |
+
},
|
| 8011 |
+
{
|
| 8012 |
+
"epoch": 0.445,
|
| 8013 |
+
"grad_norm": 0.5570688655308026,
|
| 8014 |
+
"learning_rate": 6.166888888888889e-06,
|
| 8015 |
+
"loss": 2.3709,
|
| 8016 |
+
"step": 22250
|
| 8017 |
+
},
|
| 8018 |
+
{
|
| 8019 |
+
"epoch": 0.4455,
|
| 8020 |
+
"grad_norm": 0.5716930839552549,
|
| 8021 |
+
"learning_rate": 6.161333333333334e-06,
|
| 8022 |
+
"loss": 2.3734,
|
| 8023 |
+
"step": 22275
|
| 8024 |
+
},
|
| 8025 |
+
{
|
| 8026 |
+
"epoch": 0.446,
|
| 8027 |
+
"grad_norm": 0.5550340902020618,
|
| 8028 |
+
"learning_rate": 6.1557777777777784e-06,
|
| 8029 |
+
"loss": 2.3648,
|
| 8030 |
+
"step": 22300
|
| 8031 |
+
},
|
| 8032 |
+
{
|
| 8033 |
+
"epoch": 0.446,
|
| 8034 |
+
"eval_loss": 2.38633131980896,
|
| 8035 |
+
"eval_runtime": 31.7943,
|
| 8036 |
+
"eval_samples_per_second": 3.208,
|
| 8037 |
+
"eval_steps_per_second": 1.604,
|
| 8038 |
+
"step": 22300
|
| 8039 |
+
},
|
| 8040 |
+
{
|
| 8041 |
+
"epoch": 0.4465,
|
| 8042 |
+
"grad_norm": 0.5719936248106342,
|
| 8043 |
+
"learning_rate": 6.150222222222223e-06,
|
| 8044 |
+
"loss": 2.3751,
|
| 8045 |
+
"step": 22325
|
| 8046 |
+
},
|
| 8047 |
+
{
|
| 8048 |
+
"epoch": 0.447,
|
| 8049 |
+
"grad_norm": 0.5616671760742846,
|
| 8050 |
+
"learning_rate": 6.144666666666668e-06,
|
| 8051 |
+
"loss": 2.3748,
|
| 8052 |
+
"step": 22350
|
| 8053 |
+
},
|
| 8054 |
+
{
|
| 8055 |
+
"epoch": 0.4475,
|
| 8056 |
+
"grad_norm": 0.5785985644213604,
|
| 8057 |
+
"learning_rate": 6.139111111111112e-06,
|
| 8058 |
+
"loss": 2.3837,
|
| 8059 |
+
"step": 22375
|
| 8060 |
+
},
|
| 8061 |
+
{
|
| 8062 |
+
"epoch": 0.448,
|
| 8063 |
+
"grad_norm": 0.5645620599147937,
|
| 8064 |
+
"learning_rate": 6.133555555555556e-06,
|
| 8065 |
+
"loss": 2.3745,
|
| 8066 |
+
"step": 22400
|
| 8067 |
+
},
|
| 8068 |
+
{
|
| 8069 |
+
"epoch": 0.448,
|
| 8070 |
+
"eval_loss": 2.3862569332122803,
|
| 8071 |
+
"eval_runtime": 31.9593,
|
| 8072 |
+
"eval_samples_per_second": 3.192,
|
| 8073 |
+
"eval_steps_per_second": 1.596,
|
| 8074 |
+
"step": 22400
|
| 8075 |
+
},
|
| 8076 |
+
{
|
| 8077 |
+
"epoch": 0.4485,
|
| 8078 |
+
"grad_norm": 0.5469950240628229,
|
| 8079 |
+
"learning_rate": 6.1280000000000005e-06,
|
| 8080 |
+
"loss": 2.3642,
|
| 8081 |
+
"step": 22425
|
| 8082 |
+
},
|
| 8083 |
+
{
|
| 8084 |
+
"epoch": 0.449,
|
| 8085 |
+
"grad_norm": 0.5324393599981698,
|
| 8086 |
+
"learning_rate": 6.122444444444446e-06,
|
| 8087 |
+
"loss": 2.379,
|
| 8088 |
+
"step": 22450
|
| 8089 |
+
},
|
| 8090 |
+
{
|
| 8091 |
+
"epoch": 0.4495,
|
| 8092 |
+
"grad_norm": 0.5519962387254249,
|
| 8093 |
+
"learning_rate": 6.116888888888889e-06,
|
| 8094 |
+
"loss": 2.3635,
|
| 8095 |
+
"step": 22475
|
| 8096 |
+
},
|
| 8097 |
+
{
|
| 8098 |
+
"epoch": 0.45,
|
| 8099 |
+
"grad_norm": 0.5588336399127953,
|
| 8100 |
+
"learning_rate": 6.111333333333334e-06,
|
| 8101 |
+
"loss": 2.3718,
|
| 8102 |
+
"step": 22500
|
| 8103 |
+
},
|
| 8104 |
+
{
|
| 8105 |
+
"epoch": 0.45,
|
| 8106 |
+
"eval_loss": 2.385950803756714,
|
| 8107 |
+
"eval_runtime": 31.7208,
|
| 8108 |
+
"eval_samples_per_second": 3.216,
|
| 8109 |
+
"eval_steps_per_second": 1.608,
|
| 8110 |
+
"step": 22500
|
| 8111 |
+
},
|
| 8112 |
+
{
|
| 8113 |
+
"epoch": 0.4505,
|
| 8114 |
+
"grad_norm": 0.5923640418917652,
|
| 8115 |
+
"learning_rate": 6.105777777777778e-06,
|
| 8116 |
+
"loss": 2.3719,
|
| 8117 |
+
"step": 22525
|
| 8118 |
+
},
|
| 8119 |
+
{
|
| 8120 |
+
"epoch": 0.451,
|
| 8121 |
+
"grad_norm": 0.5653562982992056,
|
| 8122 |
+
"learning_rate": 6.100222222222223e-06,
|
| 8123 |
+
"loss": 2.3808,
|
| 8124 |
+
"step": 22550
|
| 8125 |
+
},
|
| 8126 |
+
{
|
| 8127 |
+
"epoch": 0.4515,
|
| 8128 |
+
"grad_norm": 0.5636846873459127,
|
| 8129 |
+
"learning_rate": 6.094666666666668e-06,
|
| 8130 |
+
"loss": 2.3641,
|
| 8131 |
+
"step": 22575
|
| 8132 |
+
},
|
| 8133 |
+
{
|
| 8134 |
+
"epoch": 0.452,
|
| 8135 |
+
"grad_norm": 0.5850003926588586,
|
| 8136 |
+
"learning_rate": 6.089111111111111e-06,
|
| 8137 |
+
"loss": 2.3572,
|
| 8138 |
+
"step": 22600
|
| 8139 |
+
},
|
| 8140 |
+
{
|
| 8141 |
+
"epoch": 0.452,
|
| 8142 |
+
"eval_loss": 2.386296033859253,
|
| 8143 |
+
"eval_runtime": 31.8709,
|
| 8144 |
+
"eval_samples_per_second": 3.2,
|
| 8145 |
+
"eval_steps_per_second": 1.6,
|
| 8146 |
+
"step": 22600
|
| 8147 |
+
},
|
| 8148 |
+
{
|
| 8149 |
+
"epoch": 0.4525,
|
| 8150 |
+
"grad_norm": 0.5334735362781007,
|
| 8151 |
+
"learning_rate": 6.083555555555556e-06,
|
| 8152 |
+
"loss": 2.3732,
|
| 8153 |
+
"step": 22625
|
| 8154 |
+
},
|
| 8155 |
+
{
|
| 8156 |
+
"epoch": 0.453,
|
| 8157 |
+
"grad_norm": 0.5809776122118506,
|
| 8158 |
+
"learning_rate": 6.078000000000001e-06,
|
| 8159 |
+
"loss": 2.3842,
|
| 8160 |
+
"step": 22650
|
| 8161 |
+
},
|
| 8162 |
+
{
|
| 8163 |
+
"epoch": 0.4535,
|
| 8164 |
+
"grad_norm": 0.5438625993671827,
|
| 8165 |
+
"learning_rate": 6.072444444444445e-06,
|
| 8166 |
+
"loss": 2.3802,
|
| 8167 |
+
"step": 22675
|
| 8168 |
+
},
|
| 8169 |
+
{
|
| 8170 |
+
"epoch": 0.454,
|
| 8171 |
+
"grad_norm": 0.5581266930595516,
|
| 8172 |
+
"learning_rate": 6.06688888888889e-06,
|
| 8173 |
+
"loss": 2.3757,
|
| 8174 |
+
"step": 22700
|
| 8175 |
+
},
|
| 8176 |
+
{
|
| 8177 |
+
"epoch": 0.454,
|
| 8178 |
+
"eval_loss": 2.3853445053100586,
|
| 8179 |
+
"eval_runtime": 31.9465,
|
| 8180 |
+
"eval_samples_per_second": 3.193,
|
| 8181 |
+
"eval_steps_per_second": 1.596,
|
| 8182 |
+
"step": 22700
|
| 8183 |
+
},
|
| 8184 |
+
{
|
| 8185 |
+
"epoch": 0.4545,
|
| 8186 |
+
"grad_norm": 0.5665471911134969,
|
| 8187 |
+
"learning_rate": 6.061333333333333e-06,
|
| 8188 |
+
"loss": 2.3632,
|
| 8189 |
+
"step": 22725
|
| 8190 |
+
},
|
| 8191 |
+
{
|
| 8192 |
+
"epoch": 0.455,
|
| 8193 |
+
"grad_norm": 0.5602817372745607,
|
| 8194 |
+
"learning_rate": 6.0557777777777785e-06,
|
| 8195 |
+
"loss": 2.3759,
|
| 8196 |
+
"step": 22750
|
| 8197 |
+
},
|
| 8198 |
+
{
|
| 8199 |
+
"epoch": 0.4555,
|
| 8200 |
+
"grad_norm": 0.5546395592927382,
|
| 8201 |
+
"learning_rate": 6.050222222222223e-06,
|
| 8202 |
+
"loss": 2.3654,
|
| 8203 |
+
"step": 22775
|
| 8204 |
+
},
|
| 8205 |
+
{
|
| 8206 |
+
"epoch": 0.456,
|
| 8207 |
+
"grad_norm": 0.5466059675730089,
|
| 8208 |
+
"learning_rate": 6.044666666666667e-06,
|
| 8209 |
+
"loss": 2.3747,
|
| 8210 |
+
"step": 22800
|
| 8211 |
+
},
|
| 8212 |
+
{
|
| 8213 |
+
"epoch": 0.456,
|
| 8214 |
+
"eval_loss": 2.3854382038116455,
|
| 8215 |
+
"eval_runtime": 31.8135,
|
| 8216 |
+
"eval_samples_per_second": 3.206,
|
| 8217 |
+
"eval_steps_per_second": 1.603,
|
| 8218 |
+
"step": 22800
|
| 8219 |
+
},
|
| 8220 |
+
{
|
| 8221 |
+
"epoch": 0.4565,
|
| 8222 |
+
"grad_norm": 0.556576922176953,
|
| 8223 |
+
"learning_rate": 6.039111111111111e-06,
|
| 8224 |
+
"loss": 2.3752,
|
| 8225 |
+
"step": 22825
|
| 8226 |
+
},
|
| 8227 |
+
{
|
| 8228 |
+
"epoch": 0.457,
|
| 8229 |
+
"grad_norm": 0.5587160453347744,
|
| 8230 |
+
"learning_rate": 6.033555555555556e-06,
|
| 8231 |
+
"loss": 2.3753,
|
| 8232 |
+
"step": 22850
|
| 8233 |
+
},
|
| 8234 |
+
{
|
| 8235 |
+
"epoch": 0.4575,
|
| 8236 |
+
"grad_norm": 0.5581750567947692,
|
| 8237 |
+
"learning_rate": 6.0280000000000006e-06,
|
| 8238 |
+
"loss": 2.3744,
|
| 8239 |
+
"step": 22875
|
| 8240 |
+
},
|
| 8241 |
+
{
|
| 8242 |
+
"epoch": 0.458,
|
| 8243 |
+
"grad_norm": 0.5665211201226871,
|
| 8244 |
+
"learning_rate": 6.022444444444445e-06,
|
| 8245 |
+
"loss": 2.3707,
|
| 8246 |
+
"step": 22900
|
| 8247 |
+
},
|
| 8248 |
+
{
|
| 8249 |
+
"epoch": 0.458,
|
| 8250 |
+
"eval_loss": 2.3854050636291504,
|
| 8251 |
+
"eval_runtime": 31.8453,
|
| 8252 |
+
"eval_samples_per_second": 3.203,
|
| 8253 |
+
"eval_steps_per_second": 1.601,
|
| 8254 |
+
"step": 22900
|
| 8255 |
+
},
|
| 8256 |
+
{
|
| 8257 |
+
"epoch": 0.4585,
|
| 8258 |
+
"grad_norm": 0.559138638343371,
|
| 8259 |
+
"learning_rate": 6.01688888888889e-06,
|
| 8260 |
+
"loss": 2.3771,
|
| 8261 |
+
"step": 22925
|
| 8262 |
+
},
|
| 8263 |
+
{
|
| 8264 |
+
"epoch": 0.459,
|
| 8265 |
+
"grad_norm": 0.5765629867304476,
|
| 8266 |
+
"learning_rate": 6.011333333333334e-06,
|
| 8267 |
+
"loss": 2.3751,
|
| 8268 |
+
"step": 22950
|
| 8269 |
+
},
|
| 8270 |
+
{
|
| 8271 |
+
"epoch": 0.4595,
|
| 8272 |
+
"grad_norm": 0.5697804508664757,
|
| 8273 |
+
"learning_rate": 6.005777777777778e-06,
|
| 8274 |
+
"loss": 2.3837,
|
| 8275 |
+
"step": 22975
|
| 8276 |
+
},
|
| 8277 |
+
{
|
| 8278 |
+
"epoch": 0.46,
|
| 8279 |
+
"grad_norm": 0.5813773268685459,
|
| 8280 |
+
"learning_rate": 6.000222222222223e-06,
|
| 8281 |
+
"loss": 2.37,
|
| 8282 |
+
"step": 23000
|
| 8283 |
+
},
|
| 8284 |
+
{
|
| 8285 |
+
"epoch": 0.46,
|
| 8286 |
+
"eval_loss": 2.385390520095825,
|
| 8287 |
+
"eval_runtime": 31.767,
|
| 8288 |
+
"eval_samples_per_second": 3.211,
|
| 8289 |
+
"eval_steps_per_second": 1.605,
|
| 8290 |
+
"step": 23000
|
| 8291 |
}
|
| 8292 |
],
|
| 8293 |
"logging_steps": 25,
|
|
|
|
| 8307 |
"attributes": {}
|
| 8308 |
}
|
| 8309 |
},
|
| 8310 |
+
"total_flos": 7.321365286152569e+19,
|
| 8311 |
"train_batch_size": 1,
|
| 8312 |
"trial_name": null,
|
| 8313 |
"trial_params": null
|