Training checkpoint at step 26000
Browse files- trainer_state.json +365 -5
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9008,6 +9008,366 @@
|
|
| 9008 |
"eval_samples_per_second": 3.208,
|
| 9009 |
"eval_steps_per_second": 1.604,
|
| 9010 |
"step": 25000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9011 |
}
|
| 9012 |
],
|
| 9013 |
"logging_steps": 25,
|
|
@@ -9027,7 +9387,7 @@
|
|
| 9027 |
"attributes": {}
|
| 9028 |
}
|
| 9029 |
},
|
| 9030 |
-
"total_flos":
|
| 9031 |
"train_batch_size": 1,
|
| 9032 |
"trial_name": null,
|
| 9033 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 25900,
|
| 3 |
+
"best_metric": 2.3824901580810547,
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
|
| 5 |
+
"epoch": 0.52,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 26000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9008 |
"eval_samples_per_second": 3.208,
|
| 9009 |
"eval_steps_per_second": 1.604,
|
| 9010 |
"step": 25000
|
| 9011 |
+
},
|
| 9012 |
+
{
|
| 9013 |
+
"epoch": 0.5005,
|
| 9014 |
+
"grad_norm": 0.5509816083841773,
|
| 9015 |
+
"learning_rate": 5.550222222222223e-06,
|
| 9016 |
+
"loss": 2.3559,
|
| 9017 |
+
"step": 25025
|
| 9018 |
+
},
|
| 9019 |
+
{
|
| 9020 |
+
"epoch": 0.501,
|
| 9021 |
+
"grad_norm": 0.5547472529206742,
|
| 9022 |
+
"learning_rate": 5.544666666666667e-06,
|
| 9023 |
+
"loss": 2.3648,
|
| 9024 |
+
"step": 25050
|
| 9025 |
+
},
|
| 9026 |
+
{
|
| 9027 |
+
"epoch": 0.5015,
|
| 9028 |
+
"grad_norm": 0.546260980184131,
|
| 9029 |
+
"learning_rate": 5.5391111111111115e-06,
|
| 9030 |
+
"loss": 2.3701,
|
| 9031 |
+
"step": 25075
|
| 9032 |
+
},
|
| 9033 |
+
{
|
| 9034 |
+
"epoch": 0.502,
|
| 9035 |
+
"grad_norm": 0.5481216862316385,
|
| 9036 |
+
"learning_rate": 5.533555555555557e-06,
|
| 9037 |
+
"loss": 2.3798,
|
| 9038 |
+
"step": 25100
|
| 9039 |
+
},
|
| 9040 |
+
{
|
| 9041 |
+
"epoch": 0.502,
|
| 9042 |
+
"eval_loss": 2.38305926322937,
|
| 9043 |
+
"eval_runtime": 32.0473,
|
| 9044 |
+
"eval_samples_per_second": 3.183,
|
| 9045 |
+
"eval_steps_per_second": 1.591,
|
| 9046 |
+
"step": 25100
|
| 9047 |
+
},
|
| 9048 |
+
{
|
| 9049 |
+
"epoch": 0.5025,
|
| 9050 |
+
"grad_norm": 0.5670640165543723,
|
| 9051 |
+
"learning_rate": 5.528e-06,
|
| 9052 |
+
"loss": 2.3622,
|
| 9053 |
+
"step": 25125
|
| 9054 |
+
},
|
| 9055 |
+
{
|
| 9056 |
+
"epoch": 0.503,
|
| 9057 |
+
"grad_norm": 0.5463137917421312,
|
| 9058 |
+
"learning_rate": 5.522444444444445e-06,
|
| 9059 |
+
"loss": 2.3719,
|
| 9060 |
+
"step": 25150
|
| 9061 |
+
},
|
| 9062 |
+
{
|
| 9063 |
+
"epoch": 0.5035,
|
| 9064 |
+
"grad_norm": 0.5400999701410277,
|
| 9065 |
+
"learning_rate": 5.516888888888889e-06,
|
| 9066 |
+
"loss": 2.3616,
|
| 9067 |
+
"step": 25175
|
| 9068 |
+
},
|
| 9069 |
+
{
|
| 9070 |
+
"epoch": 0.504,
|
| 9071 |
+
"grad_norm": 0.5802126499364532,
|
| 9072 |
+
"learning_rate": 5.511333333333334e-06,
|
| 9073 |
+
"loss": 2.3721,
|
| 9074 |
+
"step": 25200
|
| 9075 |
+
},
|
| 9076 |
+
{
|
| 9077 |
+
"epoch": 0.504,
|
| 9078 |
+
"eval_loss": 2.3829147815704346,
|
| 9079 |
+
"eval_runtime": 31.7438,
|
| 9080 |
+
"eval_samples_per_second": 3.213,
|
| 9081 |
+
"eval_steps_per_second": 1.607,
|
| 9082 |
+
"step": 25200
|
| 9083 |
+
},
|
| 9084 |
+
{
|
| 9085 |
+
"epoch": 0.5045,
|
| 9086 |
+
"grad_norm": 0.5435607747773122,
|
| 9087 |
+
"learning_rate": 5.505777777777779e-06,
|
| 9088 |
+
"loss": 2.3603,
|
| 9089 |
+
"step": 25225
|
| 9090 |
+
},
|
| 9091 |
+
{
|
| 9092 |
+
"epoch": 0.505,
|
| 9093 |
+
"grad_norm": 0.5453890322127348,
|
| 9094 |
+
"learning_rate": 5.500222222222222e-06,
|
| 9095 |
+
"loss": 2.3636,
|
| 9096 |
+
"step": 25250
|
| 9097 |
+
},
|
| 9098 |
+
{
|
| 9099 |
+
"epoch": 0.5055,
|
| 9100 |
+
"grad_norm": 0.5477131217196112,
|
| 9101 |
+
"learning_rate": 5.494666666666667e-06,
|
| 9102 |
+
"loss": 2.3697,
|
| 9103 |
+
"step": 25275
|
| 9104 |
+
},
|
| 9105 |
+
{
|
| 9106 |
+
"epoch": 0.506,
|
| 9107 |
+
"grad_norm": 0.5621665226631756,
|
| 9108 |
+
"learning_rate": 5.489111111111112e-06,
|
| 9109 |
+
"loss": 2.3687,
|
| 9110 |
+
"step": 25300
|
| 9111 |
+
},
|
| 9112 |
+
{
|
| 9113 |
+
"epoch": 0.506,
|
| 9114 |
+
"eval_loss": 2.3831355571746826,
|
| 9115 |
+
"eval_runtime": 31.7979,
|
| 9116 |
+
"eval_samples_per_second": 3.208,
|
| 9117 |
+
"eval_steps_per_second": 1.604,
|
| 9118 |
+
"step": 25300
|
| 9119 |
+
},
|
| 9120 |
+
{
|
| 9121 |
+
"epoch": 0.5065,
|
| 9122 |
+
"grad_norm": 0.5622191727496813,
|
| 9123 |
+
"learning_rate": 5.483555555555556e-06,
|
| 9124 |
+
"loss": 2.368,
|
| 9125 |
+
"step": 25325
|
| 9126 |
+
},
|
| 9127 |
+
{
|
| 9128 |
+
"epoch": 0.507,
|
| 9129 |
+
"grad_norm": 0.5375310388584507,
|
| 9130 |
+
"learning_rate": 5.478e-06,
|
| 9131 |
+
"loss": 2.3617,
|
| 9132 |
+
"step": 25350
|
| 9133 |
+
},
|
| 9134 |
+
{
|
| 9135 |
+
"epoch": 0.5075,
|
| 9136 |
+
"grad_norm": 0.5421092937376346,
|
| 9137 |
+
"learning_rate": 5.472444444444444e-06,
|
| 9138 |
+
"loss": 2.3759,
|
| 9139 |
+
"step": 25375
|
| 9140 |
+
},
|
| 9141 |
+
{
|
| 9142 |
+
"epoch": 0.508,
|
| 9143 |
+
"grad_norm": 0.5726686989658507,
|
| 9144 |
+
"learning_rate": 5.4668888888888896e-06,
|
| 9145 |
+
"loss": 2.37,
|
| 9146 |
+
"step": 25400
|
| 9147 |
+
},
|
| 9148 |
+
{
|
| 9149 |
+
"epoch": 0.508,
|
| 9150 |
+
"eval_loss": 2.383046865463257,
|
| 9151 |
+
"eval_runtime": 31.8165,
|
| 9152 |
+
"eval_samples_per_second": 3.206,
|
| 9153 |
+
"eval_steps_per_second": 1.603,
|
| 9154 |
+
"step": 25400
|
| 9155 |
+
},
|
| 9156 |
+
{
|
| 9157 |
+
"epoch": 0.5085,
|
| 9158 |
+
"grad_norm": 0.536904504012326,
|
| 9159 |
+
"learning_rate": 5.461333333333334e-06,
|
| 9160 |
+
"loss": 2.3683,
|
| 9161 |
+
"step": 25425
|
| 9162 |
+
},
|
| 9163 |
+
{
|
| 9164 |
+
"epoch": 0.509,
|
| 9165 |
+
"grad_norm": 0.5792290465322086,
|
| 9166 |
+
"learning_rate": 5.455777777777778e-06,
|
| 9167 |
+
"loss": 2.3641,
|
| 9168 |
+
"step": 25450
|
| 9169 |
+
},
|
| 9170 |
+
{
|
| 9171 |
+
"epoch": 0.5095,
|
| 9172 |
+
"grad_norm": 0.5667490944788528,
|
| 9173 |
+
"learning_rate": 5.450222222222222e-06,
|
| 9174 |
+
"loss": 2.3673,
|
| 9175 |
+
"step": 25475
|
| 9176 |
+
},
|
| 9177 |
+
{
|
| 9178 |
+
"epoch": 0.51,
|
| 9179 |
+
"grad_norm": 0.5581091402617585,
|
| 9180 |
+
"learning_rate": 5.444666666666667e-06,
|
| 9181 |
+
"loss": 2.374,
|
| 9182 |
+
"step": 25500
|
| 9183 |
+
},
|
| 9184 |
+
{
|
| 9185 |
+
"epoch": 0.51,
|
| 9186 |
+
"eval_loss": 2.3831074237823486,
|
| 9187 |
+
"eval_runtime": 31.8462,
|
| 9188 |
+
"eval_samples_per_second": 3.203,
|
| 9189 |
+
"eval_steps_per_second": 1.601,
|
| 9190 |
+
"step": 25500
|
| 9191 |
+
},
|
| 9192 |
+
{
|
| 9193 |
+
"epoch": 0.5105,
|
| 9194 |
+
"grad_norm": 0.5629059983127724,
|
| 9195 |
+
"learning_rate": 5.4391111111111116e-06,
|
| 9196 |
+
"loss": 2.376,
|
| 9197 |
+
"step": 25525
|
| 9198 |
+
},
|
| 9199 |
+
{
|
| 9200 |
+
"epoch": 0.511,
|
| 9201 |
+
"grad_norm": 0.5600711744363054,
|
| 9202 |
+
"learning_rate": 5.433555555555556e-06,
|
| 9203 |
+
"loss": 2.3702,
|
| 9204 |
+
"step": 25550
|
| 9205 |
+
},
|
| 9206 |
+
{
|
| 9207 |
+
"epoch": 0.5115,
|
| 9208 |
+
"grad_norm": 0.5500784026204207,
|
| 9209 |
+
"learning_rate": 5.4279999999999995e-06,
|
| 9210 |
+
"loss": 2.3704,
|
| 9211 |
+
"step": 25575
|
| 9212 |
+
},
|
| 9213 |
+
{
|
| 9214 |
+
"epoch": 0.512,
|
| 9215 |
+
"grad_norm": 0.553377338742942,
|
| 9216 |
+
"learning_rate": 5.422444444444445e-06,
|
| 9217 |
+
"loss": 2.3644,
|
| 9218 |
+
"step": 25600
|
| 9219 |
+
},
|
| 9220 |
+
{
|
| 9221 |
+
"epoch": 0.512,
|
| 9222 |
+
"eval_loss": 2.3826544284820557,
|
| 9223 |
+
"eval_runtime": 31.7739,
|
| 9224 |
+
"eval_samples_per_second": 3.21,
|
| 9225 |
+
"eval_steps_per_second": 1.605,
|
| 9226 |
+
"step": 25600
|
| 9227 |
+
},
|
| 9228 |
+
{
|
| 9229 |
+
"epoch": 0.5125,
|
| 9230 |
+
"grad_norm": 0.5861763037221558,
|
| 9231 |
+
"learning_rate": 5.416888888888889e-06,
|
| 9232 |
+
"loss": 2.3658,
|
| 9233 |
+
"step": 25625
|
| 9234 |
+
},
|
| 9235 |
+
{
|
| 9236 |
+
"epoch": 0.513,
|
| 9237 |
+
"grad_norm": 0.5538084648071333,
|
| 9238 |
+
"learning_rate": 5.411333333333334e-06,
|
| 9239 |
+
"loss": 2.3693,
|
| 9240 |
+
"step": 25650
|
| 9241 |
+
},
|
| 9242 |
+
{
|
| 9243 |
+
"epoch": 0.5135,
|
| 9244 |
+
"grad_norm": 0.5699472071254841,
|
| 9245 |
+
"learning_rate": 5.405777777777779e-06,
|
| 9246 |
+
"loss": 2.3707,
|
| 9247 |
+
"step": 25675
|
| 9248 |
+
},
|
| 9249 |
+
{
|
| 9250 |
+
"epoch": 0.514,
|
| 9251 |
+
"grad_norm": 0.5440880568370218,
|
| 9252 |
+
"learning_rate": 5.400222222222222e-06,
|
| 9253 |
+
"loss": 2.3664,
|
| 9254 |
+
"step": 25700
|
| 9255 |
+
},
|
| 9256 |
+
{
|
| 9257 |
+
"epoch": 0.514,
|
| 9258 |
+
"eval_loss": 2.382906675338745,
|
| 9259 |
+
"eval_runtime": 31.7874,
|
| 9260 |
+
"eval_samples_per_second": 3.209,
|
| 9261 |
+
"eval_steps_per_second": 1.604,
|
| 9262 |
+
"step": 25700
|
| 9263 |
+
},
|
| 9264 |
+
{
|
| 9265 |
+
"epoch": 0.5145,
|
| 9266 |
+
"grad_norm": 0.551256815387497,
|
| 9267 |
+
"learning_rate": 5.394666666666667e-06,
|
| 9268 |
+
"loss": 2.3608,
|
| 9269 |
+
"step": 25725
|
| 9270 |
+
},
|
| 9271 |
+
{
|
| 9272 |
+
"epoch": 0.515,
|
| 9273 |
+
"grad_norm": 0.552653919875225,
|
| 9274 |
+
"learning_rate": 5.389111111111112e-06,
|
| 9275 |
+
"loss": 2.3648,
|
| 9276 |
+
"step": 25750
|
| 9277 |
+
},
|
| 9278 |
+
{
|
| 9279 |
+
"epoch": 0.5155,
|
| 9280 |
+
"grad_norm": 0.5489775829628063,
|
| 9281 |
+
"learning_rate": 5.3835555555555565e-06,
|
| 9282 |
+
"loss": 2.368,
|
| 9283 |
+
"step": 25775
|
| 9284 |
+
},
|
| 9285 |
+
{
|
| 9286 |
+
"epoch": 0.516,
|
| 9287 |
+
"grad_norm": 0.545224524462321,
|
| 9288 |
+
"learning_rate": 5.378e-06,
|
| 9289 |
+
"loss": 2.37,
|
| 9290 |
+
"step": 25800
|
| 9291 |
+
},
|
| 9292 |
+
{
|
| 9293 |
+
"epoch": 0.516,
|
| 9294 |
+
"eval_loss": 2.382946491241455,
|
| 9295 |
+
"eval_runtime": 31.8142,
|
| 9296 |
+
"eval_samples_per_second": 3.206,
|
| 9297 |
+
"eval_steps_per_second": 1.603,
|
| 9298 |
+
"step": 25800
|
| 9299 |
+
},
|
| 9300 |
+
{
|
| 9301 |
+
"epoch": 0.5165,
|
| 9302 |
+
"grad_norm": 0.6177434912819645,
|
| 9303 |
+
"learning_rate": 5.372444444444444e-06,
|
| 9304 |
+
"loss": 2.3576,
|
| 9305 |
+
"step": 25825
|
| 9306 |
+
},
|
| 9307 |
+
{
|
| 9308 |
+
"epoch": 0.517,
|
| 9309 |
+
"grad_norm": 0.5731672053410489,
|
| 9310 |
+
"learning_rate": 5.36688888888889e-06,
|
| 9311 |
+
"loss": 2.3641,
|
| 9312 |
+
"step": 25850
|
| 9313 |
+
},
|
| 9314 |
+
{
|
| 9315 |
+
"epoch": 0.5175,
|
| 9316 |
+
"grad_norm": 0.547417736306074,
|
| 9317 |
+
"learning_rate": 5.361333333333334e-06,
|
| 9318 |
+
"loss": 2.3669,
|
| 9319 |
+
"step": 25875
|
| 9320 |
+
},
|
| 9321 |
+
{
|
| 9322 |
+
"epoch": 0.518,
|
| 9323 |
+
"grad_norm": 0.5666721324439973,
|
| 9324 |
+
"learning_rate": 5.3557777777777785e-06,
|
| 9325 |
+
"loss": 2.3633,
|
| 9326 |
+
"step": 25900
|
| 9327 |
+
},
|
| 9328 |
+
{
|
| 9329 |
+
"epoch": 0.518,
|
| 9330 |
+
"eval_loss": 2.3824901580810547,
|
| 9331 |
+
"eval_runtime": 31.8236,
|
| 9332 |
+
"eval_samples_per_second": 3.205,
|
| 9333 |
+
"eval_steps_per_second": 1.603,
|
| 9334 |
+
"step": 25900
|
| 9335 |
+
},
|
| 9336 |
+
{
|
| 9337 |
+
"epoch": 0.5185,
|
| 9338 |
+
"grad_norm": 0.5493694553264233,
|
| 9339 |
+
"learning_rate": 5.350222222222222e-06,
|
| 9340 |
+
"loss": 2.3676,
|
| 9341 |
+
"step": 25925
|
| 9342 |
+
},
|
| 9343 |
+
{
|
| 9344 |
+
"epoch": 0.519,
|
| 9345 |
+
"grad_norm": 0.5581911332398992,
|
| 9346 |
+
"learning_rate": 5.344666666666667e-06,
|
| 9347 |
+
"loss": 2.3665,
|
| 9348 |
+
"step": 25950
|
| 9349 |
+
},
|
| 9350 |
+
{
|
| 9351 |
+
"epoch": 0.5195,
|
| 9352 |
+
"grad_norm": 0.5523156791576098,
|
| 9353 |
+
"learning_rate": 5.339111111111112e-06,
|
| 9354 |
+
"loss": 2.3634,
|
| 9355 |
+
"step": 25975
|
| 9356 |
+
},
|
| 9357 |
+
{
|
| 9358 |
+
"epoch": 0.52,
|
| 9359 |
+
"grad_norm": 0.5394984851015033,
|
| 9360 |
+
"learning_rate": 5.333555555555556e-06,
|
| 9361 |
+
"loss": 2.3693,
|
| 9362 |
+
"step": 26000
|
| 9363 |
+
},
|
| 9364 |
+
{
|
| 9365 |
+
"epoch": 0.52,
|
| 9366 |
+
"eval_loss": 2.3825063705444336,
|
| 9367 |
+
"eval_runtime": 31.7579,
|
| 9368 |
+
"eval_samples_per_second": 3.212,
|
| 9369 |
+
"eval_steps_per_second": 1.606,
|
| 9370 |
+
"step": 26000
|
| 9371 |
}
|
| 9372 |
],
|
| 9373 |
"logging_steps": 25,
|
|
|
|
| 9387 |
"attributes": {}
|
| 9388 |
}
|
| 9389 |
},
|
| 9390 |
+
"total_flos": 8.27632597565073e+19,
|
| 9391 |
"train_batch_size": 1,
|
| 9392 |
"trial_name": null,
|
| 9393 |
"trial_params": null
|