Training checkpoint at step 18000
Browse files- trainer_state.json +366 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6128,6 +6128,366 @@
|
|
| 6128 |
"eval_samples_per_second": 3.215,
|
| 6129 |
"eval_steps_per_second": 1.607,
|
| 6130 |
"step": 17000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6131 |
}
|
| 6132 |
],
|
| 6133 |
"logging_steps": 25,
|
|
@@ -6147,7 +6507,7 @@
|
|
| 6147 |
"attributes": {}
|
| 6148 |
}
|
| 6149 |
},
|
| 6150 |
-
"total_flos": 5.
|
| 6151 |
"train_batch_size": 1,
|
| 6152 |
"trial_name": null,
|
| 6153 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 18000,
|
| 3 |
+
"best_metric": 2.3920133113861084,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-18000",
|
| 5 |
+
"epoch": 0.36,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 18000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6128 |
"eval_samples_per_second": 3.215,
|
| 6129 |
"eval_steps_per_second": 1.607,
|
| 6130 |
"step": 17000
|
| 6131 |
+
},
|
| 6132 |
+
{
|
| 6133 |
+
"epoch": 0.3405,
|
| 6134 |
+
"grad_norm": 0.5625979440161315,
|
| 6135 |
+
"learning_rate": 7.328000000000001e-06,
|
| 6136 |
+
"loss": 2.3706,
|
| 6137 |
+
"step": 17025
|
| 6138 |
+
},
|
| 6139 |
+
{
|
| 6140 |
+
"epoch": 0.341,
|
| 6141 |
+
"grad_norm": 0.5578934058534382,
|
| 6142 |
+
"learning_rate": 7.322444444444445e-06,
|
| 6143 |
+
"loss": 2.3717,
|
| 6144 |
+
"step": 17050
|
| 6145 |
+
},
|
| 6146 |
+
{
|
| 6147 |
+
"epoch": 0.3415,
|
| 6148 |
+
"grad_norm": 0.5600783145650656,
|
| 6149 |
+
"learning_rate": 7.31688888888889e-06,
|
| 6150 |
+
"loss": 2.3549,
|
| 6151 |
+
"step": 17075
|
| 6152 |
+
},
|
| 6153 |
+
{
|
| 6154 |
+
"epoch": 0.342,
|
| 6155 |
+
"grad_norm": 0.5443562716925451,
|
| 6156 |
+
"learning_rate": 7.311333333333334e-06,
|
| 6157 |
+
"loss": 2.3818,
|
| 6158 |
+
"step": 17100
|
| 6159 |
+
},
|
| 6160 |
+
{
|
| 6161 |
+
"epoch": 0.342,
|
| 6162 |
+
"eval_loss": 2.3939199447631836,
|
| 6163 |
+
"eval_runtime": 31.7183,
|
| 6164 |
+
"eval_samples_per_second": 3.216,
|
| 6165 |
+
"eval_steps_per_second": 1.608,
|
| 6166 |
+
"step": 17100
|
| 6167 |
+
},
|
| 6168 |
+
{
|
| 6169 |
+
"epoch": 0.3425,
|
| 6170 |
+
"grad_norm": 0.6040551095214175,
|
| 6171 |
+
"learning_rate": 7.3057777777777784e-06,
|
| 6172 |
+
"loss": 2.3856,
|
| 6173 |
+
"step": 17125
|
| 6174 |
+
},
|
| 6175 |
+
{
|
| 6176 |
+
"epoch": 0.343,
|
| 6177 |
+
"grad_norm": 0.5800600768624563,
|
| 6178 |
+
"learning_rate": 7.300222222222223e-06,
|
| 6179 |
+
"loss": 2.3812,
|
| 6180 |
+
"step": 17150
|
| 6181 |
+
},
|
| 6182 |
+
{
|
| 6183 |
+
"epoch": 0.3435,
|
| 6184 |
+
"grad_norm": 0.606456873691792,
|
| 6185 |
+
"learning_rate": 7.294666666666668e-06,
|
| 6186 |
+
"loss": 2.3823,
|
| 6187 |
+
"step": 17175
|
| 6188 |
+
},
|
| 6189 |
+
{
|
| 6190 |
+
"epoch": 0.344,
|
| 6191 |
+
"grad_norm": 0.5820033666001653,
|
| 6192 |
+
"learning_rate": 7.289111111111112e-06,
|
| 6193 |
+
"loss": 2.3772,
|
| 6194 |
+
"step": 17200
|
| 6195 |
+
},
|
| 6196 |
+
{
|
| 6197 |
+
"epoch": 0.344,
|
| 6198 |
+
"eval_loss": 2.39414644241333,
|
| 6199 |
+
"eval_runtime": 31.4591,
|
| 6200 |
+
"eval_samples_per_second": 3.242,
|
| 6201 |
+
"eval_steps_per_second": 1.621,
|
| 6202 |
+
"step": 17200
|
| 6203 |
+
},
|
| 6204 |
+
{
|
| 6205 |
+
"epoch": 0.3445,
|
| 6206 |
+
"grad_norm": 0.592691728166079,
|
| 6207 |
+
"learning_rate": 7.283555555555556e-06,
|
| 6208 |
+
"loss": 2.3757,
|
| 6209 |
+
"step": 17225
|
| 6210 |
+
},
|
| 6211 |
+
{
|
| 6212 |
+
"epoch": 0.345,
|
| 6213 |
+
"grad_norm": 0.5475066044517582,
|
| 6214 |
+
"learning_rate": 7.2780000000000005e-06,
|
| 6215 |
+
"loss": 2.393,
|
| 6216 |
+
"step": 17250
|
| 6217 |
+
},
|
| 6218 |
+
{
|
| 6219 |
+
"epoch": 0.3455,
|
| 6220 |
+
"grad_norm": 0.5412153350606916,
|
| 6221 |
+
"learning_rate": 7.272444444444446e-06,
|
| 6222 |
+
"loss": 2.3775,
|
| 6223 |
+
"step": 17275
|
| 6224 |
+
},
|
| 6225 |
+
{
|
| 6226 |
+
"epoch": 0.346,
|
| 6227 |
+
"grad_norm": 0.5703055910606494,
|
| 6228 |
+
"learning_rate": 7.26688888888889e-06,
|
| 6229 |
+
"loss": 2.3919,
|
| 6230 |
+
"step": 17300
|
| 6231 |
+
},
|
| 6232 |
+
{
|
| 6233 |
+
"epoch": 0.346,
|
| 6234 |
+
"eval_loss": 2.393954277038574,
|
| 6235 |
+
"eval_runtime": 31.4832,
|
| 6236 |
+
"eval_samples_per_second": 3.24,
|
| 6237 |
+
"eval_steps_per_second": 1.62,
|
| 6238 |
+
"step": 17300
|
| 6239 |
+
},
|
| 6240 |
+
{
|
| 6241 |
+
"epoch": 0.3465,
|
| 6242 |
+
"grad_norm": 0.5720004911842855,
|
| 6243 |
+
"learning_rate": 7.261333333333334e-06,
|
| 6244 |
+
"loss": 2.3744,
|
| 6245 |
+
"step": 17325
|
| 6246 |
+
},
|
| 6247 |
+
{
|
| 6248 |
+
"epoch": 0.347,
|
| 6249 |
+
"grad_norm": 0.5651936652229611,
|
| 6250 |
+
"learning_rate": 7.255777777777778e-06,
|
| 6251 |
+
"loss": 2.3766,
|
| 6252 |
+
"step": 17350
|
| 6253 |
+
},
|
| 6254 |
+
{
|
| 6255 |
+
"epoch": 0.3475,
|
| 6256 |
+
"grad_norm": 0.552954097582646,
|
| 6257 |
+
"learning_rate": 7.250222222222223e-06,
|
| 6258 |
+
"loss": 2.38,
|
| 6259 |
+
"step": 17375
|
| 6260 |
+
},
|
| 6261 |
+
{
|
| 6262 |
+
"epoch": 0.348,
|
| 6263 |
+
"grad_norm": 0.5753937605402671,
|
| 6264 |
+
"learning_rate": 7.244666666666668e-06,
|
| 6265 |
+
"loss": 2.3825,
|
| 6266 |
+
"step": 17400
|
| 6267 |
+
},
|
| 6268 |
+
{
|
| 6269 |
+
"epoch": 0.348,
|
| 6270 |
+
"eval_loss": 2.3936057090759277,
|
| 6271 |
+
"eval_runtime": 31.5155,
|
| 6272 |
+
"eval_samples_per_second": 3.237,
|
| 6273 |
+
"eval_steps_per_second": 1.618,
|
| 6274 |
+
"step": 17400
|
| 6275 |
+
},
|
| 6276 |
+
{
|
| 6277 |
+
"epoch": 0.3485,
|
| 6278 |
+
"grad_norm": 0.5982429265702776,
|
| 6279 |
+
"learning_rate": 7.239111111111111e-06,
|
| 6280 |
+
"loss": 2.3748,
|
| 6281 |
+
"step": 17425
|
| 6282 |
+
},
|
| 6283 |
+
{
|
| 6284 |
+
"epoch": 0.349,
|
| 6285 |
+
"grad_norm": 0.5707105076014326,
|
| 6286 |
+
"learning_rate": 7.233555555555556e-06,
|
| 6287 |
+
"loss": 2.3871,
|
| 6288 |
+
"step": 17450
|
| 6289 |
+
},
|
| 6290 |
+
{
|
| 6291 |
+
"epoch": 0.3495,
|
| 6292 |
+
"grad_norm": 0.5749982454192974,
|
| 6293 |
+
"learning_rate": 7.228000000000001e-06,
|
| 6294 |
+
"loss": 2.3722,
|
| 6295 |
+
"step": 17475
|
| 6296 |
+
},
|
| 6297 |
+
{
|
| 6298 |
+
"epoch": 0.35,
|
| 6299 |
+
"grad_norm": 0.5667678087541999,
|
| 6300 |
+
"learning_rate": 7.222444444444445e-06,
|
| 6301 |
+
"loss": 2.3897,
|
| 6302 |
+
"step": 17500
|
| 6303 |
+
},
|
| 6304 |
+
{
|
| 6305 |
+
"epoch": 0.35,
|
| 6306 |
+
"eval_loss": 2.3934316635131836,
|
| 6307 |
+
"eval_runtime": 31.5133,
|
| 6308 |
+
"eval_samples_per_second": 3.237,
|
| 6309 |
+
"eval_steps_per_second": 1.618,
|
| 6310 |
+
"step": 17500
|
| 6311 |
+
},
|
| 6312 |
+
{
|
| 6313 |
+
"epoch": 0.3505,
|
| 6314 |
+
"grad_norm": 0.551269238238286,
|
| 6315 |
+
"learning_rate": 7.21688888888889e-06,
|
| 6316 |
+
"loss": 2.3759,
|
| 6317 |
+
"step": 17525
|
| 6318 |
+
},
|
| 6319 |
+
{
|
| 6320 |
+
"epoch": 0.351,
|
| 6321 |
+
"grad_norm": 0.5683477126287287,
|
| 6322 |
+
"learning_rate": 7.211333333333333e-06,
|
| 6323 |
+
"loss": 2.3751,
|
| 6324 |
+
"step": 17550
|
| 6325 |
+
},
|
| 6326 |
+
{
|
| 6327 |
+
"epoch": 0.3515,
|
| 6328 |
+
"grad_norm": 0.5534527601932518,
|
| 6329 |
+
"learning_rate": 7.2057777777777785e-06,
|
| 6330 |
+
"loss": 2.3749,
|
| 6331 |
+
"step": 17575
|
| 6332 |
+
},
|
| 6333 |
+
{
|
| 6334 |
+
"epoch": 0.352,
|
| 6335 |
+
"grad_norm": 0.5444580304379504,
|
| 6336 |
+
"learning_rate": 7.200222222222223e-06,
|
| 6337 |
+
"loss": 2.3839,
|
| 6338 |
+
"step": 17600
|
| 6339 |
+
},
|
| 6340 |
+
{
|
| 6341 |
+
"epoch": 0.352,
|
| 6342 |
+
"eval_loss": 2.3928964138031006,
|
| 6343 |
+
"eval_runtime": 31.79,
|
| 6344 |
+
"eval_samples_per_second": 3.209,
|
| 6345 |
+
"eval_steps_per_second": 1.604,
|
| 6346 |
+
"step": 17600
|
| 6347 |
+
},
|
| 6348 |
+
{
|
| 6349 |
+
"epoch": 0.3525,
|
| 6350 |
+
"grad_norm": 0.5683011717419817,
|
| 6351 |
+
"learning_rate": 7.194666666666667e-06,
|
| 6352 |
+
"loss": 2.3697,
|
| 6353 |
+
"step": 17625
|
| 6354 |
+
},
|
| 6355 |
+
{
|
| 6356 |
+
"epoch": 0.353,
|
| 6357 |
+
"grad_norm": 0.5597200154635523,
|
| 6358 |
+
"learning_rate": 7.189111111111111e-06,
|
| 6359 |
+
"loss": 2.3758,
|
| 6360 |
+
"step": 17650
|
| 6361 |
+
},
|
| 6362 |
+
{
|
| 6363 |
+
"epoch": 0.3535,
|
| 6364 |
+
"grad_norm": 0.5389975543023572,
|
| 6365 |
+
"learning_rate": 7.183555555555556e-06,
|
| 6366 |
+
"loss": 2.3748,
|
| 6367 |
+
"step": 17675
|
| 6368 |
+
},
|
| 6369 |
+
{
|
| 6370 |
+
"epoch": 0.354,
|
| 6371 |
+
"grad_norm": 0.5766556300730846,
|
| 6372 |
+
"learning_rate": 7.1780000000000006e-06,
|
| 6373 |
+
"loss": 2.3863,
|
| 6374 |
+
"step": 17700
|
| 6375 |
+
},
|
| 6376 |
+
{
|
| 6377 |
+
"epoch": 0.354,
|
| 6378 |
+
"eval_loss": 2.3929381370544434,
|
| 6379 |
+
"eval_runtime": 31.4662,
|
| 6380 |
+
"eval_samples_per_second": 3.242,
|
| 6381 |
+
"eval_steps_per_second": 1.621,
|
| 6382 |
+
"step": 17700
|
| 6383 |
+
},
|
| 6384 |
+
{
|
| 6385 |
+
"epoch": 0.3545,
|
| 6386 |
+
"grad_norm": 0.5422601731930108,
|
| 6387 |
+
"learning_rate": 7.172444444444445e-06,
|
| 6388 |
+
"loss": 2.3795,
|
| 6389 |
+
"step": 17725
|
| 6390 |
+
},
|
| 6391 |
+
{
|
| 6392 |
+
"epoch": 0.355,
|
| 6393 |
+
"grad_norm": 0.587749563771833,
|
| 6394 |
+
"learning_rate": 7.16688888888889e-06,
|
| 6395 |
+
"loss": 2.3741,
|
| 6396 |
+
"step": 17750
|
| 6397 |
+
},
|
| 6398 |
+
{
|
| 6399 |
+
"epoch": 0.3555,
|
| 6400 |
+
"grad_norm": 0.5448174780243932,
|
| 6401 |
+
"learning_rate": 7.161333333333334e-06,
|
| 6402 |
+
"loss": 2.374,
|
| 6403 |
+
"step": 17775
|
| 6404 |
+
},
|
| 6405 |
+
{
|
| 6406 |
+
"epoch": 0.356,
|
| 6407 |
+
"grad_norm": 0.5487711297157323,
|
| 6408 |
+
"learning_rate": 7.155777777777778e-06,
|
| 6409 |
+
"loss": 2.3872,
|
| 6410 |
+
"step": 17800
|
| 6411 |
+
},
|
| 6412 |
+
{
|
| 6413 |
+
"epoch": 0.356,
|
| 6414 |
+
"eval_loss": 2.3928709030151367,
|
| 6415 |
+
"eval_runtime": 31.7364,
|
| 6416 |
+
"eval_samples_per_second": 3.214,
|
| 6417 |
+
"eval_steps_per_second": 1.607,
|
| 6418 |
+
"step": 17800
|
| 6419 |
+
},
|
| 6420 |
+
{
|
| 6421 |
+
"epoch": 0.3565,
|
| 6422 |
+
"grad_norm": 0.5749112760792647,
|
| 6423 |
+
"learning_rate": 7.150222222222223e-06,
|
| 6424 |
+
"loss": 2.375,
|
| 6425 |
+
"step": 17825
|
| 6426 |
+
},
|
| 6427 |
+
{
|
| 6428 |
+
"epoch": 0.357,
|
| 6429 |
+
"grad_norm": 0.5657127084376901,
|
| 6430 |
+
"learning_rate": 7.144666666666668e-06,
|
| 6431 |
+
"loss": 2.3635,
|
| 6432 |
+
"step": 17850
|
| 6433 |
+
},
|
| 6434 |
+
{
|
| 6435 |
+
"epoch": 0.3575,
|
| 6436 |
+
"grad_norm": 0.5552559911086609,
|
| 6437 |
+
"learning_rate": 7.139111111111112e-06,
|
| 6438 |
+
"loss": 2.3791,
|
| 6439 |
+
"step": 17875
|
| 6440 |
+
},
|
| 6441 |
+
{
|
| 6442 |
+
"epoch": 0.358,
|
| 6443 |
+
"grad_norm": 0.5587079571658956,
|
| 6444 |
+
"learning_rate": 7.133555555555556e-06,
|
| 6445 |
+
"loss": 2.3792,
|
| 6446 |
+
"step": 17900
|
| 6447 |
+
},
|
| 6448 |
+
{
|
| 6449 |
+
"epoch": 0.358,
|
| 6450 |
+
"eval_loss": 2.39250111579895,
|
| 6451 |
+
"eval_runtime": 31.8377,
|
| 6452 |
+
"eval_samples_per_second": 3.204,
|
| 6453 |
+
"eval_steps_per_second": 1.602,
|
| 6454 |
+
"step": 17900
|
| 6455 |
+
},
|
| 6456 |
+
{
|
| 6457 |
+
"epoch": 0.3585,
|
| 6458 |
+
"grad_norm": 0.5476769108414363,
|
| 6459 |
+
"learning_rate": 7.128e-06,
|
| 6460 |
+
"loss": 2.3796,
|
| 6461 |
+
"step": 17925
|
| 6462 |
+
},
|
| 6463 |
+
{
|
| 6464 |
+
"epoch": 0.359,
|
| 6465 |
+
"grad_norm": 0.5519286017800472,
|
| 6466 |
+
"learning_rate": 7.1224444444444454e-06,
|
| 6467 |
+
"loss": 2.3689,
|
| 6468 |
+
"step": 17950
|
| 6469 |
+
},
|
| 6470 |
+
{
|
| 6471 |
+
"epoch": 0.3595,
|
| 6472 |
+
"grad_norm": 0.5690523665272621,
|
| 6473 |
+
"learning_rate": 7.11688888888889e-06,
|
| 6474 |
+
"loss": 2.3758,
|
| 6475 |
+
"step": 17975
|
| 6476 |
+
},
|
| 6477 |
+
{
|
| 6478 |
+
"epoch": 0.36,
|
| 6479 |
+
"grad_norm": 0.575484852893059,
|
| 6480 |
+
"learning_rate": 7.111333333333333e-06,
|
| 6481 |
+
"loss": 2.3723,
|
| 6482 |
+
"step": 18000
|
| 6483 |
+
},
|
| 6484 |
+
{
|
| 6485 |
+
"epoch": 0.36,
|
| 6486 |
+
"eval_loss": 2.3920133113861084,
|
| 6487 |
+
"eval_runtime": 31.9286,
|
| 6488 |
+
"eval_samples_per_second": 3.195,
|
| 6489 |
+
"eval_steps_per_second": 1.597,
|
| 6490 |
+
"step": 18000
|
| 6491 |
}
|
| 6492 |
],
|
| 6493 |
"logging_steps": 25,
|
|
|
|
| 6507 |
"attributes": {}
|
| 6508 |
}
|
| 6509 |
},
|
| 6510 |
+
"total_flos": 5.729764136988967e+19,
|
| 6511 |
"train_batch_size": 1,
|
| 6512 |
"trial_name": null,
|
| 6513 |
"trial_params": null
|