Training checkpoint at step 24000
Browse files- trainer_state.json +366 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8288,6 +8288,366 @@
|
|
| 8288 |
"eval_samples_per_second": 3.211,
|
| 8289 |
"eval_steps_per_second": 1.605,
|
| 8290 |
"step": 23000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8291 |
}
|
| 8292 |
],
|
| 8293 |
"logging_steps": 25,
|
|
@@ -8307,7 +8667,7 @@
|
|
| 8307 |
"attributes": {}
|
| 8308 |
}
|
| 8309 |
},
|
| 8310 |
-
"total_flos": 7.
|
| 8311 |
"train_batch_size": 1,
|
| 8312 |
"trial_name": null,
|
| 8313 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 24000,
|
| 3 |
+
"best_metric": 2.3842599391937256,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-24000",
|
| 5 |
+
"epoch": 0.48,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 24000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8288 |
"eval_samples_per_second": 3.211,
|
| 8289 |
"eval_steps_per_second": 1.605,
|
| 8290 |
"step": 23000
|
| 8291 |
+
},
|
| 8292 |
+
{
|
| 8293 |
+
"epoch": 0.4605,
|
| 8294 |
+
"grad_norm": 0.5644614073323889,
|
| 8295 |
+
"learning_rate": 5.994666666666668e-06,
|
| 8296 |
+
"loss": 2.3627,
|
| 8297 |
+
"step": 23025
|
| 8298 |
+
},
|
| 8299 |
+
{
|
| 8300 |
+
"epoch": 0.461,
|
| 8301 |
+
"grad_norm": 0.561196100799294,
|
| 8302 |
+
"learning_rate": 5.989111111111111e-06,
|
| 8303 |
+
"loss": 2.373,
|
| 8304 |
+
"step": 23050
|
| 8305 |
+
},
|
| 8306 |
+
{
|
| 8307 |
+
"epoch": 0.4615,
|
| 8308 |
+
"grad_norm": 0.5988172465498709,
|
| 8309 |
+
"learning_rate": 5.983555555555556e-06,
|
| 8310 |
+
"loss": 2.3625,
|
| 8311 |
+
"step": 23075
|
| 8312 |
+
},
|
| 8313 |
+
{
|
| 8314 |
+
"epoch": 0.462,
|
| 8315 |
+
"grad_norm": 0.5561927981892911,
|
| 8316 |
+
"learning_rate": 5.978e-06,
|
| 8317 |
+
"loss": 2.366,
|
| 8318 |
+
"step": 23100
|
| 8319 |
+
},
|
| 8320 |
+
{
|
| 8321 |
+
"epoch": 0.462,
|
| 8322 |
+
"eval_loss": 2.3851592540740967,
|
| 8323 |
+
"eval_runtime": 31.9972,
|
| 8324 |
+
"eval_samples_per_second": 3.188,
|
| 8325 |
+
"eval_steps_per_second": 1.594,
|
| 8326 |
+
"step": 23100
|
| 8327 |
+
},
|
| 8328 |
+
{
|
| 8329 |
+
"epoch": 0.4625,
|
| 8330 |
+
"grad_norm": 0.5473375939412587,
|
| 8331 |
+
"learning_rate": 5.9724444444444454e-06,
|
| 8332 |
+
"loss": 2.3577,
|
| 8333 |
+
"step": 23125
|
| 8334 |
+
},
|
| 8335 |
+
{
|
| 8336 |
+
"epoch": 0.463,
|
| 8337 |
+
"grad_norm": 0.5422432723666715,
|
| 8338 |
+
"learning_rate": 5.96688888888889e-06,
|
| 8339 |
+
"loss": 2.3724,
|
| 8340 |
+
"step": 23150
|
| 8341 |
+
},
|
| 8342 |
+
{
|
| 8343 |
+
"epoch": 0.4635,
|
| 8344 |
+
"grad_norm": 0.5459369802725026,
|
| 8345 |
+
"learning_rate": 5.961333333333333e-06,
|
| 8346 |
+
"loss": 2.3693,
|
| 8347 |
+
"step": 23175
|
| 8348 |
+
},
|
| 8349 |
+
{
|
| 8350 |
+
"epoch": 0.464,
|
| 8351 |
+
"grad_norm": 0.5602391995824985,
|
| 8352 |
+
"learning_rate": 5.955777777777778e-06,
|
| 8353 |
+
"loss": 2.3662,
|
| 8354 |
+
"step": 23200
|
| 8355 |
+
},
|
| 8356 |
+
{
|
| 8357 |
+
"epoch": 0.464,
|
| 8358 |
+
"eval_loss": 2.384812593460083,
|
| 8359 |
+
"eval_runtime": 31.7736,
|
| 8360 |
+
"eval_samples_per_second": 3.21,
|
| 8361 |
+
"eval_steps_per_second": 1.605,
|
| 8362 |
+
"step": 23200
|
| 8363 |
+
},
|
| 8364 |
+
{
|
| 8365 |
+
"epoch": 0.4645,
|
| 8366 |
+
"grad_norm": 0.5382771454200044,
|
| 8367 |
+
"learning_rate": 5.950222222222223e-06,
|
| 8368 |
+
"loss": 2.373,
|
| 8369 |
+
"step": 23225
|
| 8370 |
+
},
|
| 8371 |
+
{
|
| 8372 |
+
"epoch": 0.465,
|
| 8373 |
+
"grad_norm": 0.5616408548500356,
|
| 8374 |
+
"learning_rate": 5.9446666666666675e-06,
|
| 8375 |
+
"loss": 2.3744,
|
| 8376 |
+
"step": 23250
|
| 8377 |
+
},
|
| 8378 |
+
{
|
| 8379 |
+
"epoch": 0.4655,
|
| 8380 |
+
"grad_norm": 0.5626270768454595,
|
| 8381 |
+
"learning_rate": 5.939111111111111e-06,
|
| 8382 |
+
"loss": 2.3745,
|
| 8383 |
+
"step": 23275
|
| 8384 |
+
},
|
| 8385 |
+
{
|
| 8386 |
+
"epoch": 0.466,
|
| 8387 |
+
"grad_norm": 0.5771198592247021,
|
| 8388 |
+
"learning_rate": 5.933555555555555e-06,
|
| 8389 |
+
"loss": 2.3712,
|
| 8390 |
+
"step": 23300
|
| 8391 |
+
},
|
| 8392 |
+
{
|
| 8393 |
+
"epoch": 0.466,
|
| 8394 |
+
"eval_loss": 2.385037660598755,
|
| 8395 |
+
"eval_runtime": 31.6688,
|
| 8396 |
+
"eval_samples_per_second": 3.221,
|
| 8397 |
+
"eval_steps_per_second": 1.61,
|
| 8398 |
+
"step": 23300
|
| 8399 |
+
},
|
| 8400 |
+
{
|
| 8401 |
+
"epoch": 0.4665,
|
| 8402 |
+
"grad_norm": 0.553677767303205,
|
| 8403 |
+
"learning_rate": 5.928000000000001e-06,
|
| 8404 |
+
"loss": 2.3688,
|
| 8405 |
+
"step": 23325
|
| 8406 |
+
},
|
| 8407 |
+
{
|
| 8408 |
+
"epoch": 0.467,
|
| 8409 |
+
"grad_norm": 0.5761122434148291,
|
| 8410 |
+
"learning_rate": 5.922444444444445e-06,
|
| 8411 |
+
"loss": 2.3697,
|
| 8412 |
+
"step": 23350
|
| 8413 |
+
},
|
| 8414 |
+
{
|
| 8415 |
+
"epoch": 0.4675,
|
| 8416 |
+
"grad_norm": 0.5776134096430138,
|
| 8417 |
+
"learning_rate": 5.9168888888888895e-06,
|
| 8418 |
+
"loss": 2.3696,
|
| 8419 |
+
"step": 23375
|
| 8420 |
+
},
|
| 8421 |
+
{
|
| 8422 |
+
"epoch": 0.468,
|
| 8423 |
+
"grad_norm": 0.5410943763458229,
|
| 8424 |
+
"learning_rate": 5.911333333333333e-06,
|
| 8425 |
+
"loss": 2.3748,
|
| 8426 |
+
"step": 23400
|
| 8427 |
+
},
|
| 8428 |
+
{
|
| 8429 |
+
"epoch": 0.468,
|
| 8430 |
+
"eval_loss": 2.3850579261779785,
|
| 8431 |
+
"eval_runtime": 31.7506,
|
| 8432 |
+
"eval_samples_per_second": 3.213,
|
| 8433 |
+
"eval_steps_per_second": 1.606,
|
| 8434 |
+
"step": 23400
|
| 8435 |
+
},
|
| 8436 |
+
{
|
| 8437 |
+
"epoch": 0.4685,
|
| 8438 |
+
"grad_norm": 0.5496846088073756,
|
| 8439 |
+
"learning_rate": 5.905777777777778e-06,
|
| 8440 |
+
"loss": 2.3631,
|
| 8441 |
+
"step": 23425
|
| 8442 |
+
},
|
| 8443 |
+
{
|
| 8444 |
+
"epoch": 0.469,
|
| 8445 |
+
"grad_norm": 0.5489837887647091,
|
| 8446 |
+
"learning_rate": 5.900222222222223e-06,
|
| 8447 |
+
"loss": 2.3752,
|
| 8448 |
+
"step": 23450
|
| 8449 |
+
},
|
| 8450 |
+
{
|
| 8451 |
+
"epoch": 0.4695,
|
| 8452 |
+
"grad_norm": 0.5595321821458019,
|
| 8453 |
+
"learning_rate": 5.894666666666667e-06,
|
| 8454 |
+
"loss": 2.3681,
|
| 8455 |
+
"step": 23475
|
| 8456 |
+
},
|
| 8457 |
+
{
|
| 8458 |
+
"epoch": 0.47,
|
| 8459 |
+
"grad_norm": 0.5441176871533538,
|
| 8460 |
+
"learning_rate": 5.889111111111112e-06,
|
| 8461 |
+
"loss": 2.3689,
|
| 8462 |
+
"step": 23500
|
| 8463 |
+
},
|
| 8464 |
+
{
|
| 8465 |
+
"epoch": 0.47,
|
| 8466 |
+
"eval_loss": 2.3847615718841553,
|
| 8467 |
+
"eval_runtime": 31.7515,
|
| 8468 |
+
"eval_samples_per_second": 3.212,
|
| 8469 |
+
"eval_steps_per_second": 1.606,
|
| 8470 |
+
"step": 23500
|
| 8471 |
+
},
|
| 8472 |
+
{
|
| 8473 |
+
"epoch": 0.4705,
|
| 8474 |
+
"grad_norm": 0.5591005943894303,
|
| 8475 |
+
"learning_rate": 5.883555555555556e-06,
|
| 8476 |
+
"loss": 2.3687,
|
| 8477 |
+
"step": 23525
|
| 8478 |
+
},
|
| 8479 |
+
{
|
| 8480 |
+
"epoch": 0.471,
|
| 8481 |
+
"grad_norm": 0.5569068986313633,
|
| 8482 |
+
"learning_rate": 5.878e-06,
|
| 8483 |
+
"loss": 2.3579,
|
| 8484 |
+
"step": 23550
|
| 8485 |
+
},
|
| 8486 |
+
{
|
| 8487 |
+
"epoch": 0.4715,
|
| 8488 |
+
"grad_norm": 0.5544550604142251,
|
| 8489 |
+
"learning_rate": 5.872444444444445e-06,
|
| 8490 |
+
"loss": 2.3654,
|
| 8491 |
+
"step": 23575
|
| 8492 |
+
},
|
| 8493 |
+
{
|
| 8494 |
+
"epoch": 0.472,
|
| 8495 |
+
"grad_norm": 0.5682698532685105,
|
| 8496 |
+
"learning_rate": 5.86688888888889e-06,
|
| 8497 |
+
"loss": 2.3686,
|
| 8498 |
+
"step": 23600
|
| 8499 |
+
},
|
| 8500 |
+
{
|
| 8501 |
+
"epoch": 0.472,
|
| 8502 |
+
"eval_loss": 2.384906053543091,
|
| 8503 |
+
"eval_runtime": 31.7623,
|
| 8504 |
+
"eval_samples_per_second": 3.211,
|
| 8505 |
+
"eval_steps_per_second": 1.606,
|
| 8506 |
+
"step": 23600
|
| 8507 |
+
},
|
| 8508 |
+
{
|
| 8509 |
+
"epoch": 0.4725,
|
| 8510 |
+
"grad_norm": 0.5754081011772445,
|
| 8511 |
+
"learning_rate": 5.8613333333333335e-06,
|
| 8512 |
+
"loss": 2.3629,
|
| 8513 |
+
"step": 23625
|
| 8514 |
+
},
|
| 8515 |
+
{
|
| 8516 |
+
"epoch": 0.473,
|
| 8517 |
+
"grad_norm": 0.605492062724259,
|
| 8518 |
+
"learning_rate": 5.855777777777778e-06,
|
| 8519 |
+
"loss": 2.3702,
|
| 8520 |
+
"step": 23650
|
| 8521 |
+
},
|
| 8522 |
+
{
|
| 8523 |
+
"epoch": 0.4735,
|
| 8524 |
+
"grad_norm": 0.5407520724247802,
|
| 8525 |
+
"learning_rate": 5.850222222222222e-06,
|
| 8526 |
+
"loss": 2.3652,
|
| 8527 |
+
"step": 23675
|
| 8528 |
+
},
|
| 8529 |
+
{
|
| 8530 |
+
"epoch": 0.474,
|
| 8531 |
+
"grad_norm": 0.5531865604429913,
|
| 8532 |
+
"learning_rate": 5.8446666666666676e-06,
|
| 8533 |
+
"loss": 2.3724,
|
| 8534 |
+
"step": 23700
|
| 8535 |
+
},
|
| 8536 |
+
{
|
| 8537 |
+
"epoch": 0.474,
|
| 8538 |
+
"eval_loss": 2.3844547271728516,
|
| 8539 |
+
"eval_runtime": 31.833,
|
| 8540 |
+
"eval_samples_per_second": 3.204,
|
| 8541 |
+
"eval_steps_per_second": 1.602,
|
| 8542 |
+
"step": 23700
|
| 8543 |
+
},
|
| 8544 |
+
{
|
| 8545 |
+
"epoch": 0.4745,
|
| 8546 |
+
"grad_norm": 0.573840223481603,
|
| 8547 |
+
"learning_rate": 5.839111111111112e-06,
|
| 8548 |
+
"loss": 2.365,
|
| 8549 |
+
"step": 23725
|
| 8550 |
+
},
|
| 8551 |
+
{
|
| 8552 |
+
"epoch": 0.475,
|
| 8553 |
+
"grad_norm": 0.545580569851831,
|
| 8554 |
+
"learning_rate": 5.8335555555555555e-06,
|
| 8555 |
+
"loss": 2.3813,
|
| 8556 |
+
"step": 23750
|
| 8557 |
+
},
|
| 8558 |
+
{
|
| 8559 |
+
"epoch": 0.4755,
|
| 8560 |
+
"grad_norm": 0.551471960312376,
|
| 8561 |
+
"learning_rate": 5.828e-06,
|
| 8562 |
+
"loss": 2.3617,
|
| 8563 |
+
"step": 23775
|
| 8564 |
+
},
|
| 8565 |
+
{
|
| 8566 |
+
"epoch": 0.476,
|
| 8567 |
+
"grad_norm": 0.5953130526303944,
|
| 8568 |
+
"learning_rate": 5.822444444444445e-06,
|
| 8569 |
+
"loss": 2.3781,
|
| 8570 |
+
"step": 23800
|
| 8571 |
+
},
|
| 8572 |
+
{
|
| 8573 |
+
"epoch": 0.476,
|
| 8574 |
+
"eval_loss": 2.38433575630188,
|
| 8575 |
+
"eval_runtime": 31.8506,
|
| 8576 |
+
"eval_samples_per_second": 3.202,
|
| 8577 |
+
"eval_steps_per_second": 1.601,
|
| 8578 |
+
"step": 23800
|
| 8579 |
+
},
|
| 8580 |
+
{
|
| 8581 |
+
"epoch": 0.4765,
|
| 8582 |
+
"grad_norm": 0.5604797565202618,
|
| 8583 |
+
"learning_rate": 5.81688888888889e-06,
|
| 8584 |
+
"loss": 2.3716,
|
| 8585 |
+
"step": 23825
|
| 8586 |
+
},
|
| 8587 |
+
{
|
| 8588 |
+
"epoch": 0.477,
|
| 8589 |
+
"grad_norm": 0.554661200228578,
|
| 8590 |
+
"learning_rate": 5.811333333333333e-06,
|
| 8591 |
+
"loss": 2.3724,
|
| 8592 |
+
"step": 23850
|
| 8593 |
+
},
|
| 8594 |
+
{
|
| 8595 |
+
"epoch": 0.4775,
|
| 8596 |
+
"grad_norm": 0.5534736868914567,
|
| 8597 |
+
"learning_rate": 5.8057777777777775e-06,
|
| 8598 |
+
"loss": 2.3754,
|
| 8599 |
+
"step": 23875
|
| 8600 |
+
},
|
| 8601 |
+
{
|
| 8602 |
+
"epoch": 0.478,
|
| 8603 |
+
"grad_norm": 0.541434243018937,
|
| 8604 |
+
"learning_rate": 5.800222222222223e-06,
|
| 8605 |
+
"loss": 2.3612,
|
| 8606 |
+
"step": 23900
|
| 8607 |
+
},
|
| 8608 |
+
{
|
| 8609 |
+
"epoch": 0.478,
|
| 8610 |
+
"eval_loss": 2.3843014240264893,
|
| 8611 |
+
"eval_runtime": 31.7803,
|
| 8612 |
+
"eval_samples_per_second": 3.21,
|
| 8613 |
+
"eval_steps_per_second": 1.605,
|
| 8614 |
+
"step": 23900
|
| 8615 |
+
},
|
| 8616 |
+
{
|
| 8617 |
+
"epoch": 0.4785,
|
| 8618 |
+
"grad_norm": 0.5557683143124796,
|
| 8619 |
+
"learning_rate": 5.794666666666667e-06,
|
| 8620 |
+
"loss": 2.3639,
|
| 8621 |
+
"step": 23925
|
| 8622 |
+
},
|
| 8623 |
+
{
|
| 8624 |
+
"epoch": 0.479,
|
| 8625 |
+
"grad_norm": 0.5799527873689908,
|
| 8626 |
+
"learning_rate": 5.789111111111112e-06,
|
| 8627 |
+
"loss": 2.373,
|
| 8628 |
+
"step": 23950
|
| 8629 |
+
},
|
| 8630 |
+
{
|
| 8631 |
+
"epoch": 0.4795,
|
| 8632 |
+
"grad_norm": 0.590904770982699,
|
| 8633 |
+
"learning_rate": 5.783555555555556e-06,
|
| 8634 |
+
"loss": 2.3778,
|
| 8635 |
+
"step": 23975
|
| 8636 |
+
},
|
| 8637 |
+
{
|
| 8638 |
+
"epoch": 0.48,
|
| 8639 |
+
"grad_norm": 0.5561040991296016,
|
| 8640 |
+
"learning_rate": 5.778e-06,
|
| 8641 |
+
"loss": 2.3552,
|
| 8642 |
+
"step": 24000
|
| 8643 |
+
},
|
| 8644 |
+
{
|
| 8645 |
+
"epoch": 0.48,
|
| 8646 |
+
"eval_loss": 2.3842599391937256,
|
| 8647 |
+
"eval_runtime": 31.7209,
|
| 8648 |
+
"eval_samples_per_second": 3.216,
|
| 8649 |
+
"eval_steps_per_second": 1.608,
|
| 8650 |
+
"step": 24000
|
| 8651 |
}
|
| 8652 |
],
|
| 8653 |
"logging_steps": 25,
|
|
|
|
| 8667 |
"attributes": {}
|
| 8668 |
}
|
| 8669 |
},
|
| 8670 |
+
"total_flos": 7.63968551598529e+19,
|
| 8671 |
"train_batch_size": 1,
|
| 8672 |
"trial_name": null,
|
| 8673 |
"trial_params": null
|