Training checkpoint at step 13000
Browse files- trainer_state.json +366 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4328,6 +4328,366 @@
|
|
| 4328 |
"eval_samples_per_second": 3.216,
|
| 4329 |
"eval_steps_per_second": 1.608,
|
| 4330 |
"step": 12000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4331 |
}
|
| 4332 |
],
|
| 4333 |
"logging_steps": 25,
|
|
@@ -4347,7 +4707,7 @@
|
|
| 4347 |
"attributes": {}
|
| 4348 |
}
|
| 4349 |
},
|
| 4350 |
-
"total_flos":
|
| 4351 |
"train_batch_size": 1,
|
| 4352 |
"trial_name": null,
|
| 4353 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 13000,
|
| 3 |
+
"best_metric": 2.4009385108947754,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-13000",
|
| 5 |
+
"epoch": 0.26,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 13000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4328 |
"eval_samples_per_second": 3.216,
|
| 4329 |
"eval_steps_per_second": 1.608,
|
| 4330 |
"step": 12000
|
| 4331 |
+
},
|
| 4332 |
+
{
|
| 4333 |
+
"epoch": 0.2405,
|
| 4334 |
+
"grad_norm": 0.5687487747515707,
|
| 4335 |
+
"learning_rate": 8.43911111111111e-06,
|
| 4336 |
+
"loss": 2.3859,
|
| 4337 |
+
"step": 12025
|
| 4338 |
+
},
|
| 4339 |
+
{
|
| 4340 |
+
"epoch": 0.241,
|
| 4341 |
+
"grad_norm": 0.6156971193882954,
|
| 4342 |
+
"learning_rate": 8.433555555555556e-06,
|
| 4343 |
+
"loss": 2.3936,
|
| 4344 |
+
"step": 12050
|
| 4345 |
+
},
|
| 4346 |
+
{
|
| 4347 |
+
"epoch": 0.2415,
|
| 4348 |
+
"grad_norm": 0.5735725917481376,
|
| 4349 |
+
"learning_rate": 8.428000000000001e-06,
|
| 4350 |
+
"loss": 2.3867,
|
| 4351 |
+
"step": 12075
|
| 4352 |
+
},
|
| 4353 |
+
{
|
| 4354 |
+
"epoch": 0.242,
|
| 4355 |
+
"grad_norm": 0.5900311312717111,
|
| 4356 |
+
"learning_rate": 8.422444444444445e-06,
|
| 4357 |
+
"loss": 2.381,
|
| 4358 |
+
"step": 12100
|
| 4359 |
+
},
|
| 4360 |
+
{
|
| 4361 |
+
"epoch": 0.242,
|
| 4362 |
+
"eval_loss": 2.402616262435913,
|
| 4363 |
+
"eval_runtime": 31.728,
|
| 4364 |
+
"eval_samples_per_second": 3.215,
|
| 4365 |
+
"eval_steps_per_second": 1.607,
|
| 4366 |
+
"step": 12100
|
| 4367 |
+
},
|
| 4368 |
+
{
|
| 4369 |
+
"epoch": 0.2425,
|
| 4370 |
+
"grad_norm": 0.6210456413331185,
|
| 4371 |
+
"learning_rate": 8.41688888888889e-06,
|
| 4372 |
+
"loss": 2.3897,
|
| 4373 |
+
"step": 12125
|
| 4374 |
+
},
|
| 4375 |
+
{
|
| 4376 |
+
"epoch": 0.243,
|
| 4377 |
+
"grad_norm": 0.564076844370536,
|
| 4378 |
+
"learning_rate": 8.411333333333334e-06,
|
| 4379 |
+
"loss": 2.3789,
|
| 4380 |
+
"step": 12150
|
| 4381 |
+
},
|
| 4382 |
+
{
|
| 4383 |
+
"epoch": 0.2435,
|
| 4384 |
+
"grad_norm": 0.5787670607206897,
|
| 4385 |
+
"learning_rate": 8.405777777777779e-06,
|
| 4386 |
+
"loss": 2.3927,
|
| 4387 |
+
"step": 12175
|
| 4388 |
+
},
|
| 4389 |
+
{
|
| 4390 |
+
"epoch": 0.244,
|
| 4391 |
+
"grad_norm": 0.557686861390105,
|
| 4392 |
+
"learning_rate": 8.400222222222222e-06,
|
| 4393 |
+
"loss": 2.3761,
|
| 4394 |
+
"step": 12200
|
| 4395 |
+
},
|
| 4396 |
+
{
|
| 4397 |
+
"epoch": 0.244,
|
| 4398 |
+
"eval_loss": 2.4025542736053467,
|
| 4399 |
+
"eval_runtime": 31.8116,
|
| 4400 |
+
"eval_samples_per_second": 3.206,
|
| 4401 |
+
"eval_steps_per_second": 1.603,
|
| 4402 |
+
"step": 12200
|
| 4403 |
+
},
|
| 4404 |
+
{
|
| 4405 |
+
"epoch": 0.2445,
|
| 4406 |
+
"grad_norm": 0.5642621664909974,
|
| 4407 |
+
"learning_rate": 8.394666666666668e-06,
|
| 4408 |
+
"loss": 2.3787,
|
| 4409 |
+
"step": 12225
|
| 4410 |
+
},
|
| 4411 |
+
{
|
| 4412 |
+
"epoch": 0.245,
|
| 4413 |
+
"grad_norm": 0.5812642245692796,
|
| 4414 |
+
"learning_rate": 8.389111111111113e-06,
|
| 4415 |
+
"loss": 2.3888,
|
| 4416 |
+
"step": 12250
|
| 4417 |
+
},
|
| 4418 |
+
{
|
| 4419 |
+
"epoch": 0.2455,
|
| 4420 |
+
"grad_norm": 0.5903665572148793,
|
| 4421 |
+
"learning_rate": 8.383555555555557e-06,
|
| 4422 |
+
"loss": 2.3874,
|
| 4423 |
+
"step": 12275
|
| 4424 |
+
},
|
| 4425 |
+
{
|
| 4426 |
+
"epoch": 0.246,
|
| 4427 |
+
"grad_norm": 0.5752826274496151,
|
| 4428 |
+
"learning_rate": 8.378e-06,
|
| 4429 |
+
"loss": 2.3851,
|
| 4430 |
+
"step": 12300
|
| 4431 |
+
},
|
| 4432 |
+
{
|
| 4433 |
+
"epoch": 0.246,
|
| 4434 |
+
"eval_loss": 2.4024178981781006,
|
| 4435 |
+
"eval_runtime": 31.9538,
|
| 4436 |
+
"eval_samples_per_second": 3.192,
|
| 4437 |
+
"eval_steps_per_second": 1.596,
|
| 4438 |
+
"step": 12300
|
| 4439 |
+
},
|
| 4440 |
+
{
|
| 4441 |
+
"epoch": 0.2465,
|
| 4442 |
+
"grad_norm": 0.5625780105871633,
|
| 4443 |
+
"learning_rate": 8.372444444444445e-06,
|
| 4444 |
+
"loss": 2.3857,
|
| 4445 |
+
"step": 12325
|
| 4446 |
+
},
|
| 4447 |
+
{
|
| 4448 |
+
"epoch": 0.247,
|
| 4449 |
+
"grad_norm": 0.5516059110433715,
|
| 4450 |
+
"learning_rate": 8.36688888888889e-06,
|
| 4451 |
+
"loss": 2.387,
|
| 4452 |
+
"step": 12350
|
| 4453 |
+
},
|
| 4454 |
+
{
|
| 4455 |
+
"epoch": 0.2475,
|
| 4456 |
+
"grad_norm": 0.5743651124710031,
|
| 4457 |
+
"learning_rate": 8.361333333333334e-06,
|
| 4458 |
+
"loss": 2.3899,
|
| 4459 |
+
"step": 12375
|
| 4460 |
+
},
|
| 4461 |
+
{
|
| 4462 |
+
"epoch": 0.248,
|
| 4463 |
+
"grad_norm": 0.6065509345211424,
|
| 4464 |
+
"learning_rate": 8.355777777777778e-06,
|
| 4465 |
+
"loss": 2.3811,
|
| 4466 |
+
"step": 12400
|
| 4467 |
+
},
|
| 4468 |
+
{
|
| 4469 |
+
"epoch": 0.248,
|
| 4470 |
+
"eval_loss": 2.402189254760742,
|
| 4471 |
+
"eval_runtime": 31.7357,
|
| 4472 |
+
"eval_samples_per_second": 3.214,
|
| 4473 |
+
"eval_steps_per_second": 1.607,
|
| 4474 |
+
"step": 12400
|
| 4475 |
+
},
|
| 4476 |
+
{
|
| 4477 |
+
"epoch": 0.2485,
|
| 4478 |
+
"grad_norm": 0.569411806780091,
|
| 4479 |
+
"learning_rate": 8.350222222222223e-06,
|
| 4480 |
+
"loss": 2.3891,
|
| 4481 |
+
"step": 12425
|
| 4482 |
+
},
|
| 4483 |
+
{
|
| 4484 |
+
"epoch": 0.249,
|
| 4485 |
+
"grad_norm": 0.5781227404353481,
|
| 4486 |
+
"learning_rate": 8.344666666666668e-06,
|
| 4487 |
+
"loss": 2.3799,
|
| 4488 |
+
"step": 12450
|
| 4489 |
+
},
|
| 4490 |
+
{
|
| 4491 |
+
"epoch": 0.2495,
|
| 4492 |
+
"grad_norm": 0.5882770416548074,
|
| 4493 |
+
"learning_rate": 8.339111111111112e-06,
|
| 4494 |
+
"loss": 2.3921,
|
| 4495 |
+
"step": 12475
|
| 4496 |
+
},
|
| 4497 |
+
{
|
| 4498 |
+
"epoch": 0.25,
|
| 4499 |
+
"grad_norm": 0.6053137792053689,
|
| 4500 |
+
"learning_rate": 8.333555555555555e-06,
|
| 4501 |
+
"loss": 2.3923,
|
| 4502 |
+
"step": 12500
|
| 4503 |
+
},
|
| 4504 |
+
{
|
| 4505 |
+
"epoch": 0.25,
|
| 4506 |
+
"eval_loss": 2.401906967163086,
|
| 4507 |
+
"eval_runtime": 31.7052,
|
| 4508 |
+
"eval_samples_per_second": 3.217,
|
| 4509 |
+
"eval_steps_per_second": 1.609,
|
| 4510 |
+
"step": 12500
|
| 4511 |
+
},
|
| 4512 |
+
{
|
| 4513 |
+
"epoch": 0.2505,
|
| 4514 |
+
"grad_norm": 0.5493940361276148,
|
| 4515 |
+
"learning_rate": 8.328e-06,
|
| 4516 |
+
"loss": 2.3872,
|
| 4517 |
+
"step": 12525
|
| 4518 |
+
},
|
| 4519 |
+
{
|
| 4520 |
+
"epoch": 0.251,
|
| 4521 |
+
"grad_norm": 0.5844453837465953,
|
| 4522 |
+
"learning_rate": 8.322444444444446e-06,
|
| 4523 |
+
"loss": 2.3859,
|
| 4524 |
+
"step": 12550
|
| 4525 |
+
},
|
| 4526 |
+
{
|
| 4527 |
+
"epoch": 0.2515,
|
| 4528 |
+
"grad_norm": 0.589694030674745,
|
| 4529 |
+
"learning_rate": 8.31688888888889e-06,
|
| 4530 |
+
"loss": 2.3852,
|
| 4531 |
+
"step": 12575
|
| 4532 |
+
},
|
| 4533 |
+
{
|
| 4534 |
+
"epoch": 0.252,
|
| 4535 |
+
"grad_norm": 0.5985872367130171,
|
| 4536 |
+
"learning_rate": 8.311333333333333e-06,
|
| 4537 |
+
"loss": 2.378,
|
| 4538 |
+
"step": 12600
|
| 4539 |
+
},
|
| 4540 |
+
{
|
| 4541 |
+
"epoch": 0.252,
|
| 4542 |
+
"eval_loss": 2.4017632007598877,
|
| 4543 |
+
"eval_runtime": 31.8059,
|
| 4544 |
+
"eval_samples_per_second": 3.207,
|
| 4545 |
+
"eval_steps_per_second": 1.603,
|
| 4546 |
+
"step": 12600
|
| 4547 |
+
},
|
| 4548 |
+
{
|
| 4549 |
+
"epoch": 0.2525,
|
| 4550 |
+
"grad_norm": 0.6246560097732429,
|
| 4551 |
+
"learning_rate": 8.305777777777778e-06,
|
| 4552 |
+
"loss": 2.3891,
|
| 4553 |
+
"step": 12625
|
| 4554 |
+
},
|
| 4555 |
+
{
|
| 4556 |
+
"epoch": 0.253,
|
| 4557 |
+
"grad_norm": 0.5977851115835912,
|
| 4558 |
+
"learning_rate": 8.300222222222223e-06,
|
| 4559 |
+
"loss": 2.3884,
|
| 4560 |
+
"step": 12650
|
| 4561 |
+
},
|
| 4562 |
+
{
|
| 4563 |
+
"epoch": 0.2535,
|
| 4564 |
+
"grad_norm": 0.5535634109353079,
|
| 4565 |
+
"learning_rate": 8.294666666666667e-06,
|
| 4566 |
+
"loss": 2.3894,
|
| 4567 |
+
"step": 12675
|
| 4568 |
+
},
|
| 4569 |
+
{
|
| 4570 |
+
"epoch": 0.254,
|
| 4571 |
+
"grad_norm": 0.5647542662126371,
|
| 4572 |
+
"learning_rate": 8.289111111111112e-06,
|
| 4573 |
+
"loss": 2.3889,
|
| 4574 |
+
"step": 12700
|
| 4575 |
+
},
|
| 4576 |
+
{
|
| 4577 |
+
"epoch": 0.254,
|
| 4578 |
+
"eval_loss": 2.4015073776245117,
|
| 4579 |
+
"eval_runtime": 31.6682,
|
| 4580 |
+
"eval_samples_per_second": 3.221,
|
| 4581 |
+
"eval_steps_per_second": 1.61,
|
| 4582 |
+
"step": 12700
|
| 4583 |
+
},
|
| 4584 |
+
{
|
| 4585 |
+
"epoch": 0.2545,
|
| 4586 |
+
"grad_norm": 0.5689860381748764,
|
| 4587 |
+
"learning_rate": 8.283555555555556e-06,
|
| 4588 |
+
"loss": 2.391,
|
| 4589 |
+
"step": 12725
|
| 4590 |
+
},
|
| 4591 |
+
{
|
| 4592 |
+
"epoch": 0.255,
|
| 4593 |
+
"grad_norm": 0.5788815220722723,
|
| 4594 |
+
"learning_rate": 8.278000000000001e-06,
|
| 4595 |
+
"loss": 2.3746,
|
| 4596 |
+
"step": 12750
|
| 4597 |
+
},
|
| 4598 |
+
{
|
| 4599 |
+
"epoch": 0.2555,
|
| 4600 |
+
"grad_norm": 0.5746385277305921,
|
| 4601 |
+
"learning_rate": 8.272444444444445e-06,
|
| 4602 |
+
"loss": 2.3884,
|
| 4603 |
+
"step": 12775
|
| 4604 |
+
},
|
| 4605 |
+
{
|
| 4606 |
+
"epoch": 0.256,
|
| 4607 |
+
"grad_norm": 0.5952261074381101,
|
| 4608 |
+
"learning_rate": 8.26688888888889e-06,
|
| 4609 |
+
"loss": 2.387,
|
| 4610 |
+
"step": 12800
|
| 4611 |
+
},
|
| 4612 |
+
{
|
| 4613 |
+
"epoch": 0.256,
|
| 4614 |
+
"eval_loss": 2.401090383529663,
|
| 4615 |
+
"eval_runtime": 31.7518,
|
| 4616 |
+
"eval_samples_per_second": 3.212,
|
| 4617 |
+
"eval_steps_per_second": 1.606,
|
| 4618 |
+
"step": 12800
|
| 4619 |
+
},
|
| 4620 |
+
{
|
| 4621 |
+
"epoch": 0.2565,
|
| 4622 |
+
"grad_norm": 0.581914246490724,
|
| 4623 |
+
"learning_rate": 8.261333333333335e-06,
|
| 4624 |
+
"loss": 2.3879,
|
| 4625 |
+
"step": 12825
|
| 4626 |
+
},
|
| 4627 |
+
{
|
| 4628 |
+
"epoch": 0.257,
|
| 4629 |
+
"grad_norm": 0.5582195018164189,
|
| 4630 |
+
"learning_rate": 8.255777777777779e-06,
|
| 4631 |
+
"loss": 2.3783,
|
| 4632 |
+
"step": 12850
|
| 4633 |
+
},
|
| 4634 |
+
{
|
| 4635 |
+
"epoch": 0.2575,
|
| 4636 |
+
"grad_norm": 0.5633036552978725,
|
| 4637 |
+
"learning_rate": 8.250222222222222e-06,
|
| 4638 |
+
"loss": 2.3845,
|
| 4639 |
+
"step": 12875
|
| 4640 |
+
},
|
| 4641 |
+
{
|
| 4642 |
+
"epoch": 0.258,
|
| 4643 |
+
"grad_norm": 0.5613155523789654,
|
| 4644 |
+
"learning_rate": 8.244666666666667e-06,
|
| 4645 |
+
"loss": 2.3942,
|
| 4646 |
+
"step": 12900
|
| 4647 |
+
},
|
| 4648 |
+
{
|
| 4649 |
+
"epoch": 0.258,
|
| 4650 |
+
"eval_loss": 2.4014108180999756,
|
| 4651 |
+
"eval_runtime": 31.8052,
|
| 4652 |
+
"eval_samples_per_second": 3.207,
|
| 4653 |
+
"eval_steps_per_second": 1.604,
|
| 4654 |
+
"step": 12900
|
| 4655 |
+
},
|
| 4656 |
+
{
|
| 4657 |
+
"epoch": 0.2585,
|
| 4658 |
+
"grad_norm": 0.5906307979751212,
|
| 4659 |
+
"learning_rate": 8.239111111111113e-06,
|
| 4660 |
+
"loss": 2.3807,
|
| 4661 |
+
"step": 12925
|
| 4662 |
+
},
|
| 4663 |
+
{
|
| 4664 |
+
"epoch": 0.259,
|
| 4665 |
+
"grad_norm": 0.5786593603781868,
|
| 4666 |
+
"learning_rate": 8.233555555555556e-06,
|
| 4667 |
+
"loss": 2.3848,
|
| 4668 |
+
"step": 12950
|
| 4669 |
+
},
|
| 4670 |
+
{
|
| 4671 |
+
"epoch": 0.2595,
|
| 4672 |
+
"grad_norm": 0.5739057988147651,
|
| 4673 |
+
"learning_rate": 8.228e-06,
|
| 4674 |
+
"loss": 2.3841,
|
| 4675 |
+
"step": 12975
|
| 4676 |
+
},
|
| 4677 |
+
{
|
| 4678 |
+
"epoch": 0.26,
|
| 4679 |
+
"grad_norm": 0.5727067411665359,
|
| 4680 |
+
"learning_rate": 8.222444444444445e-06,
|
| 4681 |
+
"loss": 2.3771,
|
| 4682 |
+
"step": 13000
|
| 4683 |
+
},
|
| 4684 |
+
{
|
| 4685 |
+
"epoch": 0.26,
|
| 4686 |
+
"eval_loss": 2.4009385108947754,
|
| 4687 |
+
"eval_runtime": 31.8075,
|
| 4688 |
+
"eval_samples_per_second": 3.207,
|
| 4689 |
+
"eval_steps_per_second": 1.603,
|
| 4690 |
+
"step": 13000
|
| 4691 |
}
|
| 4692 |
],
|
| 4693 |
"logging_steps": 25,
|
|
|
|
| 4707 |
"attributes": {}
|
| 4708 |
}
|
| 4709 |
},
|
| 4710 |
+
"total_flos": 4.138162987825365e+19,
|
| 4711 |
"train_batch_size": 1,
|
| 4712 |
"trial_name": null,
|
| 4713 |
"trial_params": null
|