Training checkpoint at step 19000
Browse files- trainer_state.json +366 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6488,6 +6488,366 @@
|
|
| 6488 |
"eval_samples_per_second": 3.195,
|
| 6489 |
"eval_steps_per_second": 1.597,
|
| 6490 |
"step": 18000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6491 |
}
|
| 6492 |
],
|
| 6493 |
"logging_steps": 25,
|
|
@@ -6507,7 +6867,7 @@
|
|
| 6507 |
"attributes": {}
|
| 6508 |
}
|
| 6509 |
},
|
| 6510 |
-
"total_flos":
|
| 6511 |
"train_batch_size": 1,
|
| 6512 |
"trial_name": null,
|
| 6513 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 19000,
|
| 3 |
+
"best_metric": 2.390749454498291,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
|
| 5 |
+
"epoch": 0.38,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 19000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6488 |
"eval_samples_per_second": 3.195,
|
| 6489 |
"eval_steps_per_second": 1.597,
|
| 6490 |
"step": 18000
|
| 6491 |
+
},
|
| 6492 |
+
{
|
| 6493 |
+
"epoch": 0.3605,
|
| 6494 |
+
"grad_norm": 0.5779120077378331,
|
| 6495 |
+
"learning_rate": 7.105777777777778e-06,
|
| 6496 |
+
"loss": 2.3798,
|
| 6497 |
+
"step": 18025
|
| 6498 |
+
},
|
| 6499 |
+
{
|
| 6500 |
+
"epoch": 0.361,
|
| 6501 |
+
"grad_norm": 0.575309417070187,
|
| 6502 |
+
"learning_rate": 7.100222222222223e-06,
|
| 6503 |
+
"loss": 2.3875,
|
| 6504 |
+
"step": 18050
|
| 6505 |
+
},
|
| 6506 |
+
{
|
| 6507 |
+
"epoch": 0.3615,
|
| 6508 |
+
"grad_norm": 0.6000430306182747,
|
| 6509 |
+
"learning_rate": 7.0946666666666675e-06,
|
| 6510 |
+
"loss": 2.3727,
|
| 6511 |
+
"step": 18075
|
| 6512 |
+
},
|
| 6513 |
+
{
|
| 6514 |
+
"epoch": 0.362,
|
| 6515 |
+
"grad_norm": 0.5701734522791184,
|
| 6516 |
+
"learning_rate": 7.089111111111112e-06,
|
| 6517 |
+
"loss": 2.3793,
|
| 6518 |
+
"step": 18100
|
| 6519 |
+
},
|
| 6520 |
+
{
|
| 6521 |
+
"epoch": 0.362,
|
| 6522 |
+
"eval_loss": 2.392152786254883,
|
| 6523 |
+
"eval_runtime": 31.8363,
|
| 6524 |
+
"eval_samples_per_second": 3.204,
|
| 6525 |
+
"eval_steps_per_second": 1.602,
|
| 6526 |
+
"step": 18100
|
| 6527 |
+
},
|
| 6528 |
+
{
|
| 6529 |
+
"epoch": 0.3625,
|
| 6530 |
+
"grad_norm": 0.5731611332750656,
|
| 6531 |
+
"learning_rate": 7.083555555555555e-06,
|
| 6532 |
+
"loss": 2.3715,
|
| 6533 |
+
"step": 18125
|
| 6534 |
+
},
|
| 6535 |
+
{
|
| 6536 |
+
"epoch": 0.363,
|
| 6537 |
+
"grad_norm": 0.6114229583074544,
|
| 6538 |
+
"learning_rate": 7.078000000000001e-06,
|
| 6539 |
+
"loss": 2.383,
|
| 6540 |
+
"step": 18150
|
| 6541 |
+
},
|
| 6542 |
+
{
|
| 6543 |
+
"epoch": 0.3635,
|
| 6544 |
+
"grad_norm": 0.541007634609165,
|
| 6545 |
+
"learning_rate": 7.072444444444445e-06,
|
| 6546 |
+
"loss": 2.3686,
|
| 6547 |
+
"step": 18175
|
| 6548 |
+
},
|
| 6549 |
+
{
|
| 6550 |
+
"epoch": 0.364,
|
| 6551 |
+
"grad_norm": 0.5725748950012406,
|
| 6552 |
+
"learning_rate": 7.0668888888888895e-06,
|
| 6553 |
+
"loss": 2.3873,
|
| 6554 |
+
"step": 18200
|
| 6555 |
+
},
|
| 6556 |
+
{
|
| 6557 |
+
"epoch": 0.364,
|
| 6558 |
+
"eval_loss": 2.392261505126953,
|
| 6559 |
+
"eval_runtime": 31.7706,
|
| 6560 |
+
"eval_samples_per_second": 3.211,
|
| 6561 |
+
"eval_steps_per_second": 1.605,
|
| 6562 |
+
"step": 18200
|
| 6563 |
+
},
|
| 6564 |
+
{
|
| 6565 |
+
"epoch": 0.3645,
|
| 6566 |
+
"grad_norm": 0.5593670656564304,
|
| 6567 |
+
"learning_rate": 7.061333333333333e-06,
|
| 6568 |
+
"loss": 2.3804,
|
| 6569 |
+
"step": 18225
|
| 6570 |
+
},
|
| 6571 |
+
{
|
| 6572 |
+
"epoch": 0.365,
|
| 6573 |
+
"grad_norm": 0.6009795583649221,
|
| 6574 |
+
"learning_rate": 7.055777777777778e-06,
|
| 6575 |
+
"loss": 2.3795,
|
| 6576 |
+
"step": 18250
|
| 6577 |
+
},
|
| 6578 |
+
{
|
| 6579 |
+
"epoch": 0.3655,
|
| 6580 |
+
"grad_norm": 0.5664495345544722,
|
| 6581 |
+
"learning_rate": 7.050222222222223e-06,
|
| 6582 |
+
"loss": 2.3631,
|
| 6583 |
+
"step": 18275
|
| 6584 |
+
},
|
| 6585 |
+
{
|
| 6586 |
+
"epoch": 0.366,
|
| 6587 |
+
"grad_norm": 0.6104006309418994,
|
| 6588 |
+
"learning_rate": 7.044666666666667e-06,
|
| 6589 |
+
"loss": 2.3748,
|
| 6590 |
+
"step": 18300
|
| 6591 |
+
},
|
| 6592 |
+
{
|
| 6593 |
+
"epoch": 0.366,
|
| 6594 |
+
"eval_loss": 2.392148971557617,
|
| 6595 |
+
"eval_runtime": 31.734,
|
| 6596 |
+
"eval_samples_per_second": 3.214,
|
| 6597 |
+
"eval_steps_per_second": 1.607,
|
| 6598 |
+
"step": 18300
|
| 6599 |
+
},
|
| 6600 |
+
{
|
| 6601 |
+
"epoch": 0.3665,
|
| 6602 |
+
"grad_norm": 0.5506059883330837,
|
| 6603 |
+
"learning_rate": 7.039111111111112e-06,
|
| 6604 |
+
"loss": 2.3714,
|
| 6605 |
+
"step": 18325
|
| 6606 |
+
},
|
| 6607 |
+
{
|
| 6608 |
+
"epoch": 0.367,
|
| 6609 |
+
"grad_norm": 0.5621509156408089,
|
| 6610 |
+
"learning_rate": 7.033555555555556e-06,
|
| 6611 |
+
"loss": 2.368,
|
| 6612 |
+
"step": 18350
|
| 6613 |
+
},
|
| 6614 |
+
{
|
| 6615 |
+
"epoch": 0.3675,
|
| 6616 |
+
"grad_norm": 0.5587181787810226,
|
| 6617 |
+
"learning_rate": 7.028e-06,
|
| 6618 |
+
"loss": 2.3791,
|
| 6619 |
+
"step": 18375
|
| 6620 |
+
},
|
| 6621 |
+
{
|
| 6622 |
+
"epoch": 0.368,
|
| 6623 |
+
"grad_norm": 0.5677798724220077,
|
| 6624 |
+
"learning_rate": 7.022444444444445e-06,
|
| 6625 |
+
"loss": 2.384,
|
| 6626 |
+
"step": 18400
|
| 6627 |
+
},
|
| 6628 |
+
{
|
| 6629 |
+
"epoch": 0.368,
|
| 6630 |
+
"eval_loss": 2.391704559326172,
|
| 6631 |
+
"eval_runtime": 31.7798,
|
| 6632 |
+
"eval_samples_per_second": 3.21,
|
| 6633 |
+
"eval_steps_per_second": 1.605,
|
| 6634 |
+
"step": 18400
|
| 6635 |
+
},
|
| 6636 |
+
{
|
| 6637 |
+
"epoch": 0.3685,
|
| 6638 |
+
"grad_norm": 0.5905061339542746,
|
| 6639 |
+
"learning_rate": 7.01688888888889e-06,
|
| 6640 |
+
"loss": 2.3881,
|
| 6641 |
+
"step": 18425
|
| 6642 |
+
},
|
| 6643 |
+
{
|
| 6644 |
+
"epoch": 0.369,
|
| 6645 |
+
"grad_norm": 0.554978244766298,
|
| 6646 |
+
"learning_rate": 7.011333333333334e-06,
|
| 6647 |
+
"loss": 2.3683,
|
| 6648 |
+
"step": 18450
|
| 6649 |
+
},
|
| 6650 |
+
{
|
| 6651 |
+
"epoch": 0.3695,
|
| 6652 |
+
"grad_norm": 0.5517801842410981,
|
| 6653 |
+
"learning_rate": 7.005777777777778e-06,
|
| 6654 |
+
"loss": 2.3835,
|
| 6655 |
+
"step": 18475
|
| 6656 |
+
},
|
| 6657 |
+
{
|
| 6658 |
+
"epoch": 0.37,
|
| 6659 |
+
"grad_norm": 0.5501181046318251,
|
| 6660 |
+
"learning_rate": 7.000222222222222e-06,
|
| 6661 |
+
"loss": 2.374,
|
| 6662 |
+
"step": 18500
|
| 6663 |
+
},
|
| 6664 |
+
{
|
| 6665 |
+
"epoch": 0.37,
|
| 6666 |
+
"eval_loss": 2.3915836811065674,
|
| 6667 |
+
"eval_runtime": 31.7662,
|
| 6668 |
+
"eval_samples_per_second": 3.211,
|
| 6669 |
+
"eval_steps_per_second": 1.605,
|
| 6670 |
+
"step": 18500
|
| 6671 |
+
},
|
| 6672 |
+
{
|
| 6673 |
+
"epoch": 0.3705,
|
| 6674 |
+
"grad_norm": 0.576826996404141,
|
| 6675 |
+
"learning_rate": 6.9946666666666676e-06,
|
| 6676 |
+
"loss": 2.3819,
|
| 6677 |
+
"step": 18525
|
| 6678 |
+
},
|
| 6679 |
+
{
|
| 6680 |
+
"epoch": 0.371,
|
| 6681 |
+
"grad_norm": 0.5739797151959755,
|
| 6682 |
+
"learning_rate": 6.989111111111112e-06,
|
| 6683 |
+
"loss": 2.3794,
|
| 6684 |
+
"step": 18550
|
| 6685 |
+
},
|
| 6686 |
+
{
|
| 6687 |
+
"epoch": 0.3715,
|
| 6688 |
+
"grad_norm": 0.5511012262440002,
|
| 6689 |
+
"learning_rate": 6.9835555555555555e-06,
|
| 6690 |
+
"loss": 2.3894,
|
| 6691 |
+
"step": 18575
|
| 6692 |
+
},
|
| 6693 |
+
{
|
| 6694 |
+
"epoch": 0.372,
|
| 6695 |
+
"grad_norm": 0.5958849979817049,
|
| 6696 |
+
"learning_rate": 6.978e-06,
|
| 6697 |
+
"loss": 2.3674,
|
| 6698 |
+
"step": 18600
|
| 6699 |
+
},
|
| 6700 |
+
{
|
| 6701 |
+
"epoch": 0.372,
|
| 6702 |
+
"eval_loss": 2.391352415084839,
|
| 6703 |
+
"eval_runtime": 31.7756,
|
| 6704 |
+
"eval_samples_per_second": 3.21,
|
| 6705 |
+
"eval_steps_per_second": 1.605,
|
| 6706 |
+
"step": 18600
|
| 6707 |
+
},
|
| 6708 |
+
{
|
| 6709 |
+
"epoch": 0.3725,
|
| 6710 |
+
"grad_norm": 0.5595892595435197,
|
| 6711 |
+
"learning_rate": 6.972444444444445e-06,
|
| 6712 |
+
"loss": 2.3835,
|
| 6713 |
+
"step": 18625
|
| 6714 |
+
},
|
| 6715 |
+
{
|
| 6716 |
+
"epoch": 0.373,
|
| 6717 |
+
"grad_norm": 0.5946746403488841,
|
| 6718 |
+
"learning_rate": 6.96688888888889e-06,
|
| 6719 |
+
"loss": 2.3716,
|
| 6720 |
+
"step": 18650
|
| 6721 |
+
},
|
| 6722 |
+
{
|
| 6723 |
+
"epoch": 0.3735,
|
| 6724 |
+
"grad_norm": 0.5613740876716816,
|
| 6725 |
+
"learning_rate": 6.961333333333334e-06,
|
| 6726 |
+
"loss": 2.3843,
|
| 6727 |
+
"step": 18675
|
| 6728 |
+
},
|
| 6729 |
+
{
|
| 6730 |
+
"epoch": 0.374,
|
| 6731 |
+
"grad_norm": 0.58419422677193,
|
| 6732 |
+
"learning_rate": 6.9557777777777776e-06,
|
| 6733 |
+
"loss": 2.3883,
|
| 6734 |
+
"step": 18700
|
| 6735 |
+
},
|
| 6736 |
+
{
|
| 6737 |
+
"epoch": 0.374,
|
| 6738 |
+
"eval_loss": 2.391383409500122,
|
| 6739 |
+
"eval_runtime": 31.7182,
|
| 6740 |
+
"eval_samples_per_second": 3.216,
|
| 6741 |
+
"eval_steps_per_second": 1.608,
|
| 6742 |
+
"step": 18700
|
| 6743 |
+
},
|
| 6744 |
+
{
|
| 6745 |
+
"epoch": 0.3745,
|
| 6746 |
+
"grad_norm": 0.5508427755524951,
|
| 6747 |
+
"learning_rate": 6.950222222222223e-06,
|
| 6748 |
+
"loss": 2.3749,
|
| 6749 |
+
"step": 18725
|
| 6750 |
+
},
|
| 6751 |
+
{
|
| 6752 |
+
"epoch": 0.375,
|
| 6753 |
+
"grad_norm": 0.5686856026931271,
|
| 6754 |
+
"learning_rate": 6.944666666666667e-06,
|
| 6755 |
+
"loss": 2.38,
|
| 6756 |
+
"step": 18750
|
| 6757 |
+
},
|
| 6758 |
+
{
|
| 6759 |
+
"epoch": 0.3755,
|
| 6760 |
+
"grad_norm": 0.5531747783480245,
|
| 6761 |
+
"learning_rate": 6.939111111111112e-06,
|
| 6762 |
+
"loss": 2.3718,
|
| 6763 |
+
"step": 18775
|
| 6764 |
+
},
|
| 6765 |
+
{
|
| 6766 |
+
"epoch": 0.376,
|
| 6767 |
+
"grad_norm": 0.5800045444885175,
|
| 6768 |
+
"learning_rate": 6.933555555555556e-06,
|
| 6769 |
+
"loss": 2.3703,
|
| 6770 |
+
"step": 18800
|
| 6771 |
+
},
|
| 6772 |
+
{
|
| 6773 |
+
"epoch": 0.376,
|
| 6774 |
+
"eval_loss": 2.391113042831421,
|
| 6775 |
+
"eval_runtime": 31.7446,
|
| 6776 |
+
"eval_samples_per_second": 3.213,
|
| 6777 |
+
"eval_steps_per_second": 1.607,
|
| 6778 |
+
"step": 18800
|
| 6779 |
+
},
|
| 6780 |
+
{
|
| 6781 |
+
"epoch": 0.3765,
|
| 6782 |
+
"grad_norm": 0.5451395919825731,
|
| 6783 |
+
"learning_rate": 6.928e-06,
|
| 6784 |
+
"loss": 2.3746,
|
| 6785 |
+
"step": 18825
|
| 6786 |
+
},
|
| 6787 |
+
{
|
| 6788 |
+
"epoch": 0.377,
|
| 6789 |
+
"grad_norm": 0.5619738492106079,
|
| 6790 |
+
"learning_rate": 6.922444444444445e-06,
|
| 6791 |
+
"loss": 2.3815,
|
| 6792 |
+
"step": 18850
|
| 6793 |
+
},
|
| 6794 |
+
{
|
| 6795 |
+
"epoch": 0.3775,
|
| 6796 |
+
"grad_norm": 0.5811440137998495,
|
| 6797 |
+
"learning_rate": 6.91688888888889e-06,
|
| 6798 |
+
"loss": 2.3655,
|
| 6799 |
+
"step": 18875
|
| 6800 |
+
},
|
| 6801 |
+
{
|
| 6802 |
+
"epoch": 0.378,
|
| 6803 |
+
"grad_norm": 0.5528301840539304,
|
| 6804 |
+
"learning_rate": 6.9113333333333345e-06,
|
| 6805 |
+
"loss": 2.3721,
|
| 6806 |
+
"step": 18900
|
| 6807 |
+
},
|
| 6808 |
+
{
|
| 6809 |
+
"epoch": 0.378,
|
| 6810 |
+
"eval_loss": 2.3908257484436035,
|
| 6811 |
+
"eval_runtime": 31.6268,
|
| 6812 |
+
"eval_samples_per_second": 3.225,
|
| 6813 |
+
"eval_steps_per_second": 1.613,
|
| 6814 |
+
"step": 18900
|
| 6815 |
+
},
|
| 6816 |
+
{
|
| 6817 |
+
"epoch": 0.3785,
|
| 6818 |
+
"grad_norm": 0.5791069800351532,
|
| 6819 |
+
"learning_rate": 6.905777777777778e-06,
|
| 6820 |
+
"loss": 2.3798,
|
| 6821 |
+
"step": 18925
|
| 6822 |
+
},
|
| 6823 |
+
{
|
| 6824 |
+
"epoch": 0.379,
|
| 6825 |
+
"grad_norm": 0.5692008495737035,
|
| 6826 |
+
"learning_rate": 6.9002222222222224e-06,
|
| 6827 |
+
"loss": 2.3723,
|
| 6828 |
+
"step": 18950
|
| 6829 |
+
},
|
| 6830 |
+
{
|
| 6831 |
+
"epoch": 0.3795,
|
| 6832 |
+
"grad_norm": 0.5614405054433378,
|
| 6833 |
+
"learning_rate": 6.894666666666668e-06,
|
| 6834 |
+
"loss": 2.3739,
|
| 6835 |
+
"step": 18975
|
| 6836 |
+
},
|
| 6837 |
+
{
|
| 6838 |
+
"epoch": 0.38,
|
| 6839 |
+
"grad_norm": 0.5641420025760586,
|
| 6840 |
+
"learning_rate": 6.889111111111112e-06,
|
| 6841 |
+
"loss": 2.3728,
|
| 6842 |
+
"step": 19000
|
| 6843 |
+
},
|
| 6844 |
+
{
|
| 6845 |
+
"epoch": 0.38,
|
| 6846 |
+
"eval_loss": 2.390749454498291,
|
| 6847 |
+
"eval_runtime": 31.8098,
|
| 6848 |
+
"eval_samples_per_second": 3.207,
|
| 6849 |
+
"eval_steps_per_second": 1.603,
|
| 6850 |
+
"step": 19000
|
| 6851 |
}
|
| 6852 |
],
|
| 6853 |
"logging_steps": 25,
|
|
|
|
| 6867 |
"attributes": {}
|
| 6868 |
}
|
| 6869 |
},
|
| 6870 |
+
"total_flos": 6.048084366821687e+19,
|
| 6871 |
"train_batch_size": 1,
|
| 6872 |
"trial_name": null,
|
| 6873 |
"trial_params": null
|