Training in progress, step 9963, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2384234968
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:67b1f8f2070aa722256f17caddf76eccc4633099da20d1cc2d61bdf981a76af8
|
| 3 |
size 2384234968
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4768663315
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b36278ea5a4363a66e19662925b0de521702174b32536c12eb455816bf17796c
|
| 3 |
size 4768663315
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca5d742b0ea9db6ebea78c7225beca171b7914b0e5ee83796c299293cd2c7879
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8558,6 +8558,420 @@
|
|
| 8558 |
"mean_token_accuracy": 0.7814946219325065,
|
| 8559 |
"num_tokens": 77811712.0,
|
| 8560 |
"step": 9500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8561 |
}
|
| 8562 |
],
|
| 8563 |
"logging_steps": 10,
|
|
@@ -8572,12 +8986,12 @@
|
|
| 8572 |
"should_evaluate": false,
|
| 8573 |
"should_log": false,
|
| 8574 |
"should_save": true,
|
| 8575 |
-
"should_training_stop":
|
| 8576 |
},
|
| 8577 |
"attributes": {}
|
| 8578 |
}
|
| 8579 |
},
|
| 8580 |
-
"total_flos": 2.
|
| 8581 |
"train_batch_size": 2,
|
| 8582 |
"trial_name": null,
|
| 8583 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 9963,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8558 |
"mean_token_accuracy": 0.7814946219325065,
|
| 8559 |
"num_tokens": 77811712.0,
|
| 8560 |
"step": 9500
|
| 8561 |
+
},
|
| 8562 |
+
{
|
| 8563 |
+
"epoch": 2.86379037723063,
|
| 8564 |
+
"grad_norm": 1.1517497301101685,
|
| 8565 |
+
"learning_rate": 5.063573499888468e-07,
|
| 8566 |
+
"loss": 0.1073,
|
| 8567 |
+
"mean_token_accuracy": 0.8048312108963728,
|
| 8568 |
+
"num_tokens": 77893632.0,
|
| 8569 |
+
"step": 9510
|
| 8570 |
+
},
|
| 8571 |
+
{
|
| 8572 |
+
"epoch": 2.866802198629621,
|
| 8573 |
+
"grad_norm": 1.5399608612060547,
|
| 8574 |
+
"learning_rate": 4.952041043943788e-07,
|
| 8575 |
+
"loss": 0.0834,
|
| 8576 |
+
"mean_token_accuracy": 0.8002446211874485,
|
| 8577 |
+
"num_tokens": 77975552.0,
|
| 8578 |
+
"step": 9520
|
| 8579 |
+
},
|
| 8580 |
+
{
|
| 8581 |
+
"epoch": 2.8698140200286124,
|
| 8582 |
+
"grad_norm": 1.0819060802459717,
|
| 8583 |
+
"learning_rate": 4.840508587999108e-07,
|
| 8584 |
+
"loss": 0.1048,
|
| 8585 |
+
"mean_token_accuracy": 0.789444712176919,
|
| 8586 |
+
"num_tokens": 78057472.0,
|
| 8587 |
+
"step": 9530
|
| 8588 |
+
},
|
| 8589 |
+
{
|
| 8590 |
+
"epoch": 2.8728258414276033,
|
| 8591 |
+
"grad_norm": 1.1191598176956177,
|
| 8592 |
+
"learning_rate": 4.7289761320544284e-07,
|
| 8593 |
+
"loss": 0.101,
|
| 8594 |
+
"mean_token_accuracy": 0.8058096900582313,
|
| 8595 |
+
"num_tokens": 78139392.0,
|
| 8596 |
+
"step": 9540
|
| 8597 |
+
},
|
| 8598 |
+
{
|
| 8599 |
+
"epoch": 2.8758376628265943,
|
| 8600 |
+
"grad_norm": 1.1794003248214722,
|
| 8601 |
+
"learning_rate": 4.617443676109748e-07,
|
| 8602 |
+
"loss": 0.0946,
|
| 8603 |
+
"mean_token_accuracy": 0.804403131455183,
|
| 8604 |
+
"num_tokens": 78221312.0,
|
| 8605 |
+
"step": 9550
|
| 8606 |
+
},
|
| 8607 |
+
{
|
| 8608 |
+
"epoch": 2.8788494842255856,
|
| 8609 |
+
"grad_norm": 1.377267599105835,
|
| 8610 |
+
"learning_rate": 4.505911220165068e-07,
|
| 8611 |
+
"loss": 0.1017,
|
| 8612 |
+
"mean_token_accuracy": 0.7931017633527517,
|
| 8613 |
+
"num_tokens": 78303232.0,
|
| 8614 |
+
"step": 9560
|
| 8615 |
+
},
|
| 8616 |
+
{
|
| 8617 |
+
"epoch": 2.8818613056245765,
|
| 8618 |
+
"grad_norm": 1.4520881175994873,
|
| 8619 |
+
"learning_rate": 4.394378764220388e-07,
|
| 8620 |
+
"loss": 0.1217,
|
| 8621 |
+
"mean_token_accuracy": 0.7699119374155998,
|
| 8622 |
+
"num_tokens": 78385152.0,
|
| 8623 |
+
"step": 9570
|
| 8624 |
+
},
|
| 8625 |
+
{
|
| 8626 |
+
"epoch": 2.8848731270235675,
|
| 8627 |
+
"grad_norm": 1.2226645946502686,
|
| 8628 |
+
"learning_rate": 4.2828463082757086e-07,
|
| 8629 |
+
"loss": 0.1191,
|
| 8630 |
+
"mean_token_accuracy": 0.7835861057043075,
|
| 8631 |
+
"num_tokens": 78467072.0,
|
| 8632 |
+
"step": 9580
|
| 8633 |
+
},
|
| 8634 |
+
{
|
| 8635 |
+
"epoch": 2.8878849484225584,
|
| 8636 |
+
"grad_norm": 1.7126904726028442,
|
| 8637 |
+
"learning_rate": 4.1713138523310286e-07,
|
| 8638 |
+
"loss": 0.1196,
|
| 8639 |
+
"mean_token_accuracy": 0.7943126205354929,
|
| 8640 |
+
"num_tokens": 78548992.0,
|
| 8641 |
+
"step": 9590
|
| 8642 |
+
},
|
| 8643 |
+
{
|
| 8644 |
+
"epoch": 2.8908967698215493,
|
| 8645 |
+
"grad_norm": 1.312665343284607,
|
| 8646 |
+
"learning_rate": 4.059781396386349e-07,
|
| 8647 |
+
"loss": 0.1047,
|
| 8648 |
+
"mean_token_accuracy": 0.7909001961350441,
|
| 8649 |
+
"num_tokens": 78630912.0,
|
| 8650 |
+
"step": 9600
|
| 8651 |
+
},
|
| 8652 |
+
{
|
| 8653 |
+
"epoch": 2.8939085912205407,
|
| 8654 |
+
"grad_norm": 1.339685320854187,
|
| 8655 |
+
"learning_rate": 3.9482489404416684e-07,
|
| 8656 |
+
"loss": 0.1053,
|
| 8657 |
+
"mean_token_accuracy": 0.786631602421403,
|
| 8658 |
+
"num_tokens": 78712832.0,
|
| 8659 |
+
"step": 9610
|
| 8660 |
+
},
|
| 8661 |
+
{
|
| 8662 |
+
"epoch": 2.8969204126195316,
|
| 8663 |
+
"grad_norm": 2.0059938430786133,
|
| 8664 |
+
"learning_rate": 3.836716484496989e-07,
|
| 8665 |
+
"loss": 0.1082,
|
| 8666 |
+
"mean_token_accuracy": 0.7798312120139599,
|
| 8667 |
+
"num_tokens": 78794752.0,
|
| 8668 |
+
"step": 9620
|
| 8669 |
+
},
|
| 8670 |
+
{
|
| 8671 |
+
"epoch": 2.8999322340185225,
|
| 8672 |
+
"grad_norm": 1.2985539436340332,
|
| 8673 |
+
"learning_rate": 3.725184028552309e-07,
|
| 8674 |
+
"loss": 0.0949,
|
| 8675 |
+
"mean_token_accuracy": 0.7817759312689304,
|
| 8676 |
+
"num_tokens": 78876672.0,
|
| 8677 |
+
"step": 9630
|
| 8678 |
+
},
|
| 8679 |
+
{
|
| 8680 |
+
"epoch": 2.902944055417514,
|
| 8681 |
+
"grad_norm": 1.9637115001678467,
|
| 8682 |
+
"learning_rate": 3.613651572607629e-07,
|
| 8683 |
+
"loss": 0.1199,
|
| 8684 |
+
"mean_token_accuracy": 0.7892734818160534,
|
| 8685 |
+
"num_tokens": 78958592.0,
|
| 8686 |
+
"step": 9640
|
| 8687 |
+
},
|
| 8688 |
+
{
|
| 8689 |
+
"epoch": 2.905955876816505,
|
| 8690 |
+
"grad_norm": 1.2397360801696777,
|
| 8691 |
+
"learning_rate": 3.502119116662949e-07,
|
| 8692 |
+
"loss": 0.1069,
|
| 8693 |
+
"mean_token_accuracy": 0.7752935424447059,
|
| 8694 |
+
"num_tokens": 79040512.0,
|
| 8695 |
+
"step": 9650
|
| 8696 |
+
},
|
| 8697 |
+
{
|
| 8698 |
+
"epoch": 2.9089676982154957,
|
| 8699 |
+
"grad_norm": 1.2161389589309692,
|
| 8700 |
+
"learning_rate": 3.3905866607182696e-07,
|
| 8701 |
+
"loss": 0.0987,
|
| 8702 |
+
"mean_token_accuracy": 0.7955479428172112,
|
| 8703 |
+
"num_tokens": 79122432.0,
|
| 8704 |
+
"step": 9660
|
| 8705 |
+
},
|
| 8706 |
+
{
|
| 8707 |
+
"epoch": 2.911979519614487,
|
| 8708 |
+
"grad_norm": 0.8789703845977783,
|
| 8709 |
+
"learning_rate": 3.2790542047735896e-07,
|
| 8710 |
+
"loss": 0.1042,
|
| 8711 |
+
"mean_token_accuracy": 0.8027641840279103,
|
| 8712 |
+
"num_tokens": 79204352.0,
|
| 8713 |
+
"step": 9670
|
| 8714 |
+
},
|
| 8715 |
+
{
|
| 8716 |
+
"epoch": 2.914991341013478,
|
| 8717 |
+
"grad_norm": 0.981950581073761,
|
| 8718 |
+
"learning_rate": 3.16752174882891e-07,
|
| 8719 |
+
"loss": 0.1272,
|
| 8720 |
+
"mean_token_accuracy": 0.7878424659371376,
|
| 8721 |
+
"num_tokens": 79286272.0,
|
| 8722 |
+
"step": 9680
|
| 8723 |
+
},
|
| 8724 |
+
{
|
| 8725 |
+
"epoch": 2.918003162412469,
|
| 8726 |
+
"grad_norm": 1.3362120389938354,
|
| 8727 |
+
"learning_rate": 3.0559892928842294e-07,
|
| 8728 |
+
"loss": 0.1049,
|
| 8729 |
+
"mean_token_accuracy": 0.800146771967411,
|
| 8730 |
+
"num_tokens": 79368192.0,
|
| 8731 |
+
"step": 9690
|
| 8732 |
+
},
|
| 8733 |
+
{
|
| 8734 |
+
"epoch": 2.92101498381146,
|
| 8735 |
+
"grad_norm": 0.9886929988861084,
|
| 8736 |
+
"learning_rate": 2.94445683693955e-07,
|
| 8737 |
+
"loss": 0.1229,
|
| 8738 |
+
"mean_token_accuracy": 0.7738380614668131,
|
| 8739 |
+
"num_tokens": 79450112.0,
|
| 8740 |
+
"step": 9700
|
| 8741 |
+
},
|
| 8742 |
+
{
|
| 8743 |
+
"epoch": 2.9240268052104508,
|
| 8744 |
+
"grad_norm": 1.2238775491714478,
|
| 8745 |
+
"learning_rate": 2.83292438099487e-07,
|
| 8746 |
+
"loss": 0.1047,
|
| 8747 |
+
"mean_token_accuracy": 0.788319468870759,
|
| 8748 |
+
"num_tokens": 79532032.0,
|
| 8749 |
+
"step": 9710
|
| 8750 |
+
},
|
| 8751 |
+
{
|
| 8752 |
+
"epoch": 2.927038626609442,
|
| 8753 |
+
"grad_norm": 1.005550742149353,
|
| 8754 |
+
"learning_rate": 2.7213919250501897e-07,
|
| 8755 |
+
"loss": 0.1215,
|
| 8756 |
+
"mean_token_accuracy": 0.7744006853550672,
|
| 8757 |
+
"num_tokens": 79613952.0,
|
| 8758 |
+
"step": 9720
|
| 8759 |
+
},
|
| 8760 |
+
{
|
| 8761 |
+
"epoch": 2.930050448008433,
|
| 8762 |
+
"grad_norm": 1.1485919952392578,
|
| 8763 |
+
"learning_rate": 2.60985946910551e-07,
|
| 8764 |
+
"loss": 0.1218,
|
| 8765 |
+
"mean_token_accuracy": 0.7930772993713617,
|
| 8766 |
+
"num_tokens": 79695872.0,
|
| 8767 |
+
"step": 9730
|
| 8768 |
+
},
|
| 8769 |
+
{
|
| 8770 |
+
"epoch": 2.933062269407424,
|
| 8771 |
+
"grad_norm": 1.2947425842285156,
|
| 8772 |
+
"learning_rate": 2.49832701316083e-07,
|
| 8773 |
+
"loss": 0.1003,
|
| 8774 |
+
"mean_token_accuracy": 0.7740215234458446,
|
| 8775 |
+
"num_tokens": 79777792.0,
|
| 8776 |
+
"step": 9740
|
| 8777 |
+
},
|
| 8778 |
+
{
|
| 8779 |
+
"epoch": 2.9360740908064153,
|
| 8780 |
+
"grad_norm": 1.7832204103469849,
|
| 8781 |
+
"learning_rate": 2.38679455721615e-07,
|
| 8782 |
+
"loss": 0.1055,
|
| 8783 |
+
"mean_token_accuracy": 0.7808341465890407,
|
| 8784 |
+
"num_tokens": 79859712.0,
|
| 8785 |
+
"step": 9750
|
| 8786 |
+
},
|
| 8787 |
+
{
|
| 8788 |
+
"epoch": 2.9390859122054063,
|
| 8789 |
+
"grad_norm": 1.1873085498809814,
|
| 8790 |
+
"learning_rate": 2.2752621012714705e-07,
|
| 8791 |
+
"loss": 0.1288,
|
| 8792 |
+
"mean_token_accuracy": 0.7696917787194252,
|
| 8793 |
+
"num_tokens": 79941632.0,
|
| 8794 |
+
"step": 9760
|
| 8795 |
+
},
|
| 8796 |
+
{
|
| 8797 |
+
"epoch": 2.942097733604397,
|
| 8798 |
+
"grad_norm": 1.529731035232544,
|
| 8799 |
+
"learning_rate": 2.1637296453267904e-07,
|
| 8800 |
+
"loss": 0.1186,
|
| 8801 |
+
"mean_token_accuracy": 0.791890898346901,
|
| 8802 |
+
"num_tokens": 80023552.0,
|
| 8803 |
+
"step": 9770
|
| 8804 |
+
},
|
| 8805 |
+
{
|
| 8806 |
+
"epoch": 2.9451095550033886,
|
| 8807 |
+
"grad_norm": 1.333554983139038,
|
| 8808 |
+
"learning_rate": 2.0521971893821103e-07,
|
| 8809 |
+
"loss": 0.1051,
|
| 8810 |
+
"mean_token_accuracy": 0.7844911962747574,
|
| 8811 |
+
"num_tokens": 80105472.0,
|
| 8812 |
+
"step": 9780
|
| 8813 |
+
},
|
| 8814 |
+
{
|
| 8815 |
+
"epoch": 2.9481213764023795,
|
| 8816 |
+
"grad_norm": 1.4663509130477905,
|
| 8817 |
+
"learning_rate": 1.9406647334374302e-07,
|
| 8818 |
+
"loss": 0.1041,
|
| 8819 |
+
"mean_token_accuracy": 0.7966854199767113,
|
| 8820 |
+
"num_tokens": 80187392.0,
|
| 8821 |
+
"step": 9790
|
| 8822 |
+
},
|
| 8823 |
+
{
|
| 8824 |
+
"epoch": 2.9511331978013704,
|
| 8825 |
+
"grad_norm": 1.002288579940796,
|
| 8826 |
+
"learning_rate": 1.8291322774927504e-07,
|
| 8827 |
+
"loss": 0.0909,
|
| 8828 |
+
"mean_token_accuracy": 0.7919275924563408,
|
| 8829 |
+
"num_tokens": 80269312.0,
|
| 8830 |
+
"step": 9800
|
| 8831 |
+
},
|
| 8832 |
+
{
|
| 8833 |
+
"epoch": 2.9541450192003613,
|
| 8834 |
+
"grad_norm": 1.2249246835708618,
|
| 8835 |
+
"learning_rate": 1.7175998215480706e-07,
|
| 8836 |
+
"loss": 0.0977,
|
| 8837 |
+
"mean_token_accuracy": 0.7821917802095413,
|
| 8838 |
+
"num_tokens": 80351232.0,
|
| 8839 |
+
"step": 9810
|
| 8840 |
+
},
|
| 8841 |
+
{
|
| 8842 |
+
"epoch": 2.9571568405993522,
|
| 8843 |
+
"grad_norm": 1.3539292812347412,
|
| 8844 |
+
"learning_rate": 1.6060673656033905e-07,
|
| 8845 |
+
"loss": 0.1107,
|
| 8846 |
+
"mean_token_accuracy": 0.7898727986961603,
|
| 8847 |
+
"num_tokens": 80433152.0,
|
| 8848 |
+
"step": 9820
|
| 8849 |
+
},
|
| 8850 |
+
{
|
| 8851 |
+
"epoch": 2.9601686619983436,
|
| 8852 |
+
"grad_norm": 1.2705157995224,
|
| 8853 |
+
"learning_rate": 1.4945349096587107e-07,
|
| 8854 |
+
"loss": 0.0916,
|
| 8855 |
+
"mean_token_accuracy": 0.8011252459138631,
|
| 8856 |
+
"num_tokens": 80515072.0,
|
| 8857 |
+
"step": 9830
|
| 8858 |
+
},
|
| 8859 |
+
{
|
| 8860 |
+
"epoch": 2.9631804833973345,
|
| 8861 |
+
"grad_norm": 1.3075294494628906,
|
| 8862 |
+
"learning_rate": 1.383002453714031e-07,
|
| 8863 |
+
"loss": 0.111,
|
| 8864 |
+
"mean_token_accuracy": 0.7639554768800736,
|
| 8865 |
+
"num_tokens": 80596992.0,
|
| 8866 |
+
"step": 9840
|
| 8867 |
+
},
|
| 8868 |
+
{
|
| 8869 |
+
"epoch": 2.9661923047963255,
|
| 8870 |
+
"grad_norm": 1.1203222274780273,
|
| 8871 |
+
"learning_rate": 1.271469997769351e-07,
|
| 8872 |
+
"loss": 0.1212,
|
| 8873 |
+
"mean_token_accuracy": 0.7945205442607403,
|
| 8874 |
+
"num_tokens": 80678912.0,
|
| 8875 |
+
"step": 9850
|
| 8876 |
+
},
|
| 8877 |
+
{
|
| 8878 |
+
"epoch": 2.969204126195317,
|
| 8879 |
+
"grad_norm": 1.466186285018921,
|
| 8880 |
+
"learning_rate": 1.1599375418246712e-07,
|
| 8881 |
+
"loss": 0.1254,
|
| 8882 |
+
"mean_token_accuracy": 0.7881849348545075,
|
| 8883 |
+
"num_tokens": 80760832.0,
|
| 8884 |
+
"step": 9860
|
| 8885 |
+
},
|
| 8886 |
+
{
|
| 8887 |
+
"epoch": 2.9722159475943077,
|
| 8888 |
+
"grad_norm": 1.363336205482483,
|
| 8889 |
+
"learning_rate": 1.0484050858799912e-07,
|
| 8890 |
+
"loss": 0.1191,
|
| 8891 |
+
"mean_token_accuracy": 0.7713184926658869,
|
| 8892 |
+
"num_tokens": 80842752.0,
|
| 8893 |
+
"step": 9870
|
| 8894 |
+
},
|
| 8895 |
+
{
|
| 8896 |
+
"epoch": 2.9752277689932987,
|
| 8897 |
+
"grad_norm": 0.9907705783843994,
|
| 8898 |
+
"learning_rate": 9.368726299353113e-08,
|
| 8899 |
+
"loss": 0.0936,
|
| 8900 |
+
"mean_token_accuracy": 0.794019079580903,
|
| 8901 |
+
"num_tokens": 80924672.0,
|
| 8902 |
+
"step": 9880
|
| 8903 |
+
},
|
| 8904 |
+
{
|
| 8905 |
+
"epoch": 2.9782395903922896,
|
| 8906 |
+
"grad_norm": 1.0468392372131348,
|
| 8907 |
+
"learning_rate": 8.253401739906312e-08,
|
| 8908 |
+
"loss": 0.1124,
|
| 8909 |
+
"mean_token_accuracy": 0.7831335622817278,
|
| 8910 |
+
"num_tokens": 81006592.0,
|
| 8911 |
+
"step": 9890
|
| 8912 |
+
},
|
| 8913 |
+
{
|
| 8914 |
+
"epoch": 2.9812514117912805,
|
| 8915 |
+
"grad_norm": 1.0683683156967163,
|
| 8916 |
+
"learning_rate": 7.138077180459515e-08,
|
| 8917 |
+
"loss": 0.0994,
|
| 8918 |
+
"mean_token_accuracy": 0.7868028357625008,
|
| 8919 |
+
"num_tokens": 81088512.0,
|
| 8920 |
+
"step": 9900
|
| 8921 |
+
},
|
| 8922 |
+
{
|
| 8923 |
+
"epoch": 2.984263233190272,
|
| 8924 |
+
"grad_norm": 1.7116000652313232,
|
| 8925 |
+
"learning_rate": 6.022752621012715e-08,
|
| 8926 |
+
"loss": 0.0879,
|
| 8927 |
+
"mean_token_accuracy": 0.7907044999301434,
|
| 8928 |
+
"num_tokens": 81170432.0,
|
| 8929 |
+
"step": 9910
|
| 8930 |
+
},
|
| 8931 |
+
{
|
| 8932 |
+
"epoch": 2.987275054589263,
|
| 8933 |
+
"grad_norm": 1.3669886589050293,
|
| 8934 |
+
"learning_rate": 4.9074280615659164e-08,
|
| 8935 |
+
"loss": 0.1216,
|
| 8936 |
+
"mean_token_accuracy": 0.7789016645401716,
|
| 8937 |
+
"num_tokens": 81252352.0,
|
| 8938 |
+
"step": 9920
|
| 8939 |
+
},
|
| 8940 |
+
{
|
| 8941 |
+
"epoch": 2.9902868759882537,
|
| 8942 |
+
"grad_norm": 0.9469903707504272,
|
| 8943 |
+
"learning_rate": 3.792103502119117e-08,
|
| 8944 |
+
"loss": 0.0938,
|
| 8945 |
+
"mean_token_accuracy": 0.7959637988358736,
|
| 8946 |
+
"num_tokens": 81334272.0,
|
| 8947 |
+
"step": 9930
|
| 8948 |
+
},
|
| 8949 |
+
{
|
| 8950 |
+
"epoch": 2.993298697387245,
|
| 8951 |
+
"grad_norm": 1.380719780921936,
|
| 8952 |
+
"learning_rate": 2.676778942672318e-08,
|
| 8953 |
+
"loss": 0.089,
|
| 8954 |
+
"mean_token_accuracy": 0.7933586109429598,
|
| 8955 |
+
"num_tokens": 81416192.0,
|
| 8956 |
+
"step": 9940
|
| 8957 |
+
},
|
| 8958 |
+
{
|
| 8959 |
+
"epoch": 2.996310518786236,
|
| 8960 |
+
"grad_norm": 1.1363697052001953,
|
| 8961 |
+
"learning_rate": 1.5614543832255188e-08,
|
| 8962 |
+
"loss": 0.1048,
|
| 8963 |
+
"mean_token_accuracy": 0.8045499000698328,
|
| 8964 |
+
"num_tokens": 81498112.0,
|
| 8965 |
+
"step": 9950
|
| 8966 |
+
},
|
| 8967 |
+
{
|
| 8968 |
+
"epoch": 2.999322340185227,
|
| 8969 |
+
"grad_norm": 1.5490316152572632,
|
| 8970 |
+
"learning_rate": 4.461298237787197e-09,
|
| 8971 |
+
"loss": 0.1019,
|
| 8972 |
+
"mean_token_accuracy": 0.790349805355072,
|
| 8973 |
+
"num_tokens": 81580032.0,
|
| 8974 |
+
"step": 9960
|
| 8975 |
}
|
| 8976 |
],
|
| 8977 |
"logging_steps": 10,
|
|
|
|
| 8986 |
"should_evaluate": false,
|
| 8987 |
"should_log": false,
|
| 8988 |
"should_save": true,
|
| 8989 |
+
"should_training_stop": true
|
| 8990 |
},
|
| 8991 |
"attributes": {}
|
| 8992 |
}
|
| 8993 |
},
|
| 8994 |
+
"total_flos": 2.156488071095255e+17,
|
| 8995 |
"train_batch_size": 2,
|
| 8996 |
"trial_name": null,
|
| 8997 |
"trial_params": null
|