Training in progress, step 2600, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2066752
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f7029e76d2330b8c1ac76f27d2827fd53c1fb9a09bfd163aeb48e6ef056512d
|
| 3 |
size 2066752
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4121235
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ab2bfc3184fdccd803bb517356e783cb35e80aca85ffa6d029528770eb4cd07
|
| 3 |
size 4121235
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14391
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cbe8803c48cf63f4eea1ebb748b4c2beb1d95a5bd75f9d32b496d3b2c5d0dd4e
|
| 3 |
size 14391
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1401
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c68a3dab5d287edad29b6a8ea33c1819f3635324de902c425ef8f164d34fb46
|
| 3 |
size 1401
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -17708,6 +17708,714 @@
|
|
| 17708 |
"eval_samples_per_second": 1.114,
|
| 17709 |
"eval_steps_per_second": 0.139,
|
| 17710 |
"step": 2500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17711 |
}
|
| 17712 |
],
|
| 17713 |
"logging_steps": 1,
|
|
@@ -17727,7 +18435,7 @@
|
|
| 17727 |
"attributes": {}
|
| 17728 |
}
|
| 17729 |
},
|
| 17730 |
-
"total_flos":
|
| 17731 |
"train_batch_size": 1,
|
| 17732 |
"trial_name": null,
|
| 17733 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.11229161268031441,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 2600,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 17708 |
"eval_samples_per_second": 1.114,
|
| 17709 |
"eval_steps_per_second": 0.139,
|
| 17710 |
"step": 2500
|
| 17711 |
+
},
|
| 17712 |
+
{
|
| 17713 |
+
"epoch": 0.10801589358210245,
|
| 17714 |
+
"grad_norm": 0.478515625,
|
| 17715 |
+
"learning_rate": 0.0009908435451512379,
|
| 17716 |
+
"loss": 8.5773,
|
| 17717 |
+
"step": 2501
|
| 17718 |
+
},
|
| 17719 |
+
{
|
| 17720 |
+
"epoch": 0.10805908266390256,
|
| 17721 |
+
"grad_norm": 0.62109375,
|
| 17722 |
+
"learning_rate": 0.000990829935964463,
|
| 17723 |
+
"loss": 8.0601,
|
| 17724 |
+
"step": 2502
|
| 17725 |
+
},
|
| 17726 |
+
{
|
| 17727 |
+
"epoch": 0.10810227174570268,
|
| 17728 |
+
"grad_norm": 0.61328125,
|
| 17729 |
+
"learning_rate": 0.0009908163167651686,
|
| 17730 |
+
"loss": 8.3679,
|
| 17731 |
+
"step": 2503
|
| 17732 |
+
},
|
| 17733 |
+
{
|
| 17734 |
+
"epoch": 0.1081454608275028,
|
| 17735 |
+
"grad_norm": 0.87890625,
|
| 17736 |
+
"learning_rate": 0.000990802687553633,
|
| 17737 |
+
"loss": 7.7936,
|
| 17738 |
+
"step": 2504
|
| 17739 |
+
},
|
| 17740 |
+
{
|
| 17741 |
+
"epoch": 0.10818864990930292,
|
| 17742 |
+
"grad_norm": 0.458984375,
|
| 17743 |
+
"learning_rate": 0.000990789048330134,
|
| 17744 |
+
"loss": 8.268,
|
| 17745 |
+
"step": 2505
|
| 17746 |
+
},
|
| 17747 |
+
{
|
| 17748 |
+
"epoch": 0.10823183899110304,
|
| 17749 |
+
"grad_norm": 0.875,
|
| 17750 |
+
"learning_rate": 0.0009907753990949495,
|
| 17751 |
+
"loss": 8.4076,
|
| 17752 |
+
"step": 2506
|
| 17753 |
+
},
|
| 17754 |
+
{
|
| 17755 |
+
"epoch": 0.10827502807290316,
|
| 17756 |
+
"grad_norm": 0.5703125,
|
| 17757 |
+
"learning_rate": 0.0009907617398483583,
|
| 17758 |
+
"loss": 8.0655,
|
| 17759 |
+
"step": 2507
|
| 17760 |
+
},
|
| 17761 |
+
{
|
| 17762 |
+
"epoch": 0.10831821715470329,
|
| 17763 |
+
"grad_norm": 0.55078125,
|
| 17764 |
+
"learning_rate": 0.0009907480705906393,
|
| 17765 |
+
"loss": 8.4049,
|
| 17766 |
+
"step": 2508
|
| 17767 |
+
},
|
| 17768 |
+
{
|
| 17769 |
+
"epoch": 0.10836140623650341,
|
| 17770 |
+
"grad_norm": 0.5859375,
|
| 17771 |
+
"learning_rate": 0.0009907343913220707,
|
| 17772 |
+
"loss": 8.2208,
|
| 17773 |
+
"step": 2509
|
| 17774 |
+
},
|
| 17775 |
+
{
|
| 17776 |
+
"epoch": 0.10840459531830353,
|
| 17777 |
+
"grad_norm": 0.859375,
|
| 17778 |
+
"learning_rate": 0.0009907207020429319,
|
| 17779 |
+
"loss": 7.9682,
|
| 17780 |
+
"step": 2510
|
| 17781 |
+
},
|
| 17782 |
+
{
|
| 17783 |
+
"epoch": 0.10844778440010365,
|
| 17784 |
+
"grad_norm": 0.46484375,
|
| 17785 |
+
"learning_rate": 0.000990707002753502,
|
| 17786 |
+
"loss": 8.4395,
|
| 17787 |
+
"step": 2511
|
| 17788 |
+
},
|
| 17789 |
+
{
|
| 17790 |
+
"epoch": 0.10849097348190377,
|
| 17791 |
+
"grad_norm": 0.546875,
|
| 17792 |
+
"learning_rate": 0.0009906932934540607,
|
| 17793 |
+
"loss": 8.1248,
|
| 17794 |
+
"step": 2512
|
| 17795 |
+
},
|
| 17796 |
+
{
|
| 17797 |
+
"epoch": 0.10853416256370389,
|
| 17798 |
+
"grad_norm": 0.455078125,
|
| 17799 |
+
"learning_rate": 0.0009906795741448876,
|
| 17800 |
+
"loss": 8.3858,
|
| 17801 |
+
"step": 2513
|
| 17802 |
+
},
|
| 17803 |
+
{
|
| 17804 |
+
"epoch": 0.10857735164550401,
|
| 17805 |
+
"grad_norm": 0.59375,
|
| 17806 |
+
"learning_rate": 0.0009906658448262623,
|
| 17807 |
+
"loss": 8.2592,
|
| 17808 |
+
"step": 2514
|
| 17809 |
+
},
|
| 17810 |
+
{
|
| 17811 |
+
"epoch": 0.10862054072730414,
|
| 17812 |
+
"grad_norm": 0.47265625,
|
| 17813 |
+
"learning_rate": 0.000990652105498465,
|
| 17814 |
+
"loss": 8.3764,
|
| 17815 |
+
"step": 2515
|
| 17816 |
+
},
|
| 17817 |
+
{
|
| 17818 |
+
"epoch": 0.10866372980910426,
|
| 17819 |
+
"grad_norm": 1.703125,
|
| 17820 |
+
"learning_rate": 0.0009906383561617761,
|
| 17821 |
+
"loss": 7.9524,
|
| 17822 |
+
"step": 2516
|
| 17823 |
+
},
|
| 17824 |
+
{
|
| 17825 |
+
"epoch": 0.10870691889090438,
|
| 17826 |
+
"grad_norm": 0.6796875,
|
| 17827 |
+
"learning_rate": 0.000990624596816476,
|
| 17828 |
+
"loss": 8.2658,
|
| 17829 |
+
"step": 2517
|
| 17830 |
+
},
|
| 17831 |
+
{
|
| 17832 |
+
"epoch": 0.1087501079727045,
|
| 17833 |
+
"grad_norm": 0.76171875,
|
| 17834 |
+
"learning_rate": 0.0009906108274628455,
|
| 17835 |
+
"loss": 8.1708,
|
| 17836 |
+
"step": 2518
|
| 17837 |
+
},
|
| 17838 |
+
{
|
| 17839 |
+
"epoch": 0.10879329705450462,
|
| 17840 |
+
"grad_norm": 0.478515625,
|
| 17841 |
+
"learning_rate": 0.0009905970481011652,
|
| 17842 |
+
"loss": 8.3299,
|
| 17843 |
+
"step": 2519
|
| 17844 |
+
},
|
| 17845 |
+
{
|
| 17846 |
+
"epoch": 0.10883648613630474,
|
| 17847 |
+
"grad_norm": 0.55859375,
|
| 17848 |
+
"learning_rate": 0.0009905832587317163,
|
| 17849 |
+
"loss": 8.2721,
|
| 17850 |
+
"step": 2520
|
| 17851 |
+
},
|
| 17852 |
+
{
|
| 17853 |
+
"epoch": 0.10887967521810486,
|
| 17854 |
+
"grad_norm": 0.5234375,
|
| 17855 |
+
"learning_rate": 0.0009905694593547803,
|
| 17856 |
+
"loss": 8.2087,
|
| 17857 |
+
"step": 2521
|
| 17858 |
+
},
|
| 17859 |
+
{
|
| 17860 |
+
"epoch": 0.10892286429990498,
|
| 17861 |
+
"grad_norm": 0.69921875,
|
| 17862 |
+
"learning_rate": 0.0009905556499706382,
|
| 17863 |
+
"loss": 8.4273,
|
| 17864 |
+
"step": 2522
|
| 17865 |
+
},
|
| 17866 |
+
{
|
| 17867 |
+
"epoch": 0.1089660533817051,
|
| 17868 |
+
"grad_norm": 0.578125,
|
| 17869 |
+
"learning_rate": 0.0009905418305795723,
|
| 17870 |
+
"loss": 8.2658,
|
| 17871 |
+
"step": 2523
|
| 17872 |
+
},
|
| 17873 |
+
{
|
| 17874 |
+
"epoch": 0.10900924246350523,
|
| 17875 |
+
"grad_norm": 0.55859375,
|
| 17876 |
+
"learning_rate": 0.0009905280011818644,
|
| 17877 |
+
"loss": 7.9867,
|
| 17878 |
+
"step": 2524
|
| 17879 |
+
},
|
| 17880 |
+
{
|
| 17881 |
+
"epoch": 0.10905243154530535,
|
| 17882 |
+
"grad_norm": 0.46484375,
|
| 17883 |
+
"learning_rate": 0.000990514161777796,
|
| 17884 |
+
"loss": 8.4169,
|
| 17885 |
+
"step": 2525
|
| 17886 |
+
},
|
| 17887 |
+
{
|
| 17888 |
+
"epoch": 0.10909562062710547,
|
| 17889 |
+
"grad_norm": 0.40234375,
|
| 17890 |
+
"learning_rate": 0.0009905003123676503,
|
| 17891 |
+
"loss": 8.4543,
|
| 17892 |
+
"step": 2526
|
| 17893 |
+
},
|
| 17894 |
+
{
|
| 17895 |
+
"epoch": 0.10913880970890559,
|
| 17896 |
+
"grad_norm": 0.66796875,
|
| 17897 |
+
"learning_rate": 0.000990486452951709,
|
| 17898 |
+
"loss": 8.0104,
|
| 17899 |
+
"step": 2527
|
| 17900 |
+
},
|
| 17901 |
+
{
|
| 17902 |
+
"epoch": 0.10918199879070571,
|
| 17903 |
+
"grad_norm": 0.6171875,
|
| 17904 |
+
"learning_rate": 0.0009904725835302552,
|
| 17905 |
+
"loss": 8.2101,
|
| 17906 |
+
"step": 2528
|
| 17907 |
+
},
|
| 17908 |
+
{
|
| 17909 |
+
"epoch": 0.10922518787250583,
|
| 17910 |
+
"grad_norm": 0.57421875,
|
| 17911 |
+
"learning_rate": 0.000990458704103572,
|
| 17912 |
+
"loss": 8.1934,
|
| 17913 |
+
"step": 2529
|
| 17914 |
+
},
|
| 17915 |
+
{
|
| 17916 |
+
"epoch": 0.10926837695430595,
|
| 17917 |
+
"grad_norm": 0.59375,
|
| 17918 |
+
"learning_rate": 0.0009904448146719421,
|
| 17919 |
+
"loss": 8.4709,
|
| 17920 |
+
"step": 2530
|
| 17921 |
+
},
|
| 17922 |
+
{
|
| 17923 |
+
"epoch": 0.10931156603610608,
|
| 17924 |
+
"grad_norm": 0.5390625,
|
| 17925 |
+
"learning_rate": 0.0009904309152356495,
|
| 17926 |
+
"loss": 8.3082,
|
| 17927 |
+
"step": 2531
|
| 17928 |
+
},
|
| 17929 |
+
{
|
| 17930 |
+
"epoch": 0.1093547551179062,
|
| 17931 |
+
"grad_norm": 0.47265625,
|
| 17932 |
+
"learning_rate": 0.0009904170057949769,
|
| 17933 |
+
"loss": 8.3866,
|
| 17934 |
+
"step": 2532
|
| 17935 |
+
},
|
| 17936 |
+
{
|
| 17937 |
+
"epoch": 0.10939794419970632,
|
| 17938 |
+
"grad_norm": 1.0703125,
|
| 17939 |
+
"learning_rate": 0.0009904030863502086,
|
| 17940 |
+
"loss": 8.956,
|
| 17941 |
+
"step": 2533
|
| 17942 |
+
},
|
| 17943 |
+
{
|
| 17944 |
+
"epoch": 0.10944113328150644,
|
| 17945 |
+
"grad_norm": 0.57421875,
|
| 17946 |
+
"learning_rate": 0.0009903891569016283,
|
| 17947 |
+
"loss": 8.1289,
|
| 17948 |
+
"step": 2534
|
| 17949 |
+
},
|
| 17950 |
+
{
|
| 17951 |
+
"epoch": 0.10948432236330656,
|
| 17952 |
+
"grad_norm": 0.46875,
|
| 17953 |
+
"learning_rate": 0.0009903752174495203,
|
| 17954 |
+
"loss": 8.267,
|
| 17955 |
+
"step": 2535
|
| 17956 |
+
},
|
| 17957 |
+
{
|
| 17958 |
+
"epoch": 0.10952751144510668,
|
| 17959 |
+
"grad_norm": 0.5078125,
|
| 17960 |
+
"learning_rate": 0.000990361267994169,
|
| 17961 |
+
"loss": 8.279,
|
| 17962 |
+
"step": 2536
|
| 17963 |
+
},
|
| 17964 |
+
{
|
| 17965 |
+
"epoch": 0.1095707005269068,
|
| 17966 |
+
"grad_norm": 0.75390625,
|
| 17967 |
+
"learning_rate": 0.0009903473085358587,
|
| 17968 |
+
"loss": 8.0143,
|
| 17969 |
+
"step": 2537
|
| 17970 |
+
},
|
| 17971 |
+
{
|
| 17972 |
+
"epoch": 0.10961388960870692,
|
| 17973 |
+
"grad_norm": 0.42578125,
|
| 17974 |
+
"learning_rate": 0.0009903333390748747,
|
| 17975 |
+
"loss": 8.196,
|
| 17976 |
+
"step": 2538
|
| 17977 |
+
},
|
| 17978 |
+
{
|
| 17979 |
+
"epoch": 0.10965707869050705,
|
| 17980 |
+
"grad_norm": 0.56640625,
|
| 17981 |
+
"learning_rate": 0.000990319359611501,
|
| 17982 |
+
"loss": 8.4366,
|
| 17983 |
+
"step": 2539
|
| 17984 |
+
},
|
| 17985 |
+
{
|
| 17986 |
+
"epoch": 0.10970026777230717,
|
| 17987 |
+
"grad_norm": 0.5,
|
| 17988 |
+
"learning_rate": 0.0009903053701460236,
|
| 17989 |
+
"loss": 8.6248,
|
| 17990 |
+
"step": 2540
|
| 17991 |
+
},
|
| 17992 |
+
{
|
| 17993 |
+
"epoch": 0.10974345685410729,
|
| 17994 |
+
"grad_norm": 0.5390625,
|
| 17995 |
+
"learning_rate": 0.0009902913706787279,
|
| 17996 |
+
"loss": 8.3567,
|
| 17997 |
+
"step": 2541
|
| 17998 |
+
},
|
| 17999 |
+
{
|
| 18000 |
+
"epoch": 0.10978664593590741,
|
| 18001 |
+
"grad_norm": 0.6484375,
|
| 18002 |
+
"learning_rate": 0.0009902773612098987,
|
| 18003 |
+
"loss": 8.1894,
|
| 18004 |
+
"step": 2542
|
| 18005 |
+
},
|
| 18006 |
+
{
|
| 18007 |
+
"epoch": 0.10982983501770752,
|
| 18008 |
+
"grad_norm": 0.640625,
|
| 18009 |
+
"learning_rate": 0.0009902633417398225,
|
| 18010 |
+
"loss": 8.4882,
|
| 18011 |
+
"step": 2543
|
| 18012 |
+
},
|
| 18013 |
+
{
|
| 18014 |
+
"epoch": 0.10987302409950764,
|
| 18015 |
+
"grad_norm": 0.640625,
|
| 18016 |
+
"learning_rate": 0.0009902493122687852,
|
| 18017 |
+
"loss": 7.9297,
|
| 18018 |
+
"step": 2544
|
| 18019 |
+
},
|
| 18020 |
+
{
|
| 18021 |
+
"epoch": 0.10991621318130776,
|
| 18022 |
+
"grad_norm": 0.796875,
|
| 18023 |
+
"learning_rate": 0.0009902352727970728,
|
| 18024 |
+
"loss": 8.1606,
|
| 18025 |
+
"step": 2545
|
| 18026 |
+
},
|
| 18027 |
+
{
|
| 18028 |
+
"epoch": 0.10995940226310788,
|
| 18029 |
+
"grad_norm": 0.60546875,
|
| 18030 |
+
"learning_rate": 0.0009902212233249717,
|
| 18031 |
+
"loss": 8.3975,
|
| 18032 |
+
"step": 2546
|
| 18033 |
+
},
|
| 18034 |
+
{
|
| 18035 |
+
"epoch": 0.110002591344908,
|
| 18036 |
+
"grad_norm": 0.48046875,
|
| 18037 |
+
"learning_rate": 0.0009902071638527685,
|
| 18038 |
+
"loss": 8.5701,
|
| 18039 |
+
"step": 2547
|
| 18040 |
+
},
|
| 18041 |
+
{
|
| 18042 |
+
"epoch": 0.11004578042670812,
|
| 18043 |
+
"grad_norm": 0.490234375,
|
| 18044 |
+
"learning_rate": 0.0009901930943807503,
|
| 18045 |
+
"loss": 8.1647,
|
| 18046 |
+
"step": 2548
|
| 18047 |
+
},
|
| 18048 |
+
{
|
| 18049 |
+
"epoch": 0.11008896950850824,
|
| 18050 |
+
"grad_norm": 0.57421875,
|
| 18051 |
+
"learning_rate": 0.0009901790149092035,
|
| 18052 |
+
"loss": 8.1903,
|
| 18053 |
+
"step": 2549
|
| 18054 |
+
},
|
| 18055 |
+
{
|
| 18056 |
+
"epoch": 0.11013215859030837,
|
| 18057 |
+
"grad_norm": 0.79296875,
|
| 18058 |
+
"learning_rate": 0.0009901649254384158,
|
| 18059 |
+
"loss": 7.8448,
|
| 18060 |
+
"step": 2550
|
| 18061 |
+
},
|
| 18062 |
+
{
|
| 18063 |
+
"epoch": 0.11017534767210849,
|
| 18064 |
+
"grad_norm": 0.62109375,
|
| 18065 |
+
"learning_rate": 0.0009901508259686745,
|
| 18066 |
+
"loss": 8.0894,
|
| 18067 |
+
"step": 2551
|
| 18068 |
+
},
|
| 18069 |
+
{
|
| 18070 |
+
"epoch": 0.11021853675390861,
|
| 18071 |
+
"grad_norm": 0.58203125,
|
| 18072 |
+
"learning_rate": 0.0009901367165002673,
|
| 18073 |
+
"loss": 8.2087,
|
| 18074 |
+
"step": 2552
|
| 18075 |
+
},
|
| 18076 |
+
{
|
| 18077 |
+
"epoch": 0.11026172583570873,
|
| 18078 |
+
"grad_norm": 0.51171875,
|
| 18079 |
+
"learning_rate": 0.0009901225970334816,
|
| 18080 |
+
"loss": 8.2029,
|
| 18081 |
+
"step": 2553
|
| 18082 |
+
},
|
| 18083 |
+
{
|
| 18084 |
+
"epoch": 0.11030491491750885,
|
| 18085 |
+
"grad_norm": 0.5703125,
|
| 18086 |
+
"learning_rate": 0.0009901084675686062,
|
| 18087 |
+
"loss": 8.4018,
|
| 18088 |
+
"step": 2554
|
| 18089 |
+
},
|
| 18090 |
+
{
|
| 18091 |
+
"epoch": 0.11034810399930897,
|
| 18092 |
+
"grad_norm": 0.5234375,
|
| 18093 |
+
"learning_rate": 0.0009900943281059287,
|
| 18094 |
+
"loss": 8.3994,
|
| 18095 |
+
"step": 2555
|
| 18096 |
+
},
|
| 18097 |
+
{
|
| 18098 |
+
"epoch": 0.1103912930811091,
|
| 18099 |
+
"grad_norm": 0.609375,
|
| 18100 |
+
"learning_rate": 0.0009900801786457375,
|
| 18101 |
+
"loss": 8.138,
|
| 18102 |
+
"step": 2556
|
| 18103 |
+
},
|
| 18104 |
+
{
|
| 18105 |
+
"epoch": 0.11043448216290921,
|
| 18106 |
+
"grad_norm": 0.5234375,
|
| 18107 |
+
"learning_rate": 0.0009900660191883217,
|
| 18108 |
+
"loss": 8.1755,
|
| 18109 |
+
"step": 2557
|
| 18110 |
+
},
|
| 18111 |
+
{
|
| 18112 |
+
"epoch": 0.11047767124470934,
|
| 18113 |
+
"grad_norm": 0.69921875,
|
| 18114 |
+
"learning_rate": 0.0009900518497339696,
|
| 18115 |
+
"loss": 8.2262,
|
| 18116 |
+
"step": 2558
|
| 18117 |
+
},
|
| 18118 |
+
{
|
| 18119 |
+
"epoch": 0.11052086032650946,
|
| 18120 |
+
"grad_norm": 0.53125,
|
| 18121 |
+
"learning_rate": 0.0009900376702829707,
|
| 18122 |
+
"loss": 8.4282,
|
| 18123 |
+
"step": 2559
|
| 18124 |
+
},
|
| 18125 |
+
{
|
| 18126 |
+
"epoch": 0.11056404940830958,
|
| 18127 |
+
"grad_norm": 0.52734375,
|
| 18128 |
+
"learning_rate": 0.0009900234808356142,
|
| 18129 |
+
"loss": 8.3542,
|
| 18130 |
+
"step": 2560
|
| 18131 |
+
},
|
| 18132 |
+
{
|
| 18133 |
+
"epoch": 0.1106072384901097,
|
| 18134 |
+
"grad_norm": 0.5078125,
|
| 18135 |
+
"learning_rate": 0.0009900092813921893,
|
| 18136 |
+
"loss": 8.206,
|
| 18137 |
+
"step": 2561
|
| 18138 |
+
},
|
| 18139 |
+
{
|
| 18140 |
+
"epoch": 0.11065042757190982,
|
| 18141 |
+
"grad_norm": 0.515625,
|
| 18142 |
+
"learning_rate": 0.0009899950719529857,
|
| 18143 |
+
"loss": 8.3899,
|
| 18144 |
+
"step": 2562
|
| 18145 |
+
},
|
| 18146 |
+
{
|
| 18147 |
+
"epoch": 0.11069361665370994,
|
| 18148 |
+
"grad_norm": 0.478515625,
|
| 18149 |
+
"learning_rate": 0.0009899808525182935,
|
| 18150 |
+
"loss": 8.4189,
|
| 18151 |
+
"step": 2563
|
| 18152 |
+
},
|
| 18153 |
+
{
|
| 18154 |
+
"epoch": 0.11073680573551006,
|
| 18155 |
+
"grad_norm": 0.58203125,
|
| 18156 |
+
"learning_rate": 0.0009899666230884024,
|
| 18157 |
+
"loss": 8.3508,
|
| 18158 |
+
"step": 2564
|
| 18159 |
+
},
|
| 18160 |
+
{
|
| 18161 |
+
"epoch": 0.11077999481731018,
|
| 18162 |
+
"grad_norm": 0.5859375,
|
| 18163 |
+
"learning_rate": 0.0009899523836636032,
|
| 18164 |
+
"loss": 8.0618,
|
| 18165 |
+
"step": 2565
|
| 18166 |
+
},
|
| 18167 |
+
{
|
| 18168 |
+
"epoch": 0.1108231838991103,
|
| 18169 |
+
"grad_norm": 0.6171875,
|
| 18170 |
+
"learning_rate": 0.0009899381342441857,
|
| 18171 |
+
"loss": 8.1894,
|
| 18172 |
+
"step": 2566
|
| 18173 |
+
},
|
| 18174 |
+
{
|
| 18175 |
+
"epoch": 0.11086637298091043,
|
| 18176 |
+
"grad_norm": 0.63671875,
|
| 18177 |
+
"learning_rate": 0.0009899238748304411,
|
| 18178 |
+
"loss": 8.373,
|
| 18179 |
+
"step": 2567
|
| 18180 |
+
},
|
| 18181 |
+
{
|
| 18182 |
+
"epoch": 0.11090956206271055,
|
| 18183 |
+
"grad_norm": 0.5234375,
|
| 18184 |
+
"learning_rate": 0.0009899096054226601,
|
| 18185 |
+
"loss": 8.3343,
|
| 18186 |
+
"step": 2568
|
| 18187 |
+
},
|
| 18188 |
+
{
|
| 18189 |
+
"epoch": 0.11095275114451067,
|
| 18190 |
+
"grad_norm": 0.80859375,
|
| 18191 |
+
"learning_rate": 0.0009898953260211339,
|
| 18192 |
+
"loss": 8.0553,
|
| 18193 |
+
"step": 2569
|
| 18194 |
+
},
|
| 18195 |
+
{
|
| 18196 |
+
"epoch": 0.11099594022631079,
|
| 18197 |
+
"grad_norm": 0.6640625,
|
| 18198 |
+
"learning_rate": 0.0009898810366261535,
|
| 18199 |
+
"loss": 8.664,
|
| 18200 |
+
"step": 2570
|
| 18201 |
+
},
|
| 18202 |
+
{
|
| 18203 |
+
"epoch": 0.11103912930811091,
|
| 18204 |
+
"grad_norm": 0.578125,
|
| 18205 |
+
"learning_rate": 0.0009898667372380107,
|
| 18206 |
+
"loss": 8.3618,
|
| 18207 |
+
"step": 2571
|
| 18208 |
+
},
|
| 18209 |
+
{
|
| 18210 |
+
"epoch": 0.11108231838991103,
|
| 18211 |
+
"grad_norm": 0.49609375,
|
| 18212 |
+
"learning_rate": 0.000989852427856997,
|
| 18213 |
+
"loss": 8.279,
|
| 18214 |
+
"step": 2572
|
| 18215 |
+
},
|
| 18216 |
+
{
|
| 18217 |
+
"epoch": 0.11112550747171115,
|
| 18218 |
+
"grad_norm": 0.6328125,
|
| 18219 |
+
"learning_rate": 0.0009898381084834044,
|
| 18220 |
+
"loss": 8.2579,
|
| 18221 |
+
"step": 2573
|
| 18222 |
+
},
|
| 18223 |
+
{
|
| 18224 |
+
"epoch": 0.11116869655351128,
|
| 18225 |
+
"grad_norm": 0.494140625,
|
| 18226 |
+
"learning_rate": 0.000989823779117525,
|
| 18227 |
+
"loss": 8.2548,
|
| 18228 |
+
"step": 2574
|
| 18229 |
+
},
|
| 18230 |
+
{
|
| 18231 |
+
"epoch": 0.1112118856353114,
|
| 18232 |
+
"grad_norm": 1.15625,
|
| 18233 |
+
"learning_rate": 0.000989809439759651,
|
| 18234 |
+
"loss": 8.7369,
|
| 18235 |
+
"step": 2575
|
| 18236 |
+
},
|
| 18237 |
+
{
|
| 18238 |
+
"epoch": 0.11125507471711152,
|
| 18239 |
+
"grad_norm": 0.6171875,
|
| 18240 |
+
"learning_rate": 0.000989795090410075,
|
| 18241 |
+
"loss": 7.8746,
|
| 18242 |
+
"step": 2576
|
| 18243 |
+
},
|
| 18244 |
+
{
|
| 18245 |
+
"epoch": 0.11129826379891164,
|
| 18246 |
+
"grad_norm": 0.7890625,
|
| 18247 |
+
"learning_rate": 0.0009897807310690898,
|
| 18248 |
+
"loss": 8.6083,
|
| 18249 |
+
"step": 2577
|
| 18250 |
+
},
|
| 18251 |
+
{
|
| 18252 |
+
"epoch": 0.11134145288071176,
|
| 18253 |
+
"grad_norm": 0.640625,
|
| 18254 |
+
"learning_rate": 0.000989766361736988,
|
| 18255 |
+
"loss": 8.4279,
|
| 18256 |
+
"step": 2578
|
| 18257 |
+
},
|
| 18258 |
+
{
|
| 18259 |
+
"epoch": 0.11138464196251188,
|
| 18260 |
+
"grad_norm": 1.1015625,
|
| 18261 |
+
"learning_rate": 0.0009897519824140632,
|
| 18262 |
+
"loss": 8.1736,
|
| 18263 |
+
"step": 2579
|
| 18264 |
+
},
|
| 18265 |
+
{
|
| 18266 |
+
"epoch": 0.111427831044312,
|
| 18267 |
+
"grad_norm": 0.734375,
|
| 18268 |
+
"learning_rate": 0.0009897375931006082,
|
| 18269 |
+
"loss": 8.6102,
|
| 18270 |
+
"step": 2580
|
| 18271 |
+
},
|
| 18272 |
+
{
|
| 18273 |
+
"epoch": 0.11147102012611212,
|
| 18274 |
+
"grad_norm": 0.80078125,
|
| 18275 |
+
"learning_rate": 0.0009897231937969172,
|
| 18276 |
+
"loss": 7.9732,
|
| 18277 |
+
"step": 2581
|
| 18278 |
+
},
|
| 18279 |
+
{
|
| 18280 |
+
"epoch": 0.11151420920791225,
|
| 18281 |
+
"grad_norm": 0.47265625,
|
| 18282 |
+
"learning_rate": 0.0009897087845032832,
|
| 18283 |
+
"loss": 8.2183,
|
| 18284 |
+
"step": 2582
|
| 18285 |
+
},
|
| 18286 |
+
{
|
| 18287 |
+
"epoch": 0.11155739828971237,
|
| 18288 |
+
"grad_norm": 0.46875,
|
| 18289 |
+
"learning_rate": 0.0009896943652200005,
|
| 18290 |
+
"loss": 8.4685,
|
| 18291 |
+
"step": 2583
|
| 18292 |
+
},
|
| 18293 |
+
{
|
| 18294 |
+
"epoch": 0.11160058737151249,
|
| 18295 |
+
"grad_norm": 0.5859375,
|
| 18296 |
+
"learning_rate": 0.0009896799359473635,
|
| 18297 |
+
"loss": 8.4296,
|
| 18298 |
+
"step": 2584
|
| 18299 |
+
},
|
| 18300 |
+
{
|
| 18301 |
+
"epoch": 0.1116437764533126,
|
| 18302 |
+
"grad_norm": 0.474609375,
|
| 18303 |
+
"learning_rate": 0.000989665496685666,
|
| 18304 |
+
"loss": 8.4101,
|
| 18305 |
+
"step": 2585
|
| 18306 |
+
},
|
| 18307 |
+
{
|
| 18308 |
+
"epoch": 0.11168696553511272,
|
| 18309 |
+
"grad_norm": 0.451171875,
|
| 18310 |
+
"learning_rate": 0.0009896510474352027,
|
| 18311 |
+
"loss": 8.5164,
|
| 18312 |
+
"step": 2586
|
| 18313 |
+
},
|
| 18314 |
+
{
|
| 18315 |
+
"epoch": 0.11173015461691284,
|
| 18316 |
+
"grad_norm": 0.55078125,
|
| 18317 |
+
"learning_rate": 0.0009896365881962685,
|
| 18318 |
+
"loss": 8.3774,
|
| 18319 |
+
"step": 2587
|
| 18320 |
+
},
|
| 18321 |
+
{
|
| 18322 |
+
"epoch": 0.11177334369871296,
|
| 18323 |
+
"grad_norm": 0.54296875,
|
| 18324 |
+
"learning_rate": 0.0009896221189691586,
|
| 18325 |
+
"loss": 8.139,
|
| 18326 |
+
"step": 2588
|
| 18327 |
+
},
|
| 18328 |
+
{
|
| 18329 |
+
"epoch": 0.11181653278051308,
|
| 18330 |
+
"grad_norm": 0.65234375,
|
| 18331 |
+
"learning_rate": 0.0009896076397541676,
|
| 18332 |
+
"loss": 8.3887,
|
| 18333 |
+
"step": 2589
|
| 18334 |
+
},
|
| 18335 |
+
{
|
| 18336 |
+
"epoch": 0.1118597218623132,
|
| 18337 |
+
"grad_norm": 0.53515625,
|
| 18338 |
+
"learning_rate": 0.0009895931505515914,
|
| 18339 |
+
"loss": 8.2581,
|
| 18340 |
+
"step": 2590
|
| 18341 |
+
},
|
| 18342 |
+
{
|
| 18343 |
+
"epoch": 0.11190291094411332,
|
| 18344 |
+
"grad_norm": 0.5546875,
|
| 18345 |
+
"learning_rate": 0.000989578651361725,
|
| 18346 |
+
"loss": 8.1766,
|
| 18347 |
+
"step": 2591
|
| 18348 |
+
},
|
| 18349 |
+
{
|
| 18350 |
+
"epoch": 0.11194610002591344,
|
| 18351 |
+
"grad_norm": 0.6171875,
|
| 18352 |
+
"learning_rate": 0.0009895641421848646,
|
| 18353 |
+
"loss": 8.399,
|
| 18354 |
+
"step": 2592
|
| 18355 |
+
},
|
| 18356 |
+
{
|
| 18357 |
+
"epoch": 0.11198928910771357,
|
| 18358 |
+
"grad_norm": 0.6328125,
|
| 18359 |
+
"learning_rate": 0.000989549623021306,
|
| 18360 |
+
"loss": 8.3802,
|
| 18361 |
+
"step": 2593
|
| 18362 |
+
},
|
| 18363 |
+
{
|
| 18364 |
+
"epoch": 0.11203247818951369,
|
| 18365 |
+
"grad_norm": 0.609375,
|
| 18366 |
+
"learning_rate": 0.0009895350938713455,
|
| 18367 |
+
"loss": 8.1978,
|
| 18368 |
+
"step": 2594
|
| 18369 |
+
},
|
| 18370 |
+
{
|
| 18371 |
+
"epoch": 0.11207566727131381,
|
| 18372 |
+
"grad_norm": 0.640625,
|
| 18373 |
+
"learning_rate": 0.0009895205547352794,
|
| 18374 |
+
"loss": 8.2764,
|
| 18375 |
+
"step": 2595
|
| 18376 |
+
},
|
| 18377 |
+
{
|
| 18378 |
+
"epoch": 0.11211885635311393,
|
| 18379 |
+
"grad_norm": 0.66796875,
|
| 18380 |
+
"learning_rate": 0.0009895060056134045,
|
| 18381 |
+
"loss": 8.0888,
|
| 18382 |
+
"step": 2596
|
| 18383 |
+
},
|
| 18384 |
+
{
|
| 18385 |
+
"epoch": 0.11216204543491405,
|
| 18386 |
+
"grad_norm": 0.435546875,
|
| 18387 |
+
"learning_rate": 0.0009894914465060172,
|
| 18388 |
+
"loss": 8.5272,
|
| 18389 |
+
"step": 2597
|
| 18390 |
+
},
|
| 18391 |
+
{
|
| 18392 |
+
"epoch": 0.11220523451671417,
|
| 18393 |
+
"grad_norm": 0.55859375,
|
| 18394 |
+
"learning_rate": 0.0009894768774134147,
|
| 18395 |
+
"loss": 8.2076,
|
| 18396 |
+
"step": 2598
|
| 18397 |
+
},
|
| 18398 |
+
{
|
| 18399 |
+
"epoch": 0.1122484235985143,
|
| 18400 |
+
"grad_norm": 0.52734375,
|
| 18401 |
+
"learning_rate": 0.0009894622983358942,
|
| 18402 |
+
"loss": 8.3313,
|
| 18403 |
+
"step": 2599
|
| 18404 |
+
},
|
| 18405 |
+
{
|
| 18406 |
+
"epoch": 0.11229161268031441,
|
| 18407 |
+
"grad_norm": 0.80078125,
|
| 18408 |
+
"learning_rate": 0.0009894477092737529,
|
| 18409 |
+
"loss": 8.4905,
|
| 18410 |
+
"step": 2600
|
| 18411 |
+
},
|
| 18412 |
+
{
|
| 18413 |
+
"epoch": 0.11229161268031441,
|
| 18414 |
+
"eval_loss": 8.336125373840332,
|
| 18415 |
+
"eval_runtime": 15.4776,
|
| 18416 |
+
"eval_samples_per_second": 1.551,
|
| 18417 |
+
"eval_steps_per_second": 0.194,
|
| 18418 |
+
"step": 2600
|
| 18419 |
}
|
| 18420 |
],
|
| 18421 |
"logging_steps": 1,
|
|
|
|
| 18435 |
"attributes": {}
|
| 18436 |
}
|
| 18437 |
},
|
| 18438 |
+
"total_flos": 8311033036800.0,
|
| 18439 |
"train_batch_size": 1,
|
| 18440 |
"trial_name": null,
|
| 18441 |
"trial_params": null
|