Model save
Browse files- README.md +23 -4
- trainer_state.json +1794 -14
README.md
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
---
|
| 2 |
library_name: transformers
|
| 3 |
tags:
|
| 4 |
-
- smallm
|
| 5 |
- generated_from_trainer
|
| 6 |
model-index:
|
| 7 |
- name: smallm_70_rope
|
|
@@ -15,8 +14,8 @@ should probably proofread and complete it, then remove this comment. -->
|
|
| 15 |
|
| 16 |
This model is a fine-tuned version of [](https://huggingface.co/) on the None dataset.
|
| 17 |
It achieves the following results on the evaluation set:
|
| 18 |
-
- Loss: 2.
|
| 19 |
-
- Num Input Tokens Seen:
|
| 20 |
|
| 21 |
## Model description
|
| 22 |
|
|
@@ -44,7 +43,7 @@ The following hyperparameters were used during training:
|
|
| 44 |
- optimizer: Use OptimizerNames.ADAMW_APEX_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
|
| 45 |
- lr_scheduler_type: warmup_stable_decay
|
| 46 |
- lr_scheduler_warmup_steps: 500
|
| 47 |
-
- training_steps:
|
| 48 |
|
| 49 |
### Training results
|
| 50 |
|
|
@@ -170,6 +169,26 @@ The following hyperparameters were used during training:
|
|
| 170 |
| 2.9856 | 0.3969 | 59000 | 2.8846 | 15466496000 |
|
| 171 |
| 2.9824 | 0.4002 | 59500 | 2.8822 | 15597568000 |
|
| 172 |
| 2.9789 | 0.4036 | 60000 | 2.8819 | 15728640000 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
|
| 175 |
### Framework versions
|
|
|
|
| 1 |
---
|
| 2 |
library_name: transformers
|
| 3 |
tags:
|
|
|
|
| 4 |
- generated_from_trainer
|
| 5 |
model-index:
|
| 6 |
- name: smallm_70_rope
|
|
|
|
| 14 |
|
| 15 |
This model is a fine-tuned version of [](https://huggingface.co/) on the None dataset.
|
| 16 |
It achieves the following results on the evaluation set:
|
| 17 |
+
- Loss: 2.8645
|
| 18 |
+
- Num Input Tokens Seen: 18350080000
|
| 19 |
|
| 20 |
## Model description
|
| 21 |
|
|
|
|
| 43 |
- optimizer: Use OptimizerNames.ADAMW_APEX_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
|
| 44 |
- lr_scheduler_type: warmup_stable_decay
|
| 45 |
- lr_scheduler_warmup_steps: 500
|
| 46 |
+
- training_steps: 70000
|
| 47 |
|
| 48 |
### Training results
|
| 49 |
|
|
|
|
| 169 |
| 2.9856 | 0.3969 | 59000 | 2.8846 | 15466496000 |
|
| 170 |
| 2.9824 | 0.4002 | 59500 | 2.8822 | 15597568000 |
|
| 171 |
| 2.9789 | 0.4036 | 60000 | 2.8819 | 15728640000 |
|
| 172 |
+
| 3.0132 | 0.4070 | 60500 | 2.9149 | 15859712000 |
|
| 173 |
+
| 3.0125 | 0.4103 | 61000 | 2.9137 | 15990784000 |
|
| 174 |
+
| 3.0115 | 0.4137 | 61500 | 2.9049 | 16121856000 |
|
| 175 |
+
| 3.0079 | 0.4170 | 62000 | 2.9013 | 16252928000 |
|
| 176 |
+
| 3.0055 | 0.4204 | 62500 | 2.8968 | 16384000000 |
|
| 177 |
+
| 2.9823 | 0.4238 | 63000 | 2.8930 | 16515072000 |
|
| 178 |
+
| 3.0004 | 0.4271 | 63500 | 2.8904 | 16646144000 |
|
| 179 |
+
| 2.9839 | 0.4305 | 64000 | 2.8860 | 16777216000 |
|
| 180 |
+
| 2.9789 | 0.4339 | 64500 | 2.8814 | 16908288000 |
|
| 181 |
+
| 2.9876 | 0.4372 | 65000 | 2.8793 | 17039360000 |
|
| 182 |
+
| 2.9804 | 0.4406 | 65500 | 2.8758 | 17170432000 |
|
| 183 |
+
| 2.9851 | 0.4439 | 66000 | 2.8729 | 17301504000 |
|
| 184 |
+
| 2.9651 | 0.4473 | 66500 | 2.8710 | 17432576000 |
|
| 185 |
+
| 2.9704 | 0.4507 | 67000 | 2.8692 | 17563648000 |
|
| 186 |
+
| 2.9785 | 0.4540 | 67500 | 2.8678 | 17694720000 |
|
| 187 |
+
| 2.9724 | 0.4574 | 68000 | 2.8663 | 17825792000 |
|
| 188 |
+
| 2.9732 | 0.4608 | 68500 | 2.8653 | 17956864000 |
|
| 189 |
+
| 2.9622 | 0.4641 | 69000 | 2.8648 | 18087936000 |
|
| 190 |
+
| 2.964 | 0.4675 | 69500 | 2.8646 | 18219008000 |
|
| 191 |
+
| 2.9684 | 0.4709 | 70000 | 2.8645 | 18350080000 |
|
| 192 |
|
| 193 |
|
| 194 |
### Framework versions
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10690,20 +10690,1800 @@
|
|
| 10690 |
"step": 60000
|
| 10691 |
},
|
| 10692 |
{
|
| 10693 |
-
"epoch": 0.
|
| 10694 |
-
"
|
| 10695 |
-
"
|
| 10696 |
-
"
|
| 10697 |
-
"
|
| 10698 |
-
"
|
| 10699 |
-
|
| 10700 |
-
|
| 10701 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10702 |
}
|
| 10703 |
],
|
| 10704 |
"logging_steps": 50,
|
| 10705 |
-
"max_steps":
|
| 10706 |
-
"num_input_tokens_seen":
|
| 10707 |
"num_train_epochs": 1,
|
| 10708 |
"save_steps": 1000,
|
| 10709 |
"stateful_callbacks": {
|
|
@@ -10718,7 +12498,7 @@
|
|
| 10718 |
"attributes": {}
|
| 10719 |
}
|
| 10720 |
},
|
| 10721 |
-
"total_flos": 4.
|
| 10722 |
"train_batch_size": 64,
|
| 10723 |
"trial_name": null,
|
| 10724 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4708549211906576,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 70000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10690 |
"step": 60000
|
| 10691 |
},
|
| 10692 |
{
|
| 10693 |
+
"epoch": 0.40392625739284266,
|
| 10694 |
+
"grad_norm": 0.2130047082901001,
|
| 10695 |
+
"learning_rate": 0.0006867974850262581,
|
| 10696 |
+
"loss": 3.0074,
|
| 10697 |
+
"num_input_tokens_seen": 15741747200,
|
| 10698 |
+
"step": 60050
|
| 10699 |
+
},
|
| 10700 |
+
{
|
| 10701 |
+
"epoch": 0.40426258233655027,
|
| 10702 |
+
"grad_norm": 0.18596570193767548,
|
| 10703 |
+
"learning_rate": 0.000682235249939575,
|
| 10704 |
+
"loss": 2.9981,
|
| 10705 |
+
"num_input_tokens_seen": 15754854400,
|
| 10706 |
+
"step": 60100
|
| 10707 |
+
},
|
| 10708 |
+
{
|
| 10709 |
+
"epoch": 0.4045989072802579,
|
| 10710 |
+
"grad_norm": 0.2774942219257355,
|
| 10711 |
+
"learning_rate": 0.0006776554506402081,
|
| 10712 |
+
"loss": 3.0024,
|
| 10713 |
+
"num_input_tokens_seen": 15767961600,
|
| 10714 |
+
"step": 60150
|
| 10715 |
+
},
|
| 10716 |
+
{
|
| 10717 |
+
"epoch": 0.4049352322239655,
|
| 10718 |
+
"grad_norm": 0.19329522550106049,
|
| 10719 |
+
"learning_rate": 0.0006730585285387465,
|
| 10720 |
+
"loss": 3.0101,
|
| 10721 |
+
"num_input_tokens_seen": 15781068800,
|
| 10722 |
+
"step": 60200
|
| 10723 |
+
},
|
| 10724 |
+
{
|
| 10725 |
+
"epoch": 0.4052715571676731,
|
| 10726 |
+
"grad_norm": 0.21384254097938538,
|
| 10727 |
+
"learning_rate": 0.0006684449266961101,
|
| 10728 |
+
"loss": 3.0095,
|
| 10729 |
+
"num_input_tokens_seen": 15794176000,
|
| 10730 |
+
"step": 60250
|
| 10731 |
+
},
|
| 10732 |
+
{
|
| 10733 |
+
"epoch": 0.4056078821113807,
|
| 10734 |
+
"grad_norm": 0.3892166018486023,
|
| 10735 |
+
"learning_rate": 0.0006638150897808468,
|
| 10736 |
+
"loss": 3.0101,
|
| 10737 |
+
"num_input_tokens_seen": 15807283200,
|
| 10738 |
+
"step": 60300
|
| 10739 |
+
},
|
| 10740 |
+
{
|
| 10741 |
+
"epoch": 0.4059442070550883,
|
| 10742 |
+
"grad_norm": 0.27356287837028503,
|
| 10743 |
+
"learning_rate": 0.0006591694640262749,
|
| 10744 |
+
"loss": 3.0322,
|
| 10745 |
+
"num_input_tokens_seen": 15820390400,
|
| 10746 |
+
"step": 60350
|
| 10747 |
+
},
|
| 10748 |
+
{
|
| 10749 |
+
"epoch": 0.40628053199879594,
|
| 10750 |
+
"grad_norm": 0.20498153567314148,
|
| 10751 |
+
"learning_rate": 0.0006545084971874737,
|
| 10752 |
+
"loss": 3.0064,
|
| 10753 |
+
"num_input_tokens_seen": 15833497600,
|
| 10754 |
+
"step": 60400
|
| 10755 |
+
},
|
| 10756 |
+
{
|
| 10757 |
+
"epoch": 0.40661685694250355,
|
| 10758 |
+
"grad_norm": 0.19939659535884857,
|
| 10759 |
+
"learning_rate": 0.0006498326384981283,
|
| 10760 |
+
"loss": 3.0158,
|
| 10761 |
+
"num_input_tokens_seen": 15846604800,
|
| 10762 |
+
"step": 60450
|
| 10763 |
+
},
|
| 10764 |
+
{
|
| 10765 |
+
"epoch": 0.40695318188621116,
|
| 10766 |
+
"grad_norm": 0.24545226991176605,
|
| 10767 |
+
"learning_rate": 0.0006451423386272311,
|
| 10768 |
+
"loss": 3.0132,
|
| 10769 |
+
"num_input_tokens_seen": 15859712000,
|
| 10770 |
+
"step": 60500
|
| 10771 |
+
},
|
| 10772 |
+
{
|
| 10773 |
+
"epoch": 0.40695318188621116,
|
| 10774 |
+
"eval_loss": 2.914865255355835,
|
| 10775 |
+
"eval_runtime": 51.2039,
|
| 10776 |
+
"eval_samples_per_second": 97.649,
|
| 10777 |
+
"eval_steps_per_second": 24.412,
|
| 10778 |
+
"num_input_tokens_seen": 15859712000,
|
| 10779 |
+
"step": 60500
|
| 10780 |
+
},
|
| 10781 |
+
{
|
| 10782 |
+
"epoch": 0.40728950682991877,
|
| 10783 |
+
"grad_norm": 0.2364359349012375,
|
| 10784 |
+
"learning_rate": 0.0006404380496356461,
|
| 10785 |
+
"loss": 3.0102,
|
| 10786 |
+
"num_input_tokens_seen": 15872819200,
|
| 10787 |
+
"step": 60550
|
| 10788 |
+
},
|
| 10789 |
+
{
|
| 10790 |
+
"epoch": 0.4076258317736264,
|
| 10791 |
+
"grad_norm": 0.19283762574195862,
|
| 10792 |
+
"learning_rate": 0.0006357202249325371,
|
| 10793 |
+
"loss": 3.0132,
|
| 10794 |
+
"num_input_tokens_seen": 15885926400,
|
| 10795 |
+
"step": 60600
|
| 10796 |
+
},
|
| 10797 |
+
{
|
| 10798 |
+
"epoch": 0.40796215671733405,
|
| 10799 |
+
"grad_norm": 0.19770501554012299,
|
| 10800 |
+
"learning_rate": 0.0006309893192316686,
|
| 10801 |
+
"loss": 3.0106,
|
| 10802 |
+
"num_input_tokens_seen": 15899033600,
|
| 10803 |
+
"step": 60650
|
| 10804 |
+
},
|
| 10805 |
+
{
|
| 10806 |
+
"epoch": 0.40829848166104166,
|
| 10807 |
+
"grad_norm": 0.18395134806632996,
|
| 10808 |
+
"learning_rate": 0.000626245788507579,
|
| 10809 |
+
"loss": 3.005,
|
| 10810 |
+
"num_input_tokens_seen": 15912140800,
|
| 10811 |
+
"step": 60700
|
| 10812 |
+
},
|
| 10813 |
+
{
|
| 10814 |
+
"epoch": 0.40863480660474927,
|
| 10815 |
+
"grad_norm": 0.21380823850631714,
|
| 10816 |
+
"learning_rate": 0.000621490089951632,
|
| 10817 |
+
"loss": 3.0106,
|
| 10818 |
+
"num_input_tokens_seen": 15925248000,
|
| 10819 |
+
"step": 60750
|
| 10820 |
+
},
|
| 10821 |
+
{
|
| 10822 |
+
"epoch": 0.4089711315484569,
|
| 10823 |
+
"grad_norm": 0.17995478212833405,
|
| 10824 |
+
"learning_rate": 0.0006167226819279528,
|
| 10825 |
+
"loss": 3.0237,
|
| 10826 |
+
"num_input_tokens_seen": 15938355200,
|
| 10827 |
+
"step": 60800
|
| 10828 |
+
},
|
| 10829 |
+
{
|
| 10830 |
+
"epoch": 0.4093074564921645,
|
| 10831 |
+
"grad_norm": 0.31993716955184937,
|
| 10832 |
+
"learning_rate": 0.0006119440239292493,
|
| 10833 |
+
"loss": 3.0158,
|
| 10834 |
+
"num_input_tokens_seen": 15951462400,
|
| 10835 |
+
"step": 60850
|
| 10836 |
+
},
|
| 10837 |
+
{
|
| 10838 |
+
"epoch": 0.4096437814358721,
|
| 10839 |
+
"grad_norm": 0.19210565090179443,
|
| 10840 |
+
"learning_rate": 0.0006071545765325253,
|
| 10841 |
+
"loss": 3.0121,
|
| 10842 |
+
"num_input_tokens_seen": 15964569600,
|
| 10843 |
+
"step": 60900
|
| 10844 |
+
},
|
| 10845 |
+
{
|
| 10846 |
+
"epoch": 0.4099801063795797,
|
| 10847 |
+
"grad_norm": 0.4126472771167755,
|
| 10848 |
+
"learning_rate": 0.0006023548013546899,
|
| 10849 |
+
"loss": 3.0215,
|
| 10850 |
+
"num_input_tokens_seen": 15977676800,
|
| 10851 |
+
"step": 60950
|
| 10852 |
+
},
|
| 10853 |
+
{
|
| 10854 |
+
"epoch": 0.4103164313232873,
|
| 10855 |
+
"grad_norm": 0.26418012380599976,
|
| 10856 |
+
"learning_rate": 0.0005975451610080642,
|
| 10857 |
+
"loss": 3.0125,
|
| 10858 |
+
"num_input_tokens_seen": 15990784000,
|
| 10859 |
+
"step": 61000
|
| 10860 |
+
},
|
| 10861 |
+
{
|
| 10862 |
+
"epoch": 0.4103164313232873,
|
| 10863 |
+
"eval_loss": 2.913696765899658,
|
| 10864 |
+
"eval_runtime": 52.0924,
|
| 10865 |
+
"eval_samples_per_second": 95.983,
|
| 10866 |
+
"eval_steps_per_second": 23.996,
|
| 10867 |
+
"num_input_tokens_seen": 15990784000,
|
| 10868 |
+
"step": 61000
|
| 10869 |
+
},
|
| 10870 |
+
{
|
| 10871 |
+
"epoch": 0.41065275626699493,
|
| 10872 |
+
"grad_norm": 0.3535885810852051,
|
| 10873 |
+
"learning_rate": 0.0005927261190557954,
|
| 10874 |
+
"loss": 3.0102,
|
| 10875 |
+
"num_input_tokens_seen": 16003891200,
|
| 10876 |
+
"step": 61050
|
| 10877 |
+
},
|
| 10878 |
+
{
|
| 10879 |
+
"epoch": 0.41098908121070254,
|
| 10880 |
+
"grad_norm": 0.2633107304573059,
|
| 10881 |
+
"learning_rate": 0.0005878981399671774,
|
| 10882 |
+
"loss": 3.0424,
|
| 10883 |
+
"num_input_tokens_seen": 16016998400,
|
| 10884 |
+
"step": 61100
|
| 10885 |
+
},
|
| 10886 |
+
{
|
| 10887 |
+
"epoch": 0.41132540615441016,
|
| 10888 |
+
"grad_norm": 0.3054018020629883,
|
| 10889 |
+
"learning_rate": 0.0005830616890728827,
|
| 10890 |
+
"loss": 3.0233,
|
| 10891 |
+
"num_input_tokens_seen": 16030105600,
|
| 10892 |
+
"step": 61150
|
| 10893 |
+
},
|
| 10894 |
+
{
|
| 10895 |
+
"epoch": 0.41166173109811777,
|
| 10896 |
+
"grad_norm": 0.21453993022441864,
|
| 10897 |
+
"learning_rate": 0.0005782172325201155,
|
| 10898 |
+
"loss": 3.018,
|
| 10899 |
+
"num_input_tokens_seen": 16043212800,
|
| 10900 |
+
"step": 61200
|
| 10901 |
+
},
|
| 10902 |
+
{
|
| 10903 |
+
"epoch": 0.4119980560418254,
|
| 10904 |
+
"grad_norm": 0.27815598249435425,
|
| 10905 |
+
"learning_rate": 0.0005733652372276809,
|
| 10906 |
+
"loss": 3.0254,
|
| 10907 |
+
"num_input_tokens_seen": 16056320000,
|
| 10908 |
+
"step": 61250
|
| 10909 |
+
},
|
| 10910 |
+
{
|
| 10911 |
+
"epoch": 0.412334380985533,
|
| 10912 |
+
"grad_norm": 0.20687313377857208,
|
| 10913 |
+
"learning_rate": 0.0005685061708409841,
|
| 10914 |
+
"loss": 3.0165,
|
| 10915 |
+
"num_input_tokens_seen": 16069427200,
|
| 10916 |
+
"step": 61300
|
| 10917 |
+
},
|
| 10918 |
+
{
|
| 10919 |
+
"epoch": 0.4126707059292406,
|
| 10920 |
+
"grad_norm": 0.1985252946615219,
|
| 10921 |
+
"learning_rate": 0.0005636405016869566,
|
| 10922 |
+
"loss": 3.0164,
|
| 10923 |
+
"num_input_tokens_seen": 16082534400,
|
| 10924 |
+
"step": 61350
|
| 10925 |
+
},
|
| 10926 |
+
{
|
| 10927 |
+
"epoch": 0.4130070308729482,
|
| 10928 |
+
"grad_norm": 0.26703181862831116,
|
| 10929 |
+
"learning_rate": 0.0005587686987289189,
|
| 10930 |
+
"loss": 3.0001,
|
| 10931 |
+
"num_input_tokens_seen": 16095641600,
|
| 10932 |
+
"step": 61400
|
| 10933 |
+
},
|
| 10934 |
+
{
|
| 10935 |
+
"epoch": 0.4133433558166558,
|
| 10936 |
+
"grad_norm": 0.1948036104440689,
|
| 10937 |
+
"learning_rate": 0.0005538912315213797,
|
| 10938 |
+
"loss": 3.0058,
|
| 10939 |
+
"num_input_tokens_seen": 16108748800,
|
| 10940 |
+
"step": 61450
|
| 10941 |
+
},
|
| 10942 |
+
{
|
| 10943 |
+
"epoch": 0.41367968076036343,
|
| 10944 |
+
"grad_norm": 0.20653308928012848,
|
| 10945 |
+
"learning_rate": 0.0005490085701647804,
|
| 10946 |
+
"loss": 3.0115,
|
| 10947 |
+
"num_input_tokens_seen": 16121856000,
|
| 10948 |
+
"step": 61500
|
| 10949 |
+
},
|
| 10950 |
+
{
|
| 10951 |
+
"epoch": 0.41367968076036343,
|
| 10952 |
+
"eval_loss": 2.9048781394958496,
|
| 10953 |
+
"eval_runtime": 53.8207,
|
| 10954 |
+
"eval_samples_per_second": 92.901,
|
| 10955 |
+
"eval_steps_per_second": 23.225,
|
| 10956 |
+
"num_input_tokens_seen": 16121856000,
|
| 10957 |
+
"step": 61500
|
| 10958 |
+
},
|
| 10959 |
+
{
|
| 10960 |
+
"epoch": 0.41401600570407104,
|
| 10961 |
+
"grad_norm": 0.19605295360088348,
|
| 10962 |
+
"learning_rate": 0.0005441211852601849,
|
| 10963 |
+
"loss": 3.0225,
|
| 10964 |
+
"num_input_tokens_seen": 16134963200,
|
| 10965 |
+
"step": 61550
|
| 10966 |
+
},
|
| 10967 |
+
{
|
| 10968 |
+
"epoch": 0.41435233064777865,
|
| 10969 |
+
"grad_norm": 0.17526155710220337,
|
| 10970 |
+
"learning_rate": 0.0005392295478639225,
|
| 10971 |
+
"loss": 3.0117,
|
| 10972 |
+
"num_input_tokens_seen": 16148070400,
|
| 10973 |
+
"step": 61600
|
| 10974 |
+
},
|
| 10975 |
+
{
|
| 10976 |
+
"epoch": 0.41468865559148627,
|
| 10977 |
+
"grad_norm": 0.17657403647899628,
|
| 10978 |
+
"learning_rate": 0.0005343341294421868,
|
| 10979 |
+
"loss": 3.0107,
|
| 10980 |
+
"num_input_tokens_seen": 16161177600,
|
| 10981 |
+
"step": 61650
|
| 10982 |
+
},
|
| 10983 |
+
{
|
| 10984 |
+
"epoch": 0.4150249805351939,
|
| 10985 |
+
"grad_norm": 0.18658681213855743,
|
| 10986 |
+
"learning_rate": 0.0005294354018255945,
|
| 10987 |
+
"loss": 3.0085,
|
| 10988 |
+
"num_input_tokens_seen": 16174284800,
|
| 10989 |
+
"step": 61700
|
| 10990 |
+
},
|
| 10991 |
+
{
|
| 10992 |
+
"epoch": 0.4153613054789015,
|
| 10993 |
+
"grad_norm": 0.24781519174575806,
|
| 10994 |
+
"learning_rate": 0.0005245338371637091,
|
| 10995 |
+
"loss": 2.9939,
|
| 10996 |
+
"num_input_tokens_seen": 16187392000,
|
| 10997 |
+
"step": 61750
|
| 10998 |
+
},
|
| 10999 |
+
{
|
| 11000 |
+
"epoch": 0.4156976304226091,
|
| 11001 |
+
"grad_norm": 0.20824941992759705,
|
| 11002 |
+
"learning_rate": 0.0005196299078795343,
|
| 11003 |
+
"loss": 3.0038,
|
| 11004 |
+
"num_input_tokens_seen": 16200499200,
|
| 11005 |
+
"step": 61800
|
| 11006 |
+
},
|
| 11007 |
+
{
|
| 11008 |
+
"epoch": 0.4160339553663167,
|
| 11009 |
+
"grad_norm": 0.38262441754341125,
|
| 11010 |
+
"learning_rate": 0.0005147240866239817,
|
| 11011 |
+
"loss": 3.0141,
|
| 11012 |
+
"num_input_tokens_seen": 16213606400,
|
| 11013 |
+
"step": 61850
|
| 11014 |
+
},
|
| 11015 |
+
{
|
| 11016 |
+
"epoch": 0.4163702803100243,
|
| 11017 |
+
"grad_norm": 0.200628861784935,
|
| 11018 |
+
"learning_rate": 0.0005098168462303141,
|
| 11019 |
+
"loss": 3.0187,
|
| 11020 |
+
"num_input_tokens_seen": 16226713600,
|
| 11021 |
+
"step": 61900
|
| 11022 |
+
},
|
| 11023 |
+
{
|
| 11024 |
+
"epoch": 0.41670660525373193,
|
| 11025 |
+
"grad_norm": 0.18858259916305542,
|
| 11026 |
+
"learning_rate": 0.000504908659668575,
|
| 11027 |
+
"loss": 3.0049,
|
| 11028 |
+
"num_input_tokens_seen": 16239820800,
|
| 11029 |
+
"step": 61950
|
| 11030 |
+
},
|
| 11031 |
+
{
|
| 11032 |
+
"epoch": 0.41704293019743954,
|
| 11033 |
+
"grad_norm": 0.19025108218193054,
|
| 11034 |
+
"learning_rate": 0.0005,
|
| 11035 |
+
"loss": 3.0079,
|
| 11036 |
+
"num_input_tokens_seen": 16252928000,
|
| 11037 |
+
"step": 62000
|
| 11038 |
+
},
|
| 11039 |
+
{
|
| 11040 |
+
"epoch": 0.41704293019743954,
|
| 11041 |
+
"eval_loss": 2.9012608528137207,
|
| 11042 |
+
"eval_runtime": 52.7052,
|
| 11043 |
+
"eval_samples_per_second": 94.867,
|
| 11044 |
+
"eval_steps_per_second": 23.717,
|
| 11045 |
+
"num_input_tokens_seen": 16252928000,
|
| 11046 |
+
"step": 62000
|
| 11047 |
+
},
|
| 11048 |
+
{
|
| 11049 |
+
"epoch": 0.41737925514114715,
|
| 11050 |
+
"grad_norm": 0.19505389034748077,
|
| 11051 |
+
"learning_rate": 0.0004950913403314252,
|
| 11052 |
+
"loss": 2.9995,
|
| 11053 |
+
"num_input_tokens_seen": 16266035200,
|
| 11054 |
+
"step": 62050
|
| 11055 |
+
},
|
| 11056 |
+
{
|
| 11057 |
+
"epoch": 0.41771558008485477,
|
| 11058 |
+
"grad_norm": 0.18988089263439178,
|
| 11059 |
+
"learning_rate": 0.0004901831537696859,
|
| 11060 |
+
"loss": 3.0041,
|
| 11061 |
+
"num_input_tokens_seen": 16279142400,
|
| 11062 |
+
"step": 62100
|
| 11063 |
+
},
|
| 11064 |
+
{
|
| 11065 |
+
"epoch": 0.4180519050285624,
|
| 11066 |
+
"grad_norm": 0.19544407725334167,
|
| 11067 |
+
"learning_rate": 0.0004852759133760184,
|
| 11068 |
+
"loss": 3.0073,
|
| 11069 |
+
"num_input_tokens_seen": 16292249600,
|
| 11070 |
+
"step": 62150
|
| 11071 |
+
},
|
| 11072 |
+
{
|
| 11073 |
+
"epoch": 0.41838822997227,
|
| 11074 |
+
"grad_norm": 0.1884351521730423,
|
| 11075 |
+
"learning_rate": 0.00048037009212046586,
|
| 11076 |
+
"loss": 3.0035,
|
| 11077 |
+
"num_input_tokens_seen": 16305356800,
|
| 11078 |
+
"step": 62200
|
| 11079 |
+
},
|
| 11080 |
+
{
|
| 11081 |
+
"epoch": 0.4187245549159776,
|
| 11082 |
+
"grad_norm": 0.17927390336990356,
|
| 11083 |
+
"learning_rate": 0.000475466162836291,
|
| 11084 |
+
"loss": 2.9921,
|
| 11085 |
+
"num_input_tokens_seen": 16318464000,
|
| 11086 |
+
"step": 62250
|
| 11087 |
+
},
|
| 11088 |
+
{
|
| 11089 |
+
"epoch": 0.4190608798596852,
|
| 11090 |
+
"grad_norm": 0.18687283992767334,
|
| 11091 |
+
"learning_rate": 0.00047056459817440544,
|
| 11092 |
+
"loss": 3.0042,
|
| 11093 |
+
"num_input_tokens_seen": 16331571200,
|
| 11094 |
+
"step": 62300
|
| 11095 |
+
},
|
| 11096 |
+
{
|
| 11097 |
+
"epoch": 0.4193972048033928,
|
| 11098 |
+
"grad_norm": 0.18783149123191833,
|
| 11099 |
+
"learning_rate": 0.00046566587055781316,
|
| 11100 |
+
"loss": 3.0003,
|
| 11101 |
+
"num_input_tokens_seen": 16344678400,
|
| 11102 |
+
"step": 62350
|
| 11103 |
+
},
|
| 11104 |
+
{
|
| 11105 |
+
"epoch": 0.41973352974710043,
|
| 11106 |
+
"grad_norm": 0.18625770509243011,
|
| 11107 |
+
"learning_rate": 0.0004607704521360776,
|
| 11108 |
+
"loss": 3.0061,
|
| 11109 |
+
"num_input_tokens_seen": 16357785600,
|
| 11110 |
+
"step": 62400
|
| 11111 |
+
},
|
| 11112 |
+
{
|
| 11113 |
+
"epoch": 0.4200698546908081,
|
| 11114 |
+
"grad_norm": 0.20189669728279114,
|
| 11115 |
+
"learning_rate": 0.00045587881473981533,
|
| 11116 |
+
"loss": 2.9976,
|
| 11117 |
+
"num_input_tokens_seen": 16370892800,
|
| 11118 |
+
"step": 62450
|
| 11119 |
+
},
|
| 11120 |
+
{
|
| 11121 |
+
"epoch": 0.4204061796345157,
|
| 11122 |
+
"grad_norm": 0.19049198925495148,
|
| 11123 |
+
"learning_rate": 0.0004509914298352197,
|
| 11124 |
+
"loss": 3.0055,
|
| 11125 |
+
"num_input_tokens_seen": 16384000000,
|
| 11126 |
+
"step": 62500
|
| 11127 |
+
},
|
| 11128 |
+
{
|
| 11129 |
+
"epoch": 0.4204061796345157,
|
| 11130 |
+
"eval_loss": 2.896798849105835,
|
| 11131 |
+
"eval_runtime": 52.8908,
|
| 11132 |
+
"eval_samples_per_second": 94.534,
|
| 11133 |
+
"eval_steps_per_second": 23.634,
|
| 11134 |
+
"num_input_tokens_seen": 16384000000,
|
| 11135 |
+
"step": 62500
|
| 11136 |
+
},
|
| 11137 |
+
{
|
| 11138 |
+
"epoch": 0.4207425045782233,
|
| 11139 |
+
"grad_norm": 0.1667575091123581,
|
| 11140 |
+
"learning_rate": 0.00044610876847862033,
|
| 11141 |
+
"loss": 2.9929,
|
| 11142 |
+
"num_input_tokens_seen": 16397107200,
|
| 11143 |
+
"step": 62550
|
| 11144 |
+
},
|
| 11145 |
+
{
|
| 11146 |
+
"epoch": 0.42107882952193093,
|
| 11147 |
+
"grad_norm": 0.7176526188850403,
|
| 11148 |
+
"learning_rate": 0.00044123130127108126,
|
| 11149 |
+
"loss": 2.9918,
|
| 11150 |
+
"num_input_tokens_seen": 16410214400,
|
| 11151 |
+
"step": 62600
|
| 11152 |
+
},
|
| 11153 |
+
{
|
| 11154 |
+
"epoch": 0.42141515446563854,
|
| 11155 |
+
"grad_norm": 0.20578069984912872,
|
| 11156 |
+
"learning_rate": 0.00043635949831304343,
|
| 11157 |
+
"loss": 3.0037,
|
| 11158 |
+
"num_input_tokens_seen": 16423321600,
|
| 11159 |
+
"step": 62650
|
| 11160 |
+
},
|
| 11161 |
+
{
|
| 11162 |
+
"epoch": 0.42175147940934615,
|
| 11163 |
+
"grad_norm": 0.19712655246257782,
|
| 11164 |
+
"learning_rate": 0.0004314938291590161,
|
| 11165 |
+
"loss": 3.0142,
|
| 11166 |
+
"num_input_tokens_seen": 16436428800,
|
| 11167 |
+
"step": 62700
|
| 11168 |
+
},
|
| 11169 |
+
{
|
| 11170 |
+
"epoch": 0.42208780435305376,
|
| 11171 |
+
"grad_norm": 0.20189446210861206,
|
| 11172 |
+
"learning_rate": 0.00042663476277231917,
|
| 11173 |
+
"loss": 2.9983,
|
| 11174 |
+
"num_input_tokens_seen": 16449536000,
|
| 11175 |
+
"step": 62750
|
| 11176 |
+
},
|
| 11177 |
+
{
|
| 11178 |
+
"epoch": 0.4224241292967614,
|
| 11179 |
+
"grad_norm": 0.18463867902755737,
|
| 11180 |
+
"learning_rate": 0.0004217827674798845,
|
| 11181 |
+
"loss": 2.9971,
|
| 11182 |
+
"num_input_tokens_seen": 16462643200,
|
| 11183 |
+
"step": 62800
|
| 11184 |
+
},
|
| 11185 |
+
{
|
| 11186 |
+
"epoch": 0.422760454240469,
|
| 11187 |
+
"grad_norm": 0.17639389634132385,
|
| 11188 |
+
"learning_rate": 0.0004169383109271174,
|
| 11189 |
+
"loss": 3.0032,
|
| 11190 |
+
"num_input_tokens_seen": 16475750400,
|
| 11191 |
+
"step": 62850
|
| 11192 |
+
},
|
| 11193 |
+
{
|
| 11194 |
+
"epoch": 0.4230967791841766,
|
| 11195 |
+
"grad_norm": 0.1733781099319458,
|
| 11196 |
+
"learning_rate": 0.00041210186003282274,
|
| 11197 |
+
"loss": 2.9932,
|
| 11198 |
+
"num_input_tokens_seen": 16488857600,
|
| 11199 |
+
"step": 62900
|
| 11200 |
+
},
|
| 11201 |
+
{
|
| 11202 |
+
"epoch": 0.4234331041278842,
|
| 11203 |
+
"grad_norm": 0.17753124237060547,
|
| 11204 |
+
"learning_rate": 0.00040727388094420456,
|
| 11205 |
+
"loss": 3.0012,
|
| 11206 |
+
"num_input_tokens_seen": 16501964800,
|
| 11207 |
+
"step": 62950
|
| 11208 |
+
},
|
| 11209 |
+
{
|
| 11210 |
+
"epoch": 0.4237694290715918,
|
| 11211 |
+
"grad_norm": 0.180925652384758,
|
| 11212 |
+
"learning_rate": 0.00040245483899193594,
|
| 11213 |
+
"loss": 2.9823,
|
| 11214 |
+
"num_input_tokens_seen": 16515072000,
|
| 11215 |
+
"step": 63000
|
| 11216 |
+
},
|
| 11217 |
+
{
|
| 11218 |
+
"epoch": 0.4237694290715918,
|
| 11219 |
+
"eval_loss": 2.8929545879364014,
|
| 11220 |
+
"eval_runtime": 53.37,
|
| 11221 |
+
"eval_samples_per_second": 93.686,
|
| 11222 |
+
"eval_steps_per_second": 23.421,
|
| 11223 |
+
"num_input_tokens_seen": 16515072000,
|
| 11224 |
+
"step": 63000
|
| 11225 |
+
},
|
| 11226 |
+
{
|
| 11227 |
+
"epoch": 0.42410575401529943,
|
| 11228 |
+
"grad_norm": 0.15995506942272186,
|
| 11229 |
+
"learning_rate": 0.00039764519864531023,
|
| 11230 |
+
"loss": 2.9898,
|
| 11231 |
+
"num_input_tokens_seen": 16528179200,
|
| 11232 |
+
"step": 63050
|
| 11233 |
+
},
|
| 11234 |
+
{
|
| 11235 |
+
"epoch": 0.42444207895900704,
|
| 11236 |
+
"grad_norm": 0.16034817695617676,
|
| 11237 |
+
"learning_rate": 0.0003928454234674747,
|
| 11238 |
+
"loss": 2.9884,
|
| 11239 |
+
"num_input_tokens_seen": 16541286400,
|
| 11240 |
+
"step": 63100
|
| 11241 |
+
},
|
| 11242 |
+
{
|
| 11243 |
+
"epoch": 0.42477840390271465,
|
| 11244 |
+
"grad_norm": 0.17681469023227692,
|
| 11245 |
+
"learning_rate": 0.00038805597607075075,
|
| 11246 |
+
"loss": 2.9952,
|
| 11247 |
+
"num_input_tokens_seen": 16554393600,
|
| 11248 |
+
"step": 63150
|
| 11249 |
+
},
|
| 11250 |
+
{
|
| 11251 |
+
"epoch": 0.42511472884642226,
|
| 11252 |
+
"grad_norm": 0.18527273833751678,
|
| 11253 |
+
"learning_rate": 0.00038327731807204744,
|
| 11254 |
+
"loss": 2.9947,
|
| 11255 |
+
"num_input_tokens_seen": 16567500800,
|
| 11256 |
+
"step": 63200
|
| 11257 |
+
},
|
| 11258 |
+
{
|
| 11259 |
+
"epoch": 0.4254510537901299,
|
| 11260 |
+
"grad_norm": 0.16262546181678772,
|
| 11261 |
+
"learning_rate": 0.0003785099100483681,
|
| 11262 |
+
"loss": 2.9972,
|
| 11263 |
+
"num_input_tokens_seen": 16580608000,
|
| 11264 |
+
"step": 63250
|
| 11265 |
+
},
|
| 11266 |
+
{
|
| 11267 |
+
"epoch": 0.4257873787338375,
|
| 11268 |
+
"grad_norm": 0.1709870994091034,
|
| 11269 |
+
"learning_rate": 0.00037375421149242103,
|
| 11270 |
+
"loss": 2.999,
|
| 11271 |
+
"num_input_tokens_seen": 16593715200,
|
| 11272 |
+
"step": 63300
|
| 11273 |
+
},
|
| 11274 |
+
{
|
| 11275 |
+
"epoch": 0.4261237036775451,
|
| 11276 |
+
"grad_norm": 0.1716383844614029,
|
| 11277 |
+
"learning_rate": 0.0003690106807683313,
|
| 11278 |
+
"loss": 2.9964,
|
| 11279 |
+
"num_input_tokens_seen": 16606822400,
|
| 11280 |
+
"step": 63350
|
| 11281 |
+
},
|
| 11282 |
+
{
|
| 11283 |
+
"epoch": 0.4264600286212527,
|
| 11284 |
+
"grad_norm": 0.18682868778705597,
|
| 11285 |
+
"learning_rate": 0.0003642797750674629,
|
| 11286 |
+
"loss": 3.0037,
|
| 11287 |
+
"num_input_tokens_seen": 16619929600,
|
| 11288 |
+
"step": 63400
|
| 11289 |
+
},
|
| 11290 |
+
{
|
| 11291 |
+
"epoch": 0.4267963535649603,
|
| 11292 |
+
"grad_norm": 0.16003596782684326,
|
| 11293 |
+
"learning_rate": 0.00035956195036435405,
|
| 11294 |
+
"loss": 2.9893,
|
| 11295 |
+
"num_input_tokens_seen": 16633036800,
|
| 11296 |
+
"step": 63450
|
| 11297 |
+
},
|
| 11298 |
+
{
|
| 11299 |
+
"epoch": 0.42713267850866793,
|
| 11300 |
+
"grad_norm": 0.17876048386096954,
|
| 11301 |
+
"learning_rate": 0.0003548576613727689,
|
| 11302 |
+
"loss": 3.0004,
|
| 11303 |
+
"num_input_tokens_seen": 16646144000,
|
| 11304 |
+
"step": 63500
|
| 11305 |
+
},
|
| 11306 |
+
{
|
| 11307 |
+
"epoch": 0.42713267850866793,
|
| 11308 |
+
"eval_loss": 2.8903579711914062,
|
| 11309 |
+
"eval_runtime": 53.0482,
|
| 11310 |
+
"eval_samples_per_second": 94.254,
|
| 11311 |
+
"eval_steps_per_second": 23.563,
|
| 11312 |
+
"num_input_tokens_seen": 16646144000,
|
| 11313 |
+
"step": 63500
|
| 11314 |
+
},
|
| 11315 |
+
{
|
| 11316 |
+
"epoch": 0.42746900345237554,
|
| 11317 |
+
"grad_norm": 0.21229425072669983,
|
| 11318 |
+
"learning_rate": 0.00035016736150187165,
|
| 11319 |
+
"loss": 2.9925,
|
| 11320 |
+
"num_input_tokens_seen": 16659251200,
|
| 11321 |
+
"step": 63550
|
| 11322 |
+
},
|
| 11323 |
+
{
|
| 11324 |
+
"epoch": 0.42780532839608315,
|
| 11325 |
+
"grad_norm": 0.19477584958076477,
|
| 11326 |
+
"learning_rate": 0.00034549150281252633,
|
| 11327 |
+
"loss": 2.9892,
|
| 11328 |
+
"num_input_tokens_seen": 16672358400,
|
| 11329 |
+
"step": 63600
|
| 11330 |
+
},
|
| 11331 |
+
{
|
| 11332 |
+
"epoch": 0.42814165333979076,
|
| 11333 |
+
"grad_norm": 0.1866609901189804,
|
| 11334 |
+
"learning_rate": 0.0003408305359737252,
|
| 11335 |
+
"loss": 2.9913,
|
| 11336 |
+
"num_input_tokens_seen": 16685465600,
|
| 11337 |
+
"step": 63650
|
| 11338 |
+
},
|
| 11339 |
+
{
|
| 11340 |
+
"epoch": 0.4284779782834984,
|
| 11341 |
+
"grad_norm": 0.19487887620925903,
|
| 11342 |
+
"learning_rate": 0.0003361849102191533,
|
| 11343 |
+
"loss": 2.9875,
|
| 11344 |
+
"num_input_tokens_seen": 16698572800,
|
| 11345 |
+
"step": 63700
|
| 11346 |
+
},
|
| 11347 |
+
{
|
| 11348 |
+
"epoch": 0.428814303227206,
|
| 11349 |
+
"grad_norm": 0.15979841351509094,
|
| 11350 |
+
"learning_rate": 0.00033155507330389,
|
| 11351 |
+
"loss": 2.9894,
|
| 11352 |
+
"num_input_tokens_seen": 16711680000,
|
| 11353 |
+
"step": 63750
|
| 11354 |
+
},
|
| 11355 |
+
{
|
| 11356 |
+
"epoch": 0.4291506281709136,
|
| 11357 |
+
"grad_norm": 0.1749998778104782,
|
| 11358 |
+
"learning_rate": 0.0003269414714612534,
|
| 11359 |
+
"loss": 2.9945,
|
| 11360 |
+
"num_input_tokens_seen": 16724787200,
|
| 11361 |
+
"step": 63800
|
| 11362 |
+
},
|
| 11363 |
+
{
|
| 11364 |
+
"epoch": 0.4294869531146212,
|
| 11365 |
+
"grad_norm": 0.16839075088500977,
|
| 11366 |
+
"learning_rate": 0.00032234454935979205,
|
| 11367 |
+
"loss": 2.9989,
|
| 11368 |
+
"num_input_tokens_seen": 16737894400,
|
| 11369 |
+
"step": 63850
|
| 11370 |
+
},
|
| 11371 |
+
{
|
| 11372 |
+
"epoch": 0.4298232780583288,
|
| 11373 |
+
"grad_norm": 0.19226372241973877,
|
| 11374 |
+
"learning_rate": 0.0003177647500604252,
|
| 11375 |
+
"loss": 2.9854,
|
| 11376 |
+
"num_input_tokens_seen": 16751001600,
|
| 11377 |
+
"step": 63900
|
| 11378 |
+
},
|
| 11379 |
+
{
|
| 11380 |
+
"epoch": 0.43015960300203643,
|
| 11381 |
+
"grad_norm": 0.15530380606651306,
|
| 11382 |
+
"learning_rate": 0.0003132025149737419,
|
| 11383 |
+
"loss": 2.9903,
|
| 11384 |
+
"num_input_tokens_seen": 16764108800,
|
| 11385 |
+
"step": 63950
|
| 11386 |
+
},
|
| 11387 |
+
{
|
| 11388 |
+
"epoch": 0.43049592794574404,
|
| 11389 |
+
"grad_norm": 0.17773845791816711,
|
| 11390 |
+
"learning_rate": 0.0003086582838174551,
|
| 11391 |
+
"loss": 2.9839,
|
| 11392 |
+
"num_input_tokens_seen": 16777216000,
|
| 11393 |
+
"step": 64000
|
| 11394 |
+
},
|
| 11395 |
+
{
|
| 11396 |
+
"epoch": 0.43049592794574404,
|
| 11397 |
+
"eval_loss": 2.8860437870025635,
|
| 11398 |
+
"eval_runtime": 53.1514,
|
| 11399 |
+
"eval_samples_per_second": 94.071,
|
| 11400 |
+
"eval_steps_per_second": 23.518,
|
| 11401 |
+
"num_input_tokens_seen": 16777216000,
|
| 11402 |
+
"step": 64000
|
| 11403 |
+
},
|
| 11404 |
+
{
|
| 11405 |
+
"epoch": 0.43083225288945165,
|
| 11406 |
+
"grad_norm": 0.15883377194404602,
|
| 11407 |
+
"learning_rate": 0.000304132494574022,
|
| 11408 |
+
"loss": 2.9851,
|
| 11409 |
+
"num_input_tokens_seen": 16790323200,
|
| 11410 |
+
"step": 64050
|
| 11411 |
+
},
|
| 11412 |
+
{
|
| 11413 |
+
"epoch": 0.43116857783315926,
|
| 11414 |
+
"grad_norm": 0.176467627286911,
|
| 11415 |
+
"learning_rate": 0.00029962558344842963,
|
| 11416 |
+
"loss": 2.9865,
|
| 11417 |
+
"num_input_tokens_seen": 16803430400,
|
| 11418 |
+
"step": 64100
|
| 11419 |
+
},
|
| 11420 |
+
{
|
| 11421 |
+
"epoch": 0.43150490277686687,
|
| 11422 |
+
"grad_norm": 0.16392388939857483,
|
| 11423 |
+
"learning_rate": 0.00029513798482615227,
|
| 11424 |
+
"loss": 2.9788,
|
| 11425 |
+
"num_input_tokens_seen": 16816537600,
|
| 11426 |
+
"step": 64150
|
| 11427 |
+
},
|
| 11428 |
+
{
|
| 11429 |
+
"epoch": 0.4318412277205745,
|
| 11430 |
+
"grad_norm": 0.15614169836044312,
|
| 11431 |
+
"learning_rate": 0.0002906701312312861,
|
| 11432 |
+
"loss": 2.9769,
|
| 11433 |
+
"num_input_tokens_seen": 16829644800,
|
| 11434 |
+
"step": 64200
|
| 11435 |
+
},
|
| 11436 |
+
{
|
| 11437 |
+
"epoch": 0.43217755266428215,
|
| 11438 |
+
"grad_norm": 0.16225555539131165,
|
| 11439 |
+
"learning_rate": 0.00028622245328485907,
|
| 11440 |
+
"loss": 2.9881,
|
| 11441 |
+
"num_input_tokens_seen": 16842752000,
|
| 11442 |
+
"step": 64250
|
| 11443 |
+
},
|
| 11444 |
+
{
|
| 11445 |
+
"epoch": 0.43251387760798976,
|
| 11446 |
+
"grad_norm": 0.16419048607349396,
|
| 11447 |
+
"learning_rate": 0.0002817953796633289,
|
| 11448 |
+
"loss": 2.99,
|
| 11449 |
+
"num_input_tokens_seen": 16855859200,
|
| 11450 |
+
"step": 64300
|
| 11451 |
+
},
|
| 11452 |
+
{
|
| 11453 |
+
"epoch": 0.43285020255169737,
|
| 11454 |
+
"grad_norm": 0.16654469072818756,
|
| 11455 |
+
"learning_rate": 0.000277389337057266,
|
| 11456 |
+
"loss": 2.9919,
|
| 11457 |
+
"num_input_tokens_seen": 16868966400,
|
| 11458 |
+
"step": 64350
|
| 11459 |
+
},
|
| 11460 |
+
{
|
| 11461 |
+
"epoch": 0.433186527495405,
|
| 11462 |
+
"grad_norm": 0.1688661277294159,
|
| 11463 |
+
"learning_rate": 0.00027300475013022663,
|
| 11464 |
+
"loss": 2.9844,
|
| 11465 |
+
"num_input_tokens_seen": 16882073600,
|
| 11466 |
+
"step": 64400
|
| 11467 |
+
},
|
| 11468 |
+
{
|
| 11469 |
+
"epoch": 0.4335228524391126,
|
| 11470 |
+
"grad_norm": 0.162180095911026,
|
| 11471 |
+
"learning_rate": 0.000268642041477825,
|
| 11472 |
+
"loss": 2.9847,
|
| 11473 |
+
"num_input_tokens_seen": 16895180800,
|
| 11474 |
+
"step": 64450
|
| 11475 |
+
},
|
| 11476 |
+
{
|
| 11477 |
+
"epoch": 0.4338591773828202,
|
| 11478 |
+
"grad_norm": 0.18244421482086182,
|
| 11479 |
+
"learning_rate": 0.00026430163158700117,
|
| 11480 |
+
"loss": 2.9789,
|
| 11481 |
+
"num_input_tokens_seen": 16908288000,
|
| 11482 |
+
"step": 64500
|
| 11483 |
+
},
|
| 11484 |
+
{
|
| 11485 |
+
"epoch": 0.4338591773828202,
|
| 11486 |
+
"eval_loss": 2.8813860416412354,
|
| 11487 |
+
"eval_runtime": 53.1806,
|
| 11488 |
+
"eval_samples_per_second": 94.019,
|
| 11489 |
+
"eval_steps_per_second": 23.505,
|
| 11490 |
+
"num_input_tokens_seen": 16908288000,
|
| 11491 |
+
"step": 64500
|
| 11492 |
+
},
|
| 11493 |
+
{
|
| 11494 |
+
"epoch": 0.4341955023265278,
|
| 11495 |
+
"grad_norm": 0.15887753665447235,
|
| 11496 |
+
"learning_rate": 0.00025998393879549445,
|
| 11497 |
+
"loss": 2.9723,
|
| 11498 |
+
"num_input_tokens_seen": 16921395200,
|
| 11499 |
+
"step": 64550
|
| 11500 |
+
},
|
| 11501 |
+
{
|
| 11502 |
+
"epoch": 0.4345318272702354,
|
| 11503 |
+
"grad_norm": 0.17573221027851105,
|
| 11504 |
+
"learning_rate": 0.0002556893792515227,
|
| 11505 |
+
"loss": 2.99,
|
| 11506 |
+
"num_input_tokens_seen": 16934502400,
|
| 11507 |
+
"step": 64600
|
| 11508 |
+
},
|
| 11509 |
+
{
|
| 11510 |
+
"epoch": 0.43486815221394304,
|
| 11511 |
+
"grad_norm": 0.1790430247783661,
|
| 11512 |
+
"learning_rate": 0.0002514183668736727,
|
| 11513 |
+
"loss": 2.9887,
|
| 11514 |
+
"num_input_tokens_seen": 16947609600,
|
| 11515 |
+
"step": 64650
|
| 11516 |
+
},
|
| 11517 |
+
{
|
| 11518 |
+
"epoch": 0.43520447715765065,
|
| 11519 |
+
"grad_norm": 0.16031622886657715,
|
| 11520 |
+
"learning_rate": 0.0002471713133110078,
|
| 11521 |
+
"loss": 2.9835,
|
| 11522 |
+
"num_input_tokens_seen": 16960716800,
|
| 11523 |
+
"step": 64700
|
| 11524 |
+
},
|
| 11525 |
+
{
|
| 11526 |
+
"epoch": 0.43554080210135826,
|
| 11527 |
+
"grad_norm": 0.1702345311641693,
|
| 11528 |
+
"learning_rate": 0.0002429486279033892,
|
| 11529 |
+
"loss": 2.9862,
|
| 11530 |
+
"num_input_tokens_seen": 16973824000,
|
| 11531 |
+
"step": 64750
|
| 11532 |
+
},
|
| 11533 |
+
{
|
| 11534 |
+
"epoch": 0.43587712704506587,
|
| 11535 |
+
"grad_norm": 0.16080138087272644,
|
| 11536 |
+
"learning_rate": 0.00023875071764202561,
|
| 11537 |
+
"loss": 2.9785,
|
| 11538 |
+
"num_input_tokens_seen": 16986931200,
|
| 11539 |
+
"step": 64800
|
| 11540 |
+
},
|
| 11541 |
+
{
|
| 11542 |
+
"epoch": 0.4362134519887735,
|
| 11543 |
+
"grad_norm": 0.17694465816020966,
|
| 11544 |
+
"learning_rate": 0.0002345779871302453,
|
| 11545 |
+
"loss": 2.9962,
|
| 11546 |
+
"num_input_tokens_seen": 17000038400,
|
| 11547 |
+
"step": 64850
|
| 11548 |
+
},
|
| 11549 |
+
{
|
| 11550 |
+
"epoch": 0.4365497769324811,
|
| 11551 |
+
"grad_norm": 0.15310978889465332,
|
| 11552 |
+
"learning_rate": 0.00023043083854449987,
|
| 11553 |
+
"loss": 2.98,
|
| 11554 |
+
"num_input_tokens_seen": 17013145600,
|
| 11555 |
+
"step": 64900
|
| 11556 |
+
},
|
| 11557 |
+
{
|
| 11558 |
+
"epoch": 0.4368861018761887,
|
| 11559 |
+
"grad_norm": 0.15505504608154297,
|
| 11560 |
+
"learning_rate": 0.0002263096715956019,
|
| 11561 |
+
"loss": 2.9825,
|
| 11562 |
+
"num_input_tokens_seen": 17026252800,
|
| 11563 |
+
"step": 64950
|
| 11564 |
+
},
|
| 11565 |
+
{
|
| 11566 |
+
"epoch": 0.4372224268198963,
|
| 11567 |
+
"grad_norm": 0.15211448073387146,
|
| 11568 |
+
"learning_rate": 0.00022221488349019903,
|
| 11569 |
+
"loss": 2.9876,
|
| 11570 |
+
"num_input_tokens_seen": 17039360000,
|
| 11571 |
+
"step": 65000
|
| 11572 |
+
},
|
| 11573 |
+
{
|
| 11574 |
+
"epoch": 0.4372224268198963,
|
| 11575 |
+
"eval_loss": 2.8792829513549805,
|
| 11576 |
+
"eval_runtime": 53.0249,
|
| 11577 |
+
"eval_samples_per_second": 94.295,
|
| 11578 |
+
"eval_steps_per_second": 23.574,
|
| 11579 |
+
"num_input_tokens_seen": 17039360000,
|
| 11580 |
+
"step": 65000
|
| 11581 |
+
},
|
| 11582 |
+
{
|
| 11583 |
+
"epoch": 0.4375587517636039,
|
| 11584 |
+
"grad_norm": 0.16188842058181763,
|
| 11585 |
+
"learning_rate": 0.00021814686889249158,
|
| 11586 |
+
"loss": 2.9812,
|
| 11587 |
+
"num_input_tokens_seen": 17052467200,
|
| 11588 |
+
"step": 65050
|
| 11589 |
+
},
|
| 11590 |
+
{
|
| 11591 |
+
"epoch": 0.43789507670731154,
|
| 11592 |
+
"grad_norm": 0.14550812542438507,
|
| 11593 |
+
"learning_rate": 0.00021410601988619394,
|
| 11594 |
+
"loss": 2.9856,
|
| 11595 |
+
"num_input_tokens_seen": 17065574400,
|
| 11596 |
+
"step": 65100
|
| 11597 |
+
},
|
| 11598 |
+
{
|
| 11599 |
+
"epoch": 0.43823140165101915,
|
| 11600 |
+
"grad_norm": 0.1500539779663086,
|
| 11601 |
+
"learning_rate": 0.00021009272593674322,
|
| 11602 |
+
"loss": 2.9827,
|
| 11603 |
+
"num_input_tokens_seen": 17078681600,
|
| 11604 |
+
"step": 65150
|
| 11605 |
+
},
|
| 11606 |
+
{
|
| 11607 |
+
"epoch": 0.43856772659472676,
|
| 11608 |
+
"grad_norm": 0.1571357101202011,
|
| 11609 |
+
"learning_rate": 0.00020610737385376348,
|
| 11610 |
+
"loss": 2.9788,
|
| 11611 |
+
"num_input_tokens_seen": 17091788800,
|
| 11612 |
+
"step": 65200
|
| 11613 |
+
},
|
| 11614 |
+
{
|
| 11615 |
+
"epoch": 0.43890405153843437,
|
| 11616 |
+
"grad_norm": 0.1671544760465622,
|
| 11617 |
+
"learning_rate": 0.00020215034775378332,
|
| 11618 |
+
"loss": 2.9758,
|
| 11619 |
+
"num_input_tokens_seen": 17104896000,
|
| 11620 |
+
"step": 65250
|
| 11621 |
+
},
|
| 11622 |
+
{
|
| 11623 |
+
"epoch": 0.439240376482142,
|
| 11624 |
+
"grad_norm": 0.15525776147842407,
|
| 11625 |
+
"learning_rate": 0.0001982220290232143,
|
| 11626 |
+
"loss": 2.9823,
|
| 11627 |
+
"num_input_tokens_seen": 17118003200,
|
| 11628 |
+
"step": 65300
|
| 11629 |
+
},
|
| 11630 |
+
{
|
| 11631 |
+
"epoch": 0.4395767014258496,
|
| 11632 |
+
"grad_norm": 0.14799903333187103,
|
| 11633 |
+
"learning_rate": 0.00019432279628159188,
|
| 11634 |
+
"loss": 2.9781,
|
| 11635 |
+
"num_input_tokens_seen": 17131110400,
|
| 11636 |
+
"step": 65350
|
| 11637 |
+
},
|
| 11638 |
+
{
|
| 11639 |
+
"epoch": 0.4399130263695572,
|
| 11640 |
+
"grad_norm": 0.16087676584720612,
|
| 11641 |
+
"learning_rate": 0.00019045302534508295,
|
| 11642 |
+
"loss": 2.9805,
|
| 11643 |
+
"num_input_tokens_seen": 17144217600,
|
| 11644 |
+
"step": 65400
|
| 11645 |
+
},
|
| 11646 |
+
{
|
| 11647 |
+
"epoch": 0.4402493513132648,
|
| 11648 |
+
"grad_norm": 0.15892113745212555,
|
| 11649 |
+
"learning_rate": 0.0001866130891902653,
|
| 11650 |
+
"loss": 2.9823,
|
| 11651 |
+
"num_input_tokens_seen": 17157324800,
|
| 11652 |
+
"step": 65450
|
| 11653 |
+
},
|
| 11654 |
+
{
|
| 11655 |
+
"epoch": 0.4405856762569724,
|
| 11656 |
+
"grad_norm": 0.187602236866951,
|
| 11657 |
+
"learning_rate": 0.00018280335791817732,
|
| 11658 |
+
"loss": 2.9804,
|
| 11659 |
+
"num_input_tokens_seen": 17170432000,
|
| 11660 |
+
"step": 65500
|
| 11661 |
+
},
|
| 11662 |
+
{
|
| 11663 |
+
"epoch": 0.4405856762569724,
|
| 11664 |
+
"eval_loss": 2.875824451446533,
|
| 11665 |
+
"eval_runtime": 53.0867,
|
| 11666 |
+
"eval_samples_per_second": 94.186,
|
| 11667 |
+
"eval_steps_per_second": 23.546,
|
| 11668 |
+
"num_input_tokens_seen": 17170432000,
|
| 11669 |
+
"step": 65500
|
| 11670 |
+
},
|
| 11671 |
+
{
|
| 11672 |
+
"epoch": 0.44092200120068004,
|
| 11673 |
+
"grad_norm": 0.15579210221767426,
|
| 11674 |
+
"learning_rate": 0.0001790241987186485,
|
| 11675 |
+
"loss": 2.9734,
|
| 11676 |
+
"num_input_tokens_seen": 17183539200,
|
| 11677 |
+
"step": 65550
|
| 11678 |
+
},
|
| 11679 |
+
{
|
| 11680 |
+
"epoch": 0.44125832614438765,
|
| 11681 |
+
"grad_norm": 0.15250550210475922,
|
| 11682 |
+
"learning_rate": 0.00017527597583490823,
|
| 11683 |
+
"loss": 2.9787,
|
| 11684 |
+
"num_input_tokens_seen": 17196646400,
|
| 11685 |
+
"step": 65600
|
| 11686 |
+
},
|
| 11687 |
+
{
|
| 11688 |
+
"epoch": 0.44159465108809526,
|
| 11689 |
+
"grad_norm": 0.15954890847206116,
|
| 11690 |
+
"learning_rate": 0.00017155905052847938,
|
| 11691 |
+
"loss": 2.978,
|
| 11692 |
+
"num_input_tokens_seen": 17209753600,
|
| 11693 |
+
"step": 65650
|
| 11694 |
+
},
|
| 11695 |
+
{
|
| 11696 |
+
"epoch": 0.44193097603180287,
|
| 11697 |
+
"grad_norm": 0.15598754584789276,
|
| 11698 |
+
"learning_rate": 0.00016787378104435928,
|
| 11699 |
+
"loss": 2.9809,
|
| 11700 |
+
"num_input_tokens_seen": 17222860800,
|
| 11701 |
+
"step": 65700
|
| 11702 |
+
},
|
| 11703 |
+
{
|
| 11704 |
+
"epoch": 0.4422673009755105,
|
| 11705 |
+
"grad_norm": 0.14709477126598358,
|
| 11706 |
+
"learning_rate": 0.00016422052257649078,
|
| 11707 |
+
"loss": 2.9793,
|
| 11708 |
+
"num_input_tokens_seen": 17235968000,
|
| 11709 |
+
"step": 65750
|
| 11710 |
+
},
|
| 11711 |
+
{
|
| 11712 |
+
"epoch": 0.4426036259192181,
|
| 11713 |
+
"grad_norm": 0.15505217015743256,
|
| 11714 |
+
"learning_rate": 0.0001605996272335291,
|
| 11715 |
+
"loss": 2.9763,
|
| 11716 |
+
"num_input_tokens_seen": 17249075200,
|
| 11717 |
+
"step": 65800
|
| 11718 |
+
},
|
| 11719 |
+
{
|
| 11720 |
+
"epoch": 0.4429399508629257,
|
| 11721 |
+
"grad_norm": 0.14491549134254456,
|
| 11722 |
+
"learning_rate": 0.0001570114440049037,
|
| 11723 |
+
"loss": 2.9756,
|
| 11724 |
+
"num_input_tokens_seen": 17262182400,
|
| 11725 |
+
"step": 65850
|
| 11726 |
+
},
|
| 11727 |
+
{
|
| 11728 |
+
"epoch": 0.4432762758066333,
|
| 11729 |
+
"grad_norm": 0.1571652740240097,
|
| 11730 |
+
"learning_rate": 0.00015345631872718213,
|
| 11731 |
+
"loss": 2.977,
|
| 11732 |
+
"num_input_tokens_seen": 17275289600,
|
| 11733 |
+
"step": 65900
|
| 11734 |
+
},
|
| 11735 |
+
{
|
| 11736 |
+
"epoch": 0.4436126007503409,
|
| 11737 |
+
"grad_norm": 0.18299035727977753,
|
| 11738 |
+
"learning_rate": 0.00014993459405073824,
|
| 11739 |
+
"loss": 2.9788,
|
| 11740 |
+
"num_input_tokens_seen": 17288396800,
|
| 11741 |
+
"step": 65950
|
| 11742 |
+
},
|
| 11743 |
+
{
|
| 11744 |
+
"epoch": 0.44394892569404854,
|
| 11745 |
+
"grad_norm": 0.14829285442829132,
|
| 11746 |
+
"learning_rate": 0.00014644660940672628,
|
| 11747 |
+
"loss": 2.9851,
|
| 11748 |
+
"num_input_tokens_seen": 17301504000,
|
| 11749 |
+
"step": 66000
|
| 11750 |
+
},
|
| 11751 |
+
{
|
| 11752 |
+
"epoch": 0.44394892569404854,
|
| 11753 |
+
"eval_loss": 2.8729286193847656,
|
| 11754 |
+
"eval_runtime": 53.2839,
|
| 11755 |
+
"eval_samples_per_second": 93.837,
|
| 11756 |
+
"eval_steps_per_second": 23.459,
|
| 11757 |
+
"num_input_tokens_seen": 17301504000,
|
| 11758 |
+
"step": 66000
|
| 11759 |
+
},
|
| 11760 |
+
{
|
| 11761 |
+
"epoch": 0.4442852506377562,
|
| 11762 |
+
"grad_norm": 0.14435406029224396,
|
| 11763 |
+
"learning_rate": 0.0001429927009743659,
|
| 11764 |
+
"loss": 2.9718,
|
| 11765 |
+
"num_input_tokens_seen": 17314611200,
|
| 11766 |
+
"step": 66050
|
| 11767 |
+
},
|
| 11768 |
+
{
|
| 11769 |
+
"epoch": 0.4446215755814638,
|
| 11770 |
+
"grad_norm": 0.1603071242570877,
|
| 11771 |
+
"learning_rate": 0.0001395732016485406,
|
| 11772 |
+
"loss": 2.9731,
|
| 11773 |
+
"num_input_tokens_seen": 17327718400,
|
| 11774 |
+
"step": 66100
|
| 11775 |
+
},
|
| 11776 |
+
{
|
| 11777 |
+
"epoch": 0.4449579005251714,
|
| 11778 |
+
"grad_norm": 0.14310726523399353,
|
| 11779 |
+
"learning_rate": 0.00013618844100771256,
|
| 11780 |
+
"loss": 2.9665,
|
| 11781 |
+
"num_input_tokens_seen": 17340825600,
|
| 11782 |
+
"step": 66150
|
| 11783 |
+
},
|
| 11784 |
+
{
|
| 11785 |
+
"epoch": 0.44529422546887903,
|
| 11786 |
+
"grad_norm": 0.276594340801239,
|
| 11787 |
+
"learning_rate": 0.00013283874528215734,
|
| 11788 |
+
"loss": 2.9711,
|
| 11789 |
+
"num_input_tokens_seen": 17353932800,
|
| 11790 |
+
"step": 66200
|
| 11791 |
+
},
|
| 11792 |
+
{
|
| 11793 |
+
"epoch": 0.44563055041258665,
|
| 11794 |
+
"grad_norm": 0.1535540074110031,
|
| 11795 |
+
"learning_rate": 0.00012952443732252057,
|
| 11796 |
+
"loss": 2.9693,
|
| 11797 |
+
"num_input_tokens_seen": 17367040000,
|
| 11798 |
+
"step": 66250
|
| 11799 |
+
},
|
| 11800 |
+
{
|
| 11801 |
+
"epoch": 0.44596687535629426,
|
| 11802 |
+
"grad_norm": 0.15807458758354187,
|
| 11803 |
+
"learning_rate": 0.00012624583656870153,
|
| 11804 |
+
"loss": 2.9754,
|
| 11805 |
+
"num_input_tokens_seen": 17380147200,
|
| 11806 |
+
"step": 66300
|
| 11807 |
+
},
|
| 11808 |
+
{
|
| 11809 |
+
"epoch": 0.44630320030000187,
|
| 11810 |
+
"grad_norm": 0.14477893710136414,
|
| 11811 |
+
"learning_rate": 0.00012300325901906528,
|
| 11812 |
+
"loss": 2.9735,
|
| 11813 |
+
"num_input_tokens_seen": 17393254400,
|
| 11814 |
+
"step": 66350
|
| 11815 |
+
},
|
| 11816 |
+
{
|
| 11817 |
+
"epoch": 0.4466395252437095,
|
| 11818 |
+
"grad_norm": 0.14505073428153992,
|
| 11819 |
+
"learning_rate": 0.00011979701719998454,
|
| 11820 |
+
"loss": 2.9783,
|
| 11821 |
+
"num_input_tokens_seen": 17406361600,
|
| 11822 |
+
"step": 66400
|
| 11823 |
+
},
|
| 11824 |
+
{
|
| 11825 |
+
"epoch": 0.4469758501874171,
|
| 11826 |
+
"grad_norm": 0.15850161015987396,
|
| 11827 |
+
"learning_rate": 0.00011662742013571926,
|
| 11828 |
+
"loss": 2.967,
|
| 11829 |
+
"num_input_tokens_seen": 17419468800,
|
| 11830 |
+
"step": 66450
|
| 11831 |
+
},
|
| 11832 |
+
{
|
| 11833 |
+
"epoch": 0.4473121751311247,
|
| 11834 |
+
"grad_norm": 0.14653578400611877,
|
| 11835 |
+
"learning_rate": 0.00011349477331863151,
|
| 11836 |
+
"loss": 2.9651,
|
| 11837 |
+
"num_input_tokens_seen": 17432576000,
|
| 11838 |
+
"step": 66500
|
| 11839 |
+
},
|
| 11840 |
+
{
|
| 11841 |
+
"epoch": 0.4473121751311247,
|
| 11842 |
+
"eval_loss": 2.8710148334503174,
|
| 11843 |
+
"eval_runtime": 53.2889,
|
| 11844 |
+
"eval_samples_per_second": 93.828,
|
| 11845 |
+
"eval_steps_per_second": 23.457,
|
| 11846 |
+
"num_input_tokens_seen": 17432576000,
|
| 11847 |
+
"step": 66500
|
| 11848 |
+
},
|
| 11849 |
+
{
|
| 11850 |
+
"epoch": 0.4476485000748323,
|
| 11851 |
+
"grad_norm": 0.15636616945266724,
|
| 11852 |
+
"learning_rate": 0.00011039937867974164,
|
| 11853 |
+
"loss": 2.9758,
|
| 11854 |
+
"num_input_tokens_seen": 17445683200,
|
| 11855 |
+
"step": 66550
|
| 11856 |
+
},
|
| 11857 |
+
{
|
| 11858 |
+
"epoch": 0.4479848250185399,
|
| 11859 |
+
"grad_norm": 0.14427579939365387,
|
| 11860 |
+
"learning_rate": 0.00010734153455962764,
|
| 11861 |
+
"loss": 2.9594,
|
| 11862 |
+
"num_input_tokens_seen": 17458790400,
|
| 11863 |
+
"step": 66600
|
| 11864 |
+
},
|
| 11865 |
+
{
|
| 11866 |
+
"epoch": 0.44832114996224753,
|
| 11867 |
+
"grad_norm": 0.15148353576660156,
|
| 11868 |
+
"learning_rate": 0.00010432153567966984,
|
| 11869 |
+
"loss": 2.9684,
|
| 11870 |
+
"num_input_tokens_seen": 17471897600,
|
| 11871 |
+
"step": 66650
|
| 11872 |
+
},
|
| 11873 |
+
{
|
| 11874 |
+
"epoch": 0.44865747490595514,
|
| 11875 |
+
"grad_norm": 0.1541094332933426,
|
| 11876 |
+
"learning_rate": 0.0001013396731136465,
|
| 11877 |
+
"loss": 2.9685,
|
| 11878 |
+
"num_input_tokens_seen": 17485004800,
|
| 11879 |
+
"step": 66700
|
| 11880 |
+
},
|
| 11881 |
+
{
|
| 11882 |
+
"epoch": 0.44899379984966276,
|
| 11883 |
+
"grad_norm": 0.14267295598983765,
|
| 11884 |
+
"learning_rate": 9.839623425967759e-05,
|
| 11885 |
+
"loss": 2.9728,
|
| 11886 |
+
"num_input_tokens_seen": 17498112000,
|
| 11887 |
+
"step": 66750
|
| 11888 |
+
},
|
| 11889 |
+
{
|
| 11890 |
+
"epoch": 0.44933012479337037,
|
| 11891 |
+
"grad_norm": 0.1437918245792389,
|
| 11892 |
+
"learning_rate": 9.549150281252633e-05,
|
| 11893 |
+
"loss": 2.9752,
|
| 11894 |
+
"num_input_tokens_seen": 17511219200,
|
| 11895 |
+
"step": 66800
|
| 11896 |
+
},
|
| 11897 |
+
{
|
| 11898 |
+
"epoch": 0.449666449737078,
|
| 11899 |
+
"grad_norm": 0.1517232209444046,
|
| 11900 |
+
"learning_rate": 9.262575873625529e-05,
|
| 11901 |
+
"loss": 2.9729,
|
| 11902 |
+
"num_input_tokens_seen": 17524326400,
|
| 11903 |
+
"step": 66850
|
| 11904 |
+
},
|
| 11905 |
+
{
|
| 11906 |
+
"epoch": 0.4500027746807856,
|
| 11907 |
+
"grad_norm": 0.15286608040332794,
|
| 11908 |
+
"learning_rate": 8.979927823724321e-05,
|
| 11909 |
+
"loss": 2.9687,
|
| 11910 |
+
"num_input_tokens_seen": 17537433600,
|
| 11911 |
+
"step": 66900
|
| 11912 |
+
},
|
| 11913 |
+
{
|
| 11914 |
+
"epoch": 0.4503390996244932,
|
| 11915 |
+
"grad_norm": 0.14875057339668274,
|
| 11916 |
+
"learning_rate": 8.70123337375635e-05,
|
| 11917 |
+
"loss": 2.9758,
|
| 11918 |
+
"num_input_tokens_seen": 17550540800,
|
| 11919 |
+
"step": 66950
|
| 11920 |
+
},
|
| 11921 |
+
{
|
| 11922 |
+
"epoch": 0.4506754245682008,
|
| 11923 |
+
"grad_norm": 0.1493612825870514,
|
| 11924 |
+
"learning_rate": 8.426519384872733e-05,
|
| 11925 |
+
"loss": 2.9704,
|
| 11926 |
+
"num_input_tokens_seen": 17563648000,
|
| 11927 |
+
"step": 67000
|
| 11928 |
+
},
|
| 11929 |
+
{
|
| 11930 |
+
"epoch": 0.4506754245682008,
|
| 11931 |
+
"eval_loss": 2.869231939315796,
|
| 11932 |
+
"eval_runtime": 53.2491,
|
| 11933 |
+
"eval_samples_per_second": 93.898,
|
| 11934 |
+
"eval_steps_per_second": 23.475,
|
| 11935 |
+
"num_input_tokens_seen": 17563648000,
|
| 11936 |
+
"step": 67000
|
| 11937 |
+
},
|
| 11938 |
+
{
|
| 11939 |
+
"epoch": 0.4510117495119084,
|
| 11940 |
+
"grad_norm": 0.14675357937812805,
|
| 11941 |
+
"learning_rate": 8.155812334579532e-05,
|
| 11942 |
+
"loss": 2.9682,
|
| 11943 |
+
"num_input_tokens_seen": 17576755200,
|
| 11944 |
+
"step": 67050
|
| 11945 |
+
},
|
| 11946 |
+
{
|
| 11947 |
+
"epoch": 0.45134807445561603,
|
| 11948 |
+
"grad_norm": 0.14341385662555695,
|
| 11949 |
+
"learning_rate": 7.889138314185678e-05,
|
| 11950 |
+
"loss": 2.9749,
|
| 11951 |
+
"num_input_tokens_seen": 17589862400,
|
| 11952 |
+
"step": 67100
|
| 11953 |
+
},
|
| 11954 |
+
{
|
| 11955 |
+
"epoch": 0.45168439939932364,
|
| 11956 |
+
"grad_norm": 0.1442009061574936,
|
| 11957 |
+
"learning_rate": 7.626523026288279e-05,
|
| 11958 |
+
"loss": 2.9637,
|
| 11959 |
+
"num_input_tokens_seen": 17602969600,
|
| 11960 |
+
"step": 67150
|
| 11961 |
+
},
|
| 11962 |
+
{
|
| 11963 |
+
"epoch": 0.45202072434303125,
|
| 11964 |
+
"grad_norm": 0.14580078423023224,
|
| 11965 |
+
"learning_rate": 7.367991782295391e-05,
|
| 11966 |
+
"loss": 2.9636,
|
| 11967 |
+
"num_input_tokens_seen": 17616076800,
|
| 11968 |
+
"step": 67200
|
| 11969 |
+
},
|
| 11970 |
+
{
|
| 11971 |
+
"epoch": 0.45235704928673887,
|
| 11972 |
+
"grad_norm": 0.13888555765151978,
|
| 11973 |
+
"learning_rate": 7.1135694999864e-05,
|
| 11974 |
+
"loss": 2.9737,
|
| 11975 |
+
"num_input_tokens_seen": 17629184000,
|
| 11976 |
+
"step": 67250
|
| 11977 |
+
},
|
| 11978 |
+
{
|
| 11979 |
+
"epoch": 0.4526933742304465,
|
| 11980 |
+
"grad_norm": 0.14820803701877594,
|
| 11981 |
+
"learning_rate": 6.863280701110408e-05,
|
| 11982 |
+
"loss": 2.9778,
|
| 11983 |
+
"num_input_tokens_seen": 17642291200,
|
| 11984 |
+
"step": 67300
|
| 11985 |
+
},
|
| 11986 |
+
{
|
| 11987 |
+
"epoch": 0.4530296991741541,
|
| 11988 |
+
"grad_norm": 0.14933691918849945,
|
| 11989 |
+
"learning_rate": 6.617149509022808e-05,
|
| 11990 |
+
"loss": 2.9667,
|
| 11991 |
+
"num_input_tokens_seen": 17655398400,
|
| 11992 |
+
"step": 67350
|
| 11993 |
+
},
|
| 11994 |
+
{
|
| 11995 |
+
"epoch": 0.4533660241178617,
|
| 11996 |
+
"grad_norm": 0.14829853177070618,
|
| 11997 |
+
"learning_rate": 6.375199646360142e-05,
|
| 11998 |
+
"loss": 2.9691,
|
| 11999 |
+
"num_input_tokens_seen": 17668505600,
|
| 12000 |
+
"step": 67400
|
| 12001 |
+
},
|
| 12002 |
+
{
|
| 12003 |
+
"epoch": 0.4537023490615693,
|
| 12004 |
+
"grad_norm": 0.14731477200984955,
|
| 12005 |
+
"learning_rate": 6.137454432753797e-05,
|
| 12006 |
+
"loss": 2.9731,
|
| 12007 |
+
"num_input_tokens_seen": 17681612800,
|
| 12008 |
+
"step": 67450
|
| 12009 |
+
},
|
| 12010 |
+
{
|
| 12011 |
+
"epoch": 0.4540386740052769,
|
| 12012 |
+
"grad_norm": 0.14357906579971313,
|
| 12013 |
+
"learning_rate": 5.903936782582253e-05,
|
| 12014 |
+
"loss": 2.9785,
|
| 12015 |
+
"num_input_tokens_seen": 17694720000,
|
| 12016 |
+
"step": 67500
|
| 12017 |
+
},
|
| 12018 |
+
{
|
| 12019 |
+
"epoch": 0.4540386740052769,
|
| 12020 |
+
"eval_loss": 2.867840528488159,
|
| 12021 |
+
"eval_runtime": 53.8197,
|
| 12022 |
+
"eval_samples_per_second": 92.903,
|
| 12023 |
+
"eval_steps_per_second": 23.226,
|
| 12024 |
+
"num_input_tokens_seen": 17694720000,
|
| 12025 |
+
"step": 67500
|
| 12026 |
+
},
|
| 12027 |
+
{
|
| 12028 |
+
"epoch": 0.45437499894898453,
|
| 12029 |
+
"grad_norm": 0.1438903659582138,
|
| 12030 |
+
"learning_rate": 5.6746692027626835e-05,
|
| 12031 |
+
"loss": 2.9733,
|
| 12032 |
+
"num_input_tokens_seen": 17707827200,
|
| 12033 |
+
"step": 67550
|
| 12034 |
+
},
|
| 12035 |
+
{
|
| 12036 |
+
"epoch": 0.45471132389269214,
|
| 12037 |
+
"grad_norm": 0.14171506464481354,
|
| 12038 |
+
"learning_rate": 5.449673790581611e-05,
|
| 12039 |
+
"loss": 2.9637,
|
| 12040 |
+
"num_input_tokens_seen": 17720934400,
|
| 12041 |
+
"step": 67600
|
| 12042 |
+
},
|
| 12043 |
+
{
|
| 12044 |
+
"epoch": 0.45504764883639975,
|
| 12045 |
+
"grad_norm": 0.1645549088716507,
|
| 12046 |
+
"learning_rate": 5.2289722315651546e-05,
|
| 12047 |
+
"loss": 2.9668,
|
| 12048 |
+
"num_input_tokens_seen": 17734041600,
|
| 12049 |
+
"step": 67650
|
| 12050 |
+
},
|
| 12051 |
+
{
|
| 12052 |
+
"epoch": 0.45538397378010737,
|
| 12053 |
+
"grad_norm": 0.1390199065208435,
|
| 12054 |
+
"learning_rate": 5.0125857973889355e-05,
|
| 12055 |
+
"loss": 2.9762,
|
| 12056 |
+
"num_input_tokens_seen": 17747148800,
|
| 12057 |
+
"step": 67700
|
| 12058 |
+
},
|
| 12059 |
+
{
|
| 12060 |
+
"epoch": 0.455720298723815,
|
| 12061 |
+
"grad_norm": 0.14667369425296783,
|
| 12062 |
+
"learning_rate": 4.800535343827833e-05,
|
| 12063 |
+
"loss": 2.9724,
|
| 12064 |
+
"num_input_tokens_seen": 17760256000,
|
| 12065 |
+
"step": 67750
|
| 12066 |
+
},
|
| 12067 |
+
{
|
| 12068 |
+
"epoch": 0.4560566236675226,
|
| 12069 |
+
"grad_norm": 0.14203302562236786,
|
| 12070 |
+
"learning_rate": 4.592841308745932e-05,
|
| 12071 |
+
"loss": 2.9679,
|
| 12072 |
+
"num_input_tokens_seen": 17773363200,
|
| 12073 |
+
"step": 67800
|
| 12074 |
+
},
|
| 12075 |
+
{
|
| 12076 |
+
"epoch": 0.45639294861123025,
|
| 12077 |
+
"grad_norm": 0.1517883837223053,
|
| 12078 |
+
"learning_rate": 4.389523710126619e-05,
|
| 12079 |
+
"loss": 2.9723,
|
| 12080 |
+
"num_input_tokens_seen": 17786470400,
|
| 12081 |
+
"step": 67850
|
| 12082 |
+
},
|
| 12083 |
+
{
|
| 12084 |
+
"epoch": 0.45672927355493786,
|
| 12085 |
+
"grad_norm": 0.1438019722700119,
|
| 12086 |
+
"learning_rate": 4.190602144143207e-05,
|
| 12087 |
+
"loss": 2.973,
|
| 12088 |
+
"num_input_tokens_seen": 17799577600,
|
| 12089 |
+
"step": 67900
|
| 12090 |
+
},
|
| 12091 |
+
{
|
| 12092 |
+
"epoch": 0.4570655984986455,
|
| 12093 |
+
"grad_norm": 0.14281606674194336,
|
| 12094 |
+
"learning_rate": 3.9960957832702595e-05,
|
| 12095 |
+
"loss": 2.9733,
|
| 12096 |
+
"num_input_tokens_seen": 17812684800,
|
| 12097 |
+
"step": 67950
|
| 12098 |
+
},
|
| 12099 |
+
{
|
| 12100 |
+
"epoch": 0.4574019234423531,
|
| 12101 |
+
"grad_norm": 0.14911025762557983,
|
| 12102 |
+
"learning_rate": 3.806023374435663e-05,
|
| 12103 |
+
"loss": 2.9724,
|
| 12104 |
+
"num_input_tokens_seen": 17825792000,
|
| 12105 |
+
"step": 68000
|
| 12106 |
+
},
|
| 12107 |
+
{
|
| 12108 |
+
"epoch": 0.4574019234423531,
|
| 12109 |
+
"eval_loss": 2.8663442134857178,
|
| 12110 |
+
"eval_runtime": 53.8853,
|
| 12111 |
+
"eval_samples_per_second": 92.79,
|
| 12112 |
+
"eval_steps_per_second": 23.197,
|
| 12113 |
+
"num_input_tokens_seen": 17825792000,
|
| 12114 |
+
"step": 68000
|
| 12115 |
+
},
|
| 12116 |
+
{
|
| 12117 |
+
"epoch": 0.4577382483860607,
|
| 12118 |
+
"grad_norm": 0.14517797529697418,
|
| 12119 |
+
"learning_rate": 3.6204032372137984e-05,
|
| 12120 |
+
"loss": 2.9674,
|
| 12121 |
+
"num_input_tokens_seen": 17838899200,
|
| 12122 |
+
"step": 68050
|
| 12123 |
+
},
|
| 12124 |
+
{
|
| 12125 |
+
"epoch": 0.4580745733297683,
|
| 12126 |
+
"grad_norm": 0.14154207706451416,
|
| 12127 |
+
"learning_rate": 3.439253262059822e-05,
|
| 12128 |
+
"loss": 2.9627,
|
| 12129 |
+
"num_input_tokens_seen": 17852006400,
|
| 12130 |
+
"step": 68100
|
| 12131 |
+
},
|
| 12132 |
+
{
|
| 12133 |
+
"epoch": 0.4584108982734759,
|
| 12134 |
+
"grad_norm": 0.14251314103603363,
|
| 12135 |
+
"learning_rate": 3.2625909085853776e-05,
|
| 12136 |
+
"loss": 2.9681,
|
| 12137 |
+
"num_input_tokens_seen": 17865113600,
|
| 12138 |
+
"step": 68150
|
| 12139 |
+
},
|
| 12140 |
+
{
|
| 12141 |
+
"epoch": 0.45874722321718353,
|
| 12142 |
+
"grad_norm": 0.15670983493328094,
|
| 12143 |
+
"learning_rate": 3.0904332038757974e-05,
|
| 12144 |
+
"loss": 2.9708,
|
| 12145 |
+
"num_input_tokens_seen": 17878220800,
|
| 12146 |
+
"step": 68200
|
| 12147 |
+
},
|
| 12148 |
+
{
|
| 12149 |
+
"epoch": 0.45908354816089114,
|
| 12150 |
+
"grad_norm": 0.1453925371170044,
|
| 12151 |
+
"learning_rate": 2.9227967408489654e-05,
|
| 12152 |
+
"loss": 2.9686,
|
| 12153 |
+
"num_input_tokens_seen": 17891328000,
|
| 12154 |
+
"step": 68250
|
| 12155 |
+
},
|
| 12156 |
+
{
|
| 12157 |
+
"epoch": 0.45941987310459875,
|
| 12158 |
+
"grad_norm": 0.13307476043701172,
|
| 12159 |
+
"learning_rate": 2.7596976766560976e-05,
|
| 12160 |
+
"loss": 2.9595,
|
| 12161 |
+
"num_input_tokens_seen": 17904435200,
|
| 12162 |
+
"step": 68300
|
| 12163 |
+
},
|
| 12164 |
+
{
|
| 12165 |
+
"epoch": 0.45975619804830636,
|
| 12166 |
+
"grad_norm": 0.14958307147026062,
|
| 12167 |
+
"learning_rate": 2.6011517311244848e-05,
|
| 12168 |
+
"loss": 2.9661,
|
| 12169 |
+
"num_input_tokens_seen": 17917542400,
|
| 12170 |
+
"step": 68350
|
| 12171 |
+
},
|
| 12172 |
+
{
|
| 12173 |
+
"epoch": 0.460092522992014,
|
| 12174 |
+
"grad_norm": 0.14210085570812225,
|
| 12175 |
+
"learning_rate": 2.4471741852423235e-05,
|
| 12176 |
+
"loss": 2.9737,
|
| 12177 |
+
"num_input_tokens_seen": 17930649600,
|
| 12178 |
+
"step": 68400
|
| 12179 |
+
},
|
| 12180 |
+
{
|
| 12181 |
+
"epoch": 0.4604288479357216,
|
| 12182 |
+
"grad_norm": 0.15127155184745789,
|
| 12183 |
+
"learning_rate": 2.2977798796859794e-05,
|
| 12184 |
+
"loss": 2.9627,
|
| 12185 |
+
"num_input_tokens_seen": 17943756800,
|
| 12186 |
+
"step": 68450
|
| 12187 |
+
},
|
| 12188 |
+
{
|
| 12189 |
+
"epoch": 0.4607651728794292,
|
| 12190 |
+
"grad_norm": 0.14184921979904175,
|
| 12191 |
+
"learning_rate": 2.152983213389559e-05,
|
| 12192 |
+
"loss": 2.9732,
|
| 12193 |
+
"num_input_tokens_seen": 17956864000,
|
| 12194 |
+
"step": 68500
|
| 12195 |
+
},
|
| 12196 |
+
{
|
| 12197 |
+
"epoch": 0.4607651728794292,
|
| 12198 |
+
"eval_loss": 2.865307331085205,
|
| 12199 |
+
"eval_runtime": 53.2908,
|
| 12200 |
+
"eval_samples_per_second": 93.825,
|
| 12201 |
+
"eval_steps_per_second": 23.456,
|
| 12202 |
+
"num_input_tokens_seen": 17956864000,
|
| 12203 |
+
"step": 68500
|
| 12204 |
+
},
|
| 12205 |
+
{
|
| 12206 |
+
"epoch": 0.4611014978231368,
|
| 12207 |
+
"grad_norm": 0.14755961298942566,
|
| 12208 |
+
"learning_rate": 2.0127981421571295e-05,
|
| 12209 |
+
"loss": 2.9687,
|
| 12210 |
+
"num_input_tokens_seen": 17969971200,
|
| 12211 |
+
"step": 68550
|
| 12212 |
+
},
|
| 12213 |
+
{
|
| 12214 |
+
"epoch": 0.4614378227668444,
|
| 12215 |
+
"grad_norm": 0.1370965540409088,
|
| 12216 |
+
"learning_rate": 1.8772381773176416e-05,
|
| 12217 |
+
"loss": 2.9711,
|
| 12218 |
+
"num_input_tokens_seen": 17983078400,
|
| 12219 |
+
"step": 68600
|
| 12220 |
+
},
|
| 12221 |
+
{
|
| 12222 |
+
"epoch": 0.46177414771055203,
|
| 12223 |
+
"grad_norm": 0.14454130828380585,
|
| 12224 |
+
"learning_rate": 1.7463163844226305e-05,
|
| 12225 |
+
"loss": 2.9633,
|
| 12226 |
+
"num_input_tokens_seen": 17996185600,
|
| 12227 |
+
"step": 68650
|
| 12228 |
+
},
|
| 12229 |
+
{
|
| 12230 |
+
"epoch": 0.46211047265425964,
|
| 12231 |
+
"grad_norm": 0.13908445835113525,
|
| 12232 |
+
"learning_rate": 1.620045381987012e-05,
|
| 12233 |
+
"loss": 2.9662,
|
| 12234 |
+
"num_input_tokens_seen": 18009292800,
|
| 12235 |
+
"step": 68700
|
| 12236 |
+
},
|
| 12237 |
+
{
|
| 12238 |
+
"epoch": 0.46244679759796725,
|
| 12239 |
+
"grad_norm": 0.2359876185655594,
|
| 12240 |
+
"learning_rate": 1.4984373402728013e-05,
|
| 12241 |
+
"loss": 2.9671,
|
| 12242 |
+
"num_input_tokens_seen": 18022400000,
|
| 12243 |
+
"step": 68750
|
| 12244 |
+
},
|
| 12245 |
+
{
|
| 12246 |
+
"epoch": 0.46278312254167486,
|
| 12247 |
+
"grad_norm": 0.13809122145175934,
|
| 12248 |
+
"learning_rate": 1.3815039801161721e-05,
|
| 12249 |
+
"loss": 2.9684,
|
| 12250 |
+
"num_input_tokens_seen": 18035507200,
|
| 12251 |
+
"step": 68800
|
| 12252 |
+
},
|
| 12253 |
+
{
|
| 12254 |
+
"epoch": 0.4631194474853825,
|
| 12255 |
+
"grad_norm": 0.14375115931034088,
|
| 12256 |
+
"learning_rate": 1.26925657179775e-05,
|
| 12257 |
+
"loss": 2.9677,
|
| 12258 |
+
"num_input_tokens_seen": 18048614400,
|
| 12259 |
+
"step": 68850
|
| 12260 |
+
},
|
| 12261 |
+
{
|
| 12262 |
+
"epoch": 0.4634557724290901,
|
| 12263 |
+
"grad_norm": 0.14648525416851044,
|
| 12264 |
+
"learning_rate": 1.1617059339563806e-05,
|
| 12265 |
+
"loss": 2.9625,
|
| 12266 |
+
"num_input_tokens_seen": 18061721600,
|
| 12267 |
+
"step": 68900
|
| 12268 |
+
},
|
| 12269 |
+
{
|
| 12270 |
+
"epoch": 0.4637920973727977,
|
| 12271 |
+
"grad_norm": 0.1428016871213913,
|
| 12272 |
+
"learning_rate": 1.058862432546387e-05,
|
| 12273 |
+
"loss": 2.9717,
|
| 12274 |
+
"num_input_tokens_seen": 18074828800,
|
| 12275 |
+
"step": 68950
|
| 12276 |
+
},
|
| 12277 |
+
{
|
| 12278 |
+
"epoch": 0.4641284223165053,
|
| 12279 |
+
"grad_norm": 0.14518927037715912,
|
| 12280 |
+
"learning_rate": 9.607359798384786e-06,
|
| 12281 |
+
"loss": 2.9622,
|
| 12282 |
+
"num_input_tokens_seen": 18087936000,
|
| 12283 |
+
"step": 69000
|
| 12284 |
+
},
|
| 12285 |
+
{
|
| 12286 |
+
"epoch": 0.4641284223165053,
|
| 12287 |
+
"eval_loss": 2.8647797107696533,
|
| 12288 |
+
"eval_runtime": 53.1259,
|
| 12289 |
+
"eval_samples_per_second": 94.116,
|
| 12290 |
+
"eval_steps_per_second": 23.529,
|
| 12291 |
+
"num_input_tokens_seen": 18087936000,
|
| 12292 |
+
"step": 69000
|
| 12293 |
+
},
|
| 12294 |
+
{
|
| 12295 |
+
"epoch": 0.4644647472602129,
|
| 12296 |
+
"grad_norm": 0.1424110382795334,
|
| 12297 |
+
"learning_rate": 8.67336033464411e-06,
|
| 12298 |
+
"loss": 2.9591,
|
| 12299 |
+
"num_input_tokens_seen": 18101043200,
|
| 12300 |
+
"step": 69050
|
| 12301 |
+
},
|
| 12302 |
+
{
|
| 12303 |
+
"epoch": 0.46480107220392053,
|
| 12304 |
+
"grad_norm": 0.14686723053455353,
|
| 12305 |
+
"learning_rate": 7.786715955054202e-06,
|
| 12306 |
+
"loss": 2.9561,
|
| 12307 |
+
"num_input_tokens_seen": 18114150400,
|
| 12308 |
+
"step": 69100
|
| 12309 |
+
},
|
| 12310 |
+
{
|
| 12311 |
+
"epoch": 0.46513739714762814,
|
| 12312 |
+
"grad_norm": 0.13719068467617035,
|
| 12313 |
+
"learning_rate": 6.947512116245669e-06,
|
| 12314 |
+
"loss": 2.9629,
|
| 12315 |
+
"num_input_tokens_seen": 18127257600,
|
| 12316 |
+
"step": 69150
|
| 12317 |
+
},
|
| 12318 |
+
{
|
| 12319 |
+
"epoch": 0.46547372209133575,
|
| 12320 |
+
"grad_norm": 0.14337210357189178,
|
| 12321 |
+
"learning_rate": 6.15582970243117e-06,
|
| 12322 |
+
"loss": 2.9713,
|
| 12323 |
+
"num_input_tokens_seen": 18140364800,
|
| 12324 |
+
"step": 69200
|
| 12325 |
+
},
|
| 12326 |
+
{
|
| 12327 |
+
"epoch": 0.46581004703504336,
|
| 12328 |
+
"grad_norm": 0.18305008113384247,
|
| 12329 |
+
"learning_rate": 5.411745017609493e-06,
|
| 12330 |
+
"loss": 2.9659,
|
| 12331 |
+
"num_input_tokens_seen": 18153472000,
|
| 12332 |
+
"step": 69250
|
| 12333 |
+
},
|
| 12334 |
+
{
|
| 12335 |
+
"epoch": 0.466146371978751,
|
| 12336 |
+
"grad_norm": 0.137322798371315,
|
| 12337 |
+
"learning_rate": 4.715329778211374e-06,
|
| 12338 |
+
"loss": 2.9678,
|
| 12339 |
+
"num_input_tokens_seen": 18166579200,
|
| 12340 |
+
"step": 69300
|
| 12341 |
+
},
|
| 12342 |
+
{
|
| 12343 |
+
"epoch": 0.4664826969224586,
|
| 12344 |
+
"grad_norm": 0.13300293684005737,
|
| 12345 |
+
"learning_rate": 4.066651106186981e-06,
|
| 12346 |
+
"loss": 2.9647,
|
| 12347 |
+
"num_input_tokens_seen": 18179686400,
|
| 12348 |
+
"step": 69350
|
| 12349 |
+
},
|
| 12350 |
+
{
|
| 12351 |
+
"epoch": 0.4668190218661662,
|
| 12352 |
+
"grad_norm": 0.13357709348201752,
|
| 12353 |
+
"learning_rate": 3.4657715225368535e-06,
|
| 12354 |
+
"loss": 2.965,
|
| 12355 |
+
"num_input_tokens_seen": 18192793600,
|
| 12356 |
+
"step": 69400
|
| 12357 |
+
},
|
| 12358 |
+
{
|
| 12359 |
+
"epoch": 0.4671553468098738,
|
| 12360 |
+
"grad_norm": 0.13399702310562134,
|
| 12361 |
+
"learning_rate": 2.9127489412859033e-06,
|
| 12362 |
+
"loss": 2.9614,
|
| 12363 |
+
"num_input_tokens_seen": 18205900800,
|
| 12364 |
+
"step": 69450
|
| 12365 |
+
},
|
| 12366 |
+
{
|
| 12367 |
+
"epoch": 0.4674916717535814,
|
| 12368 |
+
"grad_norm": 0.13703274726867676,
|
| 12369 |
+
"learning_rate": 2.4076366639015913e-06,
|
| 12370 |
+
"loss": 2.964,
|
| 12371 |
+
"num_input_tokens_seen": 18219008000,
|
| 12372 |
+
"step": 69500
|
| 12373 |
+
},
|
| 12374 |
+
{
|
| 12375 |
+
"epoch": 0.4674916717535814,
|
| 12376 |
+
"eval_loss": 2.8645894527435303,
|
| 12377 |
+
"eval_runtime": 53.3524,
|
| 12378 |
+
"eval_samples_per_second": 93.716,
|
| 12379 |
+
"eval_steps_per_second": 23.429,
|
| 12380 |
+
"num_input_tokens_seen": 18219008000,
|
| 12381 |
+
"step": 69500
|
| 12382 |
+
},
|
| 12383 |
+
{
|
| 12384 |
+
"epoch": 0.46782799669728903,
|
| 12385 |
+
"grad_norm": 0.3837803900241852,
|
| 12386 |
+
"learning_rate": 1.950483374156431e-06,
|
| 12387 |
+
"loss": 2.9665,
|
| 12388 |
+
"num_input_tokens_seen": 18232115200,
|
| 12389 |
+
"step": 69550
|
| 12390 |
+
},
|
| 12391 |
+
{
|
| 12392 |
+
"epoch": 0.46816432164099664,
|
| 12393 |
+
"grad_norm": 0.13585589826107025,
|
| 12394 |
+
"learning_rate": 1.541333133436018e-06,
|
| 12395 |
+
"loss": 2.9579,
|
| 12396 |
+
"num_input_tokens_seen": 18245222400,
|
| 12397 |
+
"step": 69600
|
| 12398 |
+
},
|
| 12399 |
+
{
|
| 12400 |
+
"epoch": 0.4685006465847043,
|
| 12401 |
+
"grad_norm": 0.13347585499286652,
|
| 12402 |
+
"learning_rate": 1.18022537649215e-06,
|
| 12403 |
+
"loss": 2.9636,
|
| 12404 |
+
"num_input_tokens_seen": 18258329600,
|
| 12405 |
+
"step": 69650
|
| 12406 |
+
},
|
| 12407 |
+
{
|
| 12408 |
+
"epoch": 0.4688369715284119,
|
| 12409 |
+
"grad_norm": 0.13726544380187988,
|
| 12410 |
+
"learning_rate": 8.671949076420882e-07,
|
| 12411 |
+
"loss": 2.9626,
|
| 12412 |
+
"num_input_tokens_seen": 18271436800,
|
| 12413 |
+
"step": 69700
|
| 12414 |
+
},
|
| 12415 |
+
{
|
| 12416 |
+
"epoch": 0.4691732964721195,
|
| 12417 |
+
"grad_norm": 0.14254987239837646,
|
| 12418 |
+
"learning_rate": 6.022718974137975e-07,
|
| 12419 |
+
"loss": 2.9698,
|
| 12420 |
+
"num_input_tokens_seen": 18284544000,
|
| 12421 |
+
"step": 69750
|
| 12422 |
+
},
|
| 12423 |
+
{
|
| 12424 |
+
"epoch": 0.46950962141582714,
|
| 12425 |
+
"grad_norm": 0.1329219937324524,
|
| 12426 |
+
"learning_rate": 3.854818796385495e-07,
|
| 12427 |
+
"loss": 2.96,
|
| 12428 |
+
"num_input_tokens_seen": 18297651200,
|
| 12429 |
+
"step": 69800
|
| 12430 |
+
},
|
| 12431 |
+
{
|
| 12432 |
+
"epoch": 0.46984594635953475,
|
| 12433 |
+
"grad_norm": 0.1384582668542862,
|
| 12434 |
+
"learning_rate": 2.1684574898939157e-07,
|
| 12435 |
+
"loss": 2.9693,
|
| 12436 |
+
"num_input_tokens_seen": 18310758400,
|
| 12437 |
+
"step": 69850
|
| 12438 |
+
},
|
| 12439 |
+
{
|
| 12440 |
+
"epoch": 0.47018227130324236,
|
| 12441 |
+
"grad_norm": 0.14365264773368835,
|
| 12442 |
+
"learning_rate": 9.637975896759077e-08,
|
| 12443 |
+
"loss": 2.9686,
|
| 12444 |
+
"num_input_tokens_seen": 18323865600,
|
| 12445 |
+
"step": 69900
|
| 12446 |
+
},
|
| 12447 |
+
{
|
| 12448 |
+
"epoch": 0.47051859624694997,
|
| 12449 |
+
"grad_norm": 0.13613733649253845,
|
| 12450 |
+
"learning_rate": 2.4095520335998265e-08,
|
| 12451 |
+
"loss": 2.9607,
|
| 12452 |
+
"num_input_tokens_seen": 18336972800,
|
| 12453 |
+
"step": 69950
|
| 12454 |
+
},
|
| 12455 |
+
{
|
| 12456 |
+
"epoch": 0.4708549211906576,
|
| 12457 |
+
"grad_norm": 0.14377959072589874,
|
| 12458 |
+
"learning_rate": 0.0,
|
| 12459 |
+
"loss": 2.9684,
|
| 12460 |
+
"num_input_tokens_seen": 18350080000,
|
| 12461 |
+
"step": 70000
|
| 12462 |
+
},
|
| 12463 |
+
{
|
| 12464 |
+
"epoch": 0.4708549211906576,
|
| 12465 |
+
"eval_loss": 2.8644959926605225,
|
| 12466 |
+
"eval_runtime": 54.0337,
|
| 12467 |
+
"eval_samples_per_second": 92.535,
|
| 12468 |
+
"eval_steps_per_second": 23.134,
|
| 12469 |
+
"num_input_tokens_seen": 18350080000,
|
| 12470 |
+
"step": 70000
|
| 12471 |
+
},
|
| 12472 |
+
{
|
| 12473 |
+
"epoch": 0.4708549211906576,
|
| 12474 |
+
"num_input_tokens_seen": 18350080000,
|
| 12475 |
+
"step": 70000,
|
| 12476 |
+
"total_flos": 4.9088254967808e+18,
|
| 12477 |
+
"train_loss": 0.4265073311941964,
|
| 12478 |
+
"train_runtime": 14322.5496,
|
| 12479 |
+
"train_samples_per_second": 1251.174,
|
| 12480 |
+
"train_steps_per_second": 4.887,
|
| 12481 |
+
"train_tokens_per_second": 1281202.057
|
| 12482 |
}
|
| 12483 |
],
|
| 12484 |
"logging_steps": 50,
|
| 12485 |
+
"max_steps": 70000,
|
| 12486 |
+
"num_input_tokens_seen": 18350080000,
|
| 12487 |
"num_train_epochs": 1,
|
| 12488 |
"save_steps": 1000,
|
| 12489 |
"stateful_callbacks": {
|
|
|
|
| 12498 |
"attributes": {}
|
| 12499 |
}
|
| 12500 |
},
|
| 12501 |
+
"total_flos": 4.9088254967808e+18,
|
| 12502 |
"train_batch_size": 64,
|
| 12503 |
"trial_name": null,
|
| 12504 |
"trial_params": null
|